diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 713bbc1..93beb37 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,17 +52,28 @@ jobs: print('ok:', sub) " + format: + name: black + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - run: pip install black + - run: black --check RiskLabAI test + lint: - name: ruff + black (advisory until format pass lands) + name: ruff (advisory until cleanup lands) runs-on: ubuntu-latest - # Non-blocking until the mechanical black/ruff formatting commit lands; - # then remove continue-on-error to make style CI-enforced. + # Advisory until the dedicated ruff-cleanup pass lands (Dict->dict + # modernization, star-import removal, unused-import pruning); then remove + # continue-on-error to enforce. continue-on-error: true steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" - - run: pip install ruff black + - run: pip install ruff - run: ruff check RiskLabAI test - - run: black --check RiskLabAI test diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md deleted file mode 100644 index fff911d..0000000 --- a/DOCUMENTATION.md +++ /dev/null @@ -1,6834 +0,0 @@ -# Documentation for `RiskLabAI` Library - -## 🌳 File Structure - -``` -πŸ“ RiskLabAI/ -β”œβ”€β”€ πŸ“ backtest/ -β”‚ β”œβ”€β”€ πŸ“ validation/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ adaptive_combinatorial_purged.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ bagged_combinatorial_purged.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ combinatorial_purged.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ cross_validator_controller.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ cross_validator_factory.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ cross_validator_interface.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ kfold.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ purged_kfold.py -β”‚ β”‚ └── πŸ“„ walk_forward.py -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”œβ”€β”€ πŸ“„ backtest_statistics.py -β”‚ β”œβ”€β”€ πŸ“„ backtest_synthetic_data.py 
-β”‚ β”œβ”€β”€ πŸ“„ backtset_overfitting_in_the_machine_learning_era_simulation.py -β”‚ β”œβ”€β”€ πŸ“„ bet_sizing.py -β”‚ β”œβ”€β”€ πŸ“„ probabilistic_sharpe_ratio.py -β”‚ β”œβ”€β”€ πŸ“„ probability_of_backtest_overfitting.py -β”‚ β”œβ”€β”€ πŸ“„ strategy_risk.py -β”‚ └── πŸ“„ test_set_overfitting.py -β”œβ”€β”€ πŸ“ cluster/ -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ └── πŸ“„ clustering.py -β”œβ”€β”€ πŸ“ controller/ -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”œβ”€β”€ πŸ“„ bars_initializer.py -β”‚ └── πŸ“„ data_structure_controller.py -β”œβ”€β”€ πŸ“ data/ -β”‚ β”œβ”€β”€ πŸ“ denoise/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ └── πŸ“„ denoising.py -β”‚ β”œβ”€β”€ πŸ“ differentiation/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ └── πŸ“„ differentiation.py -β”‚ β”œβ”€β”€ πŸ“ distance/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ └── πŸ“„ distance_metric.py -β”‚ β”œβ”€β”€ πŸ“ labeling/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ financial_labels.py -β”‚ β”‚ └── πŸ“„ labeling.py -β”‚ β”œβ”€β”€ πŸ“ structures/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_imbalance_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_information_driven_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_run_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ data_structures_lopez.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ filtering_lopez.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ hedging.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ imbalance_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ infomation_driven_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ run_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ standard_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ standard_bars_lopez.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ time_bars.py -β”‚ β”‚ └── πŸ“„ utilities_lopez.py -β”‚ β”œβ”€β”€ πŸ“ synthetic_data/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ drift_burst_hypothesis.py -β”‚ β”‚ └── πŸ“„ synthetic_controlled_environment.py -β”‚ β”œβ”€β”€ πŸ“ weights/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ └── πŸ“„ sample_weights.py -β”‚ └── πŸ“„ __init__.py -β”œβ”€β”€ 
πŸ“ ensemble/ -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ └── πŸ“„ bagging_classifier_accuracy.py -β”œβ”€β”€ πŸ“ features/ -β”‚ β”œβ”€β”€ πŸ“ entropy_features/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ entropy.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ kontoyiannis.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ lempel_ziv.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ plug_in.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ pmf.py -β”‚ β”‚ └── πŸ“„ shannon.py -β”‚ β”œβ”€β”€ πŸ“ feature_importance/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ clustered_feature_importance_mda.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ clustered_feature_importance_mdi.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ clustering.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_controller.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_factory.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_mda.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_mdi.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_sfi.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_strategy.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ FeatureImportance.ipynb -β”‚ β”‚ β”œβ”€β”€ πŸ“„ generate_synthetic_data.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ orthogonal_features.py -β”‚ β”‚ └── πŸ“„ weighted_tau.py -β”‚ β”œβ”€β”€ πŸ“ microstructural_features/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ bekker_parkinson_volatility_estimator.py -β”‚ β”‚ └── πŸ“„ corwin_schultz.py -β”‚ β”œβ”€β”€ πŸ“ structural_breaks/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ └── πŸ“„ structural_breaks.py -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ └── πŸ“„ test.ipynb -β”œβ”€β”€ πŸ“ hpc/ -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ └── πŸ“„ hpc.py -β”œβ”€β”€ πŸ“ optimization/ -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”œβ”€β”€ πŸ“„ hrp.py -β”‚ β”œβ”€β”€ πŸ“„ hyper_parameter_tuning.py -β”‚ └── πŸ“„ nco.py -β”œβ”€β”€ πŸ“ pde/ -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”œβ”€β”€ πŸ“„ equation.py -β”‚ β”œβ”€β”€ πŸ“„ model.py -β”‚ └── πŸ“„ solver.py -β”œβ”€β”€ πŸ“ utils/ -β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”œβ”€β”€ πŸ“„ constants.py -β”‚ β”œβ”€β”€ πŸ“„ ewma.py -β”‚ β”œβ”€β”€ πŸ“„ 
momentum_mean_reverting_strategy_sides.py -β”‚ β”œβ”€β”€ πŸ“„ progress.py -β”‚ β”œβ”€β”€ πŸ“„ smoothing_average.py -β”‚ └── πŸ“„ update_figure_layout.py -└── πŸ“„ __init__.py -``` - -## πŸ“„ Module & Function Reference - -### πŸ“„ `RiskLabAI\backtest\backtest_statistics.py` - -#### `function bet_timing` - -```python -def bet_timingtarget_positions: pd.Series: -``` - -> Determine the timing of bets when positions flatten or flip. - -:param target_positions: Series of target positions. -:return: Index of bet timing. - -#### `function calculate_holding_period` - -```python -def calculate_holding_periodtarget_positions: pd.Series: -``` - -> Derive average holding period (in days) using the average entry time pairing algorithm. - -:param target_positions: Series of target positions. -:return: Tuple containing holding period DataFrame and mean holding period. - -#### `function calculate_hhi_concentration` - -```python -def calculate_hhi_concentrationreturns: pd.Series: -``` - -> Calculate the HHI concentration measures. - -:param returns: Series of returns. -:return: Tuple containing positive returns HHI, negative returns HHI, and time-concentrated HHI. - -#### `function calculate_hhi` - -```python -def calculate_hhibet_returns: pd.Series: -``` - -> Calculate the Herfindahl-Hirschman Index (HHI) concentration measure. - -:param bet_returns: Series of bet returns. -:return: Calculated HHI value. - -#### `function compute_drawdowns_time_under_water` - -```python -def compute_drawdowns_time_under_waterseries: pd.Series, dollars: bool=False: -``` - -> Compute series of drawdowns and the time under water associated with them. - -:param series: Series of returns or dollar performance. -:param dollars: Whether the input series represents returns or dollar performance. -:return: Tuple containing drawdown series, time under water series, and drawdown analysis DataFrame. 
- - -### πŸ“„ `RiskLabAI\backtest\backtest_synthetic_data.py` - -#### `function synthetic_back_testing` - -```python -def synthetic_back_testingforecast: float, half_life: float, sigma: float, n_iteration: int=100000, maximum_holding_period: int=100, profit_taking_range: np.ndarray=np.linspace(0.5, 10, 20), stop_loss_range: np.ndarray=np.linspace(0.5, 10, 20), seed: int=0: -``` - -> Perform backtesting on synthetic price data generated using the Ornstein-Uhlenbeck process. - -The Ornstein-Uhlenbeck process is given by: -.. math:: P_t = (1 - \\rho) * F + \\rho * P_{t-1} + \\sigma * Z_t - -where: -- \(P_t\) is the price at time t -- \(F\) is the forecast price -- \(\\rho\) is the autoregression coefficient -- \(\\sigma\) is the standard deviation of noise -- \(Z_t\) is a random noise with mean 0 and standard deviation 1 - -Args: - forecast (float): The forecasted price. - half_life (float): The half-life time needed to reach half. - sigma (float): The standard deviation of the noise. - n_iteration (int): Number of iterations. Defaults to 100000. - maximum_holding_period (int): Maximum holding period. Defaults to 100. - profit_taking_range (np.ndarray): Profit taking range. Defaults to np.linspace(0.5, 10, 20). - stop_loss_range (np.ndarray): Stop loss range. Defaults to np.linspace(0.5, 10, 20). - seed (int): Initial seed value. Defaults to 0. - -Returns: - list[tuple[float, float, float, float, float]]: List of tuples containing profit taking, stop loss, mean, - standard deviation, and Sharpe ratio. - - -### πŸ“„ `RiskLabAI\backtest\backtset_overfitting_in_the_machine_learning_era_simulation.py` - -#### `function financial_features_backtest_overfitting_simulation` - -```python -def financial_features_backtest_overfitting_simulationprices: pd.Series, noise_scale: float=0.0, random_state: Optional[int]=None: -``` - -> Create a DataFrame of financial features from the given price series. - -Args: - prices (pd.Series): Time series of asset prices. 
- noise_scale (float): Scale of Gaussian noise to be added to the features. Default is 0.0. - random_state (Optional[int]): Seed for random number generator. Default is None. - -Returns: - pd.DataFrame: DataFrame containing the computed financial features. - -#### `function backtest_overfitting_simulation_results` - -```python -def backtest_overfitting_simulation_resultsprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], cross_validators: Dict[str, Any], noise_scale: float=0.0, random_state: int=None, n_jobs: int=1: -``` - -> Conducts a simulation to evaluate the performance of trading strategies and models. - -This function simulates a trading environment to assess various cross-validation methods in the context of financial analytics. It uses a set of market regime parameters and machine learning models to backtest trading strategies and compute metrics indicative of overfitting. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior, including window sizes and flags for mean reversion. - models (dict): A collection of machine learning models and their associated parameters. - cross_validators (dict): A dictionary of cross-validation methods. - noise_scale (float): Scale of Gaussian noise to be added to the features. - random_state (int): Seed for random number generator. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Dict[str, List[Dict[str, Any]]]: A dictionary containing the results of the backtest for each cross-validation method tested. 
- -#### `function overall_backtest_overfitting_simulation` - -```python -def overall_backtest_overfitting_simulationprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, noise_scale: float=0.0, random_state: int=None, n_jobs: int=1: -``` - -> Conducts an overall backtest overfitting simulation to calculate the metrics. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models and their associated parameters. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. - noise_scale (float): Scale of Gaussian noise to be added to the features. - random_state (int): Seed for random number generator. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Tuple[Dict[str, float], Dict[str, float]]: A tuple containing two dictionaries, one for the Probability of Backtest Overfitting (PBO) and the other for the Deflated Sharpe Ratio (DSR), for each cross-validation method tested. - -#### `function temporal_backtest_overfitting_simulation` - -```python -def temporal_backtest_overfitting_simulationprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, overfitting_partitions_length: int, n_jobs: int=1: -``` - -> Conducts a temporal backtest overfitting simulation to calculate the metrics in chunks. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models and their associated parameters. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. 
- overfitting_partitions_length (int): The number of partitions to divide the dataset into for temporal overfitting analysis. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Tuple[Dict[str, List[float]], Dict[str, List[float]]]: A tuple containing two dictionaries, one for the Probability of Backtest Overfitting (PBO) and the other for the Deflated Sharpe Ratio (DSR), for each cross-validation method tested. - -#### `function time_temporal_backtest_overfitting_simulation` - -```python -def time_temporal_backtest_overfitting_simulationprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, overfitting_partitions_duration: str='A', n_jobs: int=1: -``` - -> Conducts a time-temporal backtest overfitting simulation to calculate the metrics in time-indexed chunks. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models and their associated parameters. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. - overfitting_partitions_duration (str): The frequency for time-based grouping to divide the dataset into for temporal overfitting analysis. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Tuple[Dict[str, pd.Series], Dict[str, pd.Series]]: A tuple containing two dictionaries, one for the Probability of Backtest Overfitting (PBO) and the other for the Deflated Sharpe Ratio (DSR), for each cross-validation method tested, indexed by time. 
- -#### `function varying_embargo_backtest_overfitting_simulation` - -```python -def varying_embargo_backtest_overfitting_simulationprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, embargo_values: List[float], n_jobs: int=1: -``` - -> Conducts a backtest overfitting simulation with varying embargo values to calculate the metrics. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models and their associated parameters. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. - embargo_values (List[float]): List of embargo values to test. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Tuple[pd.DataFrame, pd.DataFrame]: Two DataFrames containing PBO and DSR values for each embargo value and cross-validation method. - -#### `function sharpe_ratio` - -```python -def sharpe_ratioreturns, risk_free_rate=0: -``` - -> Calculate the Sharpe ratio of the given returns. - -#### `function sortino_ratio` - -```python -def sortino_ratioreturns, risk_free_rate=0: -``` - -> Calculate the Sortino ratio of the given returns. - -#### `function expected_shortfall` - -```python -def expected_shortfallreturns, step_risk_free_rate, confidence_level=0.05: -``` - -> Calculate the expected shortfall (conditional VaR) of the given returns. 
- -#### `function backtest_overfitting_simulation_financial_metrics_rank_correlation` - -```python -def backtest_overfitting_simulation_financial_metrics_rank_correlationprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, n_jobs: int=1: -``` - -> Conducts a backtest overfitting simulation and calculates the rank correlation of financial metrics. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models and their associated parameters. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - pd.DataFrame: DataFrame containing the rank correlations for each cross-validation method and each metric. - -#### `function backtest_overfitting_simulation_model_complexity` - -```python -def backtest_overfitting_simulation_model_complexityprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Any], step_risk_free_rate: float, n_jobs: int=1: -``` - -> Conducts a backtest overfitting simulation to compare the PBO and DSR values of each CV method for simple and complex models. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Tuple[pd.DataFrame, pd.DataFrame]: Two DataFrames containing PBO and DSR values for each model and each CV method. 
- -#### `function noised_backtest_overfitting_simulation` - -```python -def noised_backtest_overfitting_simulationprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, noise_scales: List[float], random_state: int=None, n_jobs: int=1: -``` - -> Conducts a noised backtest overfitting simulation to compare the new PBO/DSR values for different noise scales. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models and their associated parameters. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. - noise_scales (List[float]): List of noise scale values to test. - random_state (int): Seed for random number generator. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Tuple[pd.DataFrame, pd.DataFrame]: Two DataFrames containing PBO and DSR values for each noise scale and each CV method. - -#### `function overall_novel_methods_backtest_overfitting_simulation` - -```python -def overall_novel_methods_backtest_overfitting_simulationprices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, noise_scale: float=0.0, random_state: int=None, n_jobs: int=1: -``` - -> Conducts an overall backtest overfitting simulation to calculate the metrics for the novel CPCV methods. - -Args: - prices (pd.Series): Time series of asset prices. - strategy_parameters (dict): Parameters dictating trading strategy behavior. - models (dict): A collection of machine learning models and their associated parameters. - step_risk_free_rate (float): The risk-free rate used in the simulation for Sharpe ratio calculations. 
- noise_scale (float): Scale of Gaussian noise to be added to the features. - random_state (int): Seed for random number generator. - n_jobs (int): The number of jobs to run in parallel for cross_validator.backtest_predictions. - -Returns: - Tuple[Dict[str, float], Dict[str, float]]: A tuple containing two dictionaries, one for the Probability of Backtest Overfitting (PBO) and the other for the Deflated Sharpe Ratio (DSR), for each cross-validation method tested. - -#### `function get_cpu_info` - -```python -def get_cpu_info: -``` -#### `function format_cpu_info` - -```python -def format_cpu_infocpu_info: -``` -#### `function generate_random_data` - -```python -def generate_random_datan_samples: int, n_features: int: -``` -#### `function measure_computational_requirements` - -```python -def measure_computational_requirementscross_validator, model, data, target, weights, n_jobs: int=1: -``` -#### `function measure_all_cv_computational_requirements` - -```python -def measure_all_cv_computational_requirementscross_validators: Dict[str, Any], n_samples: int=40 * 252, n_features: int=22, n_jobs: int=1, n_repeats: int=30: -``` -#### `function measure_cpcv_parallelization` - -```python -def measure_cpcv_parallelizationn_samples: int=40 * 252, n_features: int=22, n_repeats: int=30, n_jobs_list: List[int]=range(1, 9): -``` -#### `function measure_cpcv_scalability` - -```python -def measure_cpcv_scalabilitysample_sizes: List[int], feature_sizes: List[int], n_repeats: int=1, n_jobs: int=1: -``` - -### πŸ“„ `RiskLabAI\backtest\bet_sizing.py` - -#### `function probability_bet_size` - -```python -def probability_bet_sizeprobabilities: np.ndarray, sides: np.ndarray: -``` - -> Calculate the bet size based on probabilities and side. - -:param probabilities: array of probabilities -:param sides: array indicating the side of the bet (e.g., long/short or buy/sell) -:return: array of bet sizes - -.. 
math:: - - \text{bet size} = \text{side} \times (2 \times \text{CDF}(\text{probabilities}) - 1) - -#### `function average_bet_sizes` - -```python -def average_bet_sizes(price_dates: np.ndarray, start_dates: np.ndarray, end_dates: np.ndarray, bet_sizes: np.ndarray): -``` - -> Compute average bet sizes for each date. - -:param price_dates: array of price dates -:param start_dates: array of start dates for bets -:param end_dates: array of end dates for bets -:param bet_sizes: array of bet sizes for each date range -:return: array of average bet sizes for each price date - -#### `function strategy_bet_sizing` - -```python -def strategy_bet_sizing(price_timestamps: pd.Series, times: pd.Series, sides: pd.Series, probabilities: pd.Series): -``` - -> Calculate the average bet size for a trading strategy given price timestamps. - -:param price_timestamps: series of price timestamps -:param times: series with start times as indices and end times as values -:param sides: series indicating the side of the position (e.g., long/short) -:param probabilities: series of probabilities associated with each position -:return: series of average bet sizes for each price timestamp - -#### `function avgActiveSignals` - -```python -def avgActiveSignals(signals, nThreads): -``` -#### `function mpAvgActiveSignals` - -```python -def mpAvgActiveSignals(signals, molecule): -``` -#### `function discreteSignal` - -```python -def discreteSignal(signal, stepSize): -``` -#### `function Signal` - -```python -def Signal(events, stepSize, probability, prediction, nClasses, nThreads): -``` -#### `function betSize` - -```python -def betSize(w, x): -``` -#### `function TPos` - -```python -def TPos(w, f, acctualPrice, maximumPositionSize): -``` -#### `function inversePrice` - -```python -def inversePrice(f, w, m): -``` -#### `function limitPrice` - -```python -def limitPrice(targetPositionSize, cPosition, f, w, maximumPositionSize): -``` -#### `function getW` - -```python -def getW(x, m): -``` - -### 📄 
`RiskLabAI\backtest\probabilistic_sharpe_ratio.py` - -#### `function probabilistic_sharpe_ratio` - -```python -def probabilistic_sharpe_ratioobserved_sharpe_ratio: float, benchmark_sharpe_ratio: float, number_of_returns: int, skewness_of_returns: float=0, kurtosis_of_returns: float=3, return_test_statistic: bool=False: -``` - -> Calculates the Probabilistic Sharpe Ratio (PSR) based on observed and benchmark Sharpe ratios. - -The PSR provides a means to test whether a track record would have achieved an observed -level of outperformance due to skill or luck. It is calculated using: - -.. math:: - \frac{(\hat{SR} - SR^*) \sqrt{T-1}}{\sqrt{1 - S \hat{SR} + \frac{K-1}{4} \hat{SR}^2}} - -Where: -- \(\hat{SR}\) is the observed Sharpe ratio -- \(SR^*\) is the benchmark Sharpe ratio -- \(T\) is the number of returns -- \(S\) is the skewness of returns -- \(K\) is the kurtosis of returns - -:param observed_sharpe_ratio: The observed Sharpe ratio. -:param benchmark_sharpe_ratio: The benchmark Sharpe ratio. -:param number_of_returns: The number of return observations. -:param skewness_of_returns: The skewness of the returns (default = 0). -:param kurtosis_of_returns: The kurtosis of the returns (default = 3). -:param return_test_statistic: Return the test statistic instead of the CDF value. -:return: The Probabilistic Sharpe Ratio. - -#### `function benchmark_sharpe_ratio` - -```python -def benchmark_sharpe_ratiosharpe_ratio_estimates: list: -``` - -> Calculates the Benchmark Sharpe Ratio based on Sharpe ratio estimates. - -The benchmark Sharpe ratio is computed using: - -.. 
math:: - \sigma_{SR} \left[ (1 - \gamma) \Phi^{-1}(1 - \frac{1}{N}) + \gamma \Phi^{-1}(1 - \frac{1}{N} e^{-1}) \right] - -Where: -- \(\sigma_{SR}\) is the standard deviation of Sharpe ratio estimates -- \(\gamma\) is the Euler-Mascheroni constant -- \(\Phi^{-1}\) is the inverse of the cumulative distribution function (CDF) of a standard normal distribution -- \(N\) is the number of Sharpe ratio estimates - -:param sharpe_ratio_estimates: List of Sharpe ratio estimates. -:return: The Benchmark Sharpe Ratio. - - -### 📄 `RiskLabAI\backtest\probability_of_backtest_overfitting.py` - -#### `function sharpe_ratio` - -```python -def sharpe_ratio(returns: np.ndarray, risk_free_rate: float=0.0): -``` - -> Calculate the Sharpe Ratio for a given set of returns. - -:param returns: An array of returns for a portfolio. -:param risk_free_rate: The risk-free rate. -:return: The calculated Sharpe Ratio. - -.. math:: - - \text{Sharpe Ratio} = \frac{\text{Mean Portfolio Return} - \text{Risk-Free Rate}} - {\text{Standard Deviation of Portfolio Returns}} - -#### `function performance_evaluation` - -```python -def performance_evaluation(train_partition: np.ndarray, test_partition: np.ndarray, n_strategies: int, metric: Callable, risk_free_return: float): -``` - -> Evaluate the performance of various strategies on given train and test partitions and -compute the logit value to determine if the best in-sample strategy is overfitting. - -:param train_partition: Training data partition used for evaluating in-sample performance. -:type train_partition: np.ndarray -:param test_partition: Testing data partition used for evaluating out-of-sample performance. -:type test_partition: np.ndarray -:param n_strategies: Number of strategies to evaluate. -:type n_strategies: int -:param metric: Metric function for evaluating strategy performance. - The function should accept a data array and risk_free_return as arguments. 
-:type metric: Callable -:param risk_free_return: Risk-free return used in the metric function, often used for Sharpe ratio. -:type risk_free_return: float - -:return: Tuple where the first value indicates if the best in-sample strategy is overfitting - (True if overfitting, False otherwise) and the second value is the logit value computed. -:rtype: Tuple[bool, float] - -#### `function probability_of_backtest_overfitting` - -```python -def probability_of_backtest_overfitting(performances: np.ndarray, n_partitions: int=16, risk_free_return: float=0.0, metric: Callable=None, n_jobs: int=1): -``` - -> Computes the Probability Of Backtest Overfitting. - -For instance, if \(S=16\), we will form 12,870 combinations. - -.. math:: - \left(\begin{array}{c} - S \\ - S / 2 - \end{array}\right) = \prod_{i=0}^{S / 2 - 1} \frac{S-i}{S / 2-i} - -:param performances: Matrix of T×N for T observations on N strategies. -:type performances: np.ndarray -:param n_partitions: Number of partitions (must be even). -:type n_partitions: int -:param metric: Metric function for evaluating strategy. -:type metric: Callable -:param risk_free_return: Risk-free return for calculating Sharpe ratio. -:type risk_free_return: float -:param n_jobs: Number of parallel jobs. -:type n_jobs: int - -:return: Tuple containing Probability Of Backtest Overfitting and an array of logit values. -:rtype: Tuple[float, List[float]] - - -### 📄 `RiskLabAI\backtest\strategy_risk.py` - -#### `function sharpe_ratio_trials` - -```python -def sharpe_ratio_trials(p: float, n_run: int): -``` - -> Simulate trials to calculate the mean, standard deviation, and Sharpe ratio. - -The Sharpe ratio is calculated as follows: - -.. math:: S = \frac{\mu}{\sigma} - -where: -- \(\mu\) is the mean of the returns -- \(\sigma\) is the standard deviation of the returns - -Args: - p (float): Probability of success. - n_run (int): Number of runs. 
- -Returns: - tuple[float, float, float]: Tuple containing mean, standard deviation, and Sharpe ratio. - -#### `function target_sharpe_ratio_symbolic` - -```python -def target_sharpe_ratio_symbolic: -``` - -> Calculate the target Sharpe ratio using symbolic operations. - -The Sharpe ratio is calculated using the following formula: - -.. math:: S = \\frac{p \\cdot u^2 + (1 - p) \\cdot d^2 - (p \\cdot u + (1 - p) \\cdot d)^2}{\\sigma} - -where: -- \(p\) is the probability of success -- \(u\) is the upward movement -- \(d\) is the downward movement -- \(\\sigma\) is the standard deviation of the returns - -Returns: - sympy.Add: Symbolic expression for target Sharpe ratio. - -#### `function implied_precision` - -```python -def implied_precisionstop_loss: float, profit_taking: float, frequency: float, target_sharpe_ratio: float: -``` - -> Calculate the implied precision for given parameters. - -The implied precision is calculated as follows: - -.. math:: - a = (f + S^2) * (p - s)^2 - b = (2 * f * s - S^2 * (p - s)) * (p - s) - c = f * s^2 - precision = (-b + \\sqrt{b^2 - 4 * a * c}) / (2 * a) - -where: -- \(f\) is the frequency of bets per year -- \(S\) is the target annual Sharpe ratio -- \(p\) is the profit-taking threshold -- \(s\) is the stop-loss threshold - -Args: - stop_loss (float): Stop-loss threshold. - profit_taking (float): Profit-taking threshold. - frequency (float): Number of bets per year. - target_sharpe_ratio (float): Target annual Sharpe ratio. - -Returns: - float: Calculated implied precision. - -#### `function bin_frequency` - -```python -def bin_frequencystop_loss: float, profit_taking: float, precision: float, target_sharpe_ratio: float: -``` - -> Calculate the number of bets per year needed to achieve a target Sharpe ratio with a certain precision. - -The frequency of bets is calculated as follows: - -.. 
math:: - frequency = \\frac{S^2 * (p - s)^2 * precision * (1 - precision)}{((p - s) * precision + s)^2} - -where: -- \(S\) is the target annual Sharpe ratio -- \(p\) is the profit-taking threshold -- \(s\) is the stop-loss threshold -- \(precision\) is the precision rate - -Args: - stop_loss (float): Stop-loss threshold. - profit_taking (float): Profit-taking threshold. - precision (float): Precision rate p. - target_sharpe_ratio (float): Target annual Sharpe ratio. - -Returns: - float: Calculated frequency of bets. - -#### `function binomial_sharpe_ratio` - -```python -def binomial_sharpe_ratiostop_loss: float, profit_taking: float, frequency: float, probability: float: -``` - -> Calculate the Sharpe Ratio for a binary outcome. - -The Sharpe ratio is calculated as follows: - -.. math:: - SR = \\frac{(p - s) * p + s}{(p - s) * \\sqrt{p * (1 - p)}} * \\sqrt{f} - -where: -- \(p\) is the profit-taking threshold -- \(s\) is the stop-loss threshold -- \(f\) is the frequency of bets per year - -Args: - stop_loss (float): Stop loss threshold. - profit_taking (float): Profit taking threshold. - frequency (float): Frequency of bets per year. - probability (float): Probability of success. - -Returns: - float: Calculated Sharpe Ratio. - -#### `function mix_gaussians` - -```python -def mix_gaussiansmu1: float, mu2: float, sigma1: float, sigma2: float, probability: float, n_obs: int: -``` - -> Generate a mixture of Gaussian-distributed bet outcomes. - -Args: - mu1 (float): Mean of the first Gaussian distribution. - mu2 (float): Mean of the second Gaussian distribution. - sigma1 (float): Standard deviation of the first Gaussian distribution. - sigma2 (float): Standard deviation of the second Gaussian distribution. - probability (float): Probability of success. - n_obs (int): Number of observations. - -Returns: - np.ndarray: Array of generated bet outcomes. 
- -#### `function failure_probability` - -```python -def failure_probability(returns: np.ndarray, frequency: float, target_sharpe_ratio: float): -``` - -> Calculate the probability that the strategy may fail. - -Args: - returns (np.ndarray): Array of returns. - frequency (float): Number of bets per year. - target_sharpe_ratio (float): Target annual Sharpe ratio. - -Returns: - float: Calculated failure probability. - -#### `function calculate_strategy_risk` - -```python -def calculate_strategy_risk(mu1: float, mu2: float, sigma1: float, sigma2: float, probability: float, n_obs: int, frequency: float, target_sharpe_ratio: float): -``` - -> Calculate the strategy risk in practice. - -Args: - mu1 (float): Mean of the first Gaussian distribution. - mu2 (float): Mean of the second Gaussian distribution. - sigma1 (float): Standard deviation of the first Gaussian distribution. - sigma2 (float): Standard deviation of the second Gaussian distribution. - probability (float): Probability of success. - n_obs (int): Number of observations. - frequency (float): Number of bets per year. - target_sharpe_ratio (float): Target annual Sharpe ratio. - -Returns: - float: Calculated probability of strategy failure. - - -### 📄 `RiskLabAI\backtest\test_set_overfitting.py` - -#### `function expected_max_sharpe_ratio` - -```python -def expected_max_sharpe_ratio(n_trials: int, mean_sharpe_ratio: float, std_sharpe_ratio: float): -``` - -> Calculate the expected maximum Sharpe Ratio. - -Uses the formula: -.. math:: - \text{sharpe\_ratio} = \text{mean\_sharpe\_ratio} + \text{std\_sharpe\_ratio} \times \left[(1 - \gamma) \times \Phi^{-1}\left(1 - \frac{1}{n\_trials}\right) + - \gamma \times \Phi^{-1}\left(1 - \frac{1}{n\_trials \times e}\right)\right] - -where: -- \(\gamma\) is the Euler-Mascheroni constant -- \(\Phi^{-1}\) is the inverse of the cumulative distribution function of the standard normal distribution - -:param n_trials: Number of trials. -:param mean_sharpe_ratio: Mean Sharpe Ratio. -:param std_sharpe_ratio: Standard deviation of Sharpe Ratios. 
- -:return: Expected maximum Sharpe Ratio. - -#### `function generate_max_sharpe_ratios` - -```python -def generate_max_sharpe_ratiosn_sims: int, n_trials_list: list, std_sharpe_ratio: float, mean_sharpe_ratio: float: -``` - -> Generate maximum Sharpe Ratios from simulations. - -:param n_sims: Number of simulations. -:param n_trials_list: List of numbers of trials. -:param std_sharpe_ratio: Standard deviation of Sharpe Ratios. -:param mean_sharpe_ratio: Mean of Sharpe Ratios. - -:return: DataFrame containing generated maximum Sharpe Ratios. - -#### `function mean_std_error` - -```python -def mean_std_errorn_sims0: int, n_sims1: int, n_trials: List[int], std_sharpe_ratio: float=1, mean_sharpe_ratio: float=0: -``` - -> Calculate mean and standard deviation of the predicted errors. - -:param n_sims0: Number of max{SR} used to estimate E[max{SR}]. -:param n_sims1: Number of errors on which std is computed. -:param n_trials: List of numbers of trials. -:param std_sharpe_ratio: Standard deviation of Sharpe Ratios. -:param mean_sharpe_ratio: Mean of Sharpe Ratios. - -:return: DataFrame containing mean and standard deviation of errors. - -#### `function estimated_sharpe_ratio_z_statistics` - -```python -def estimated_sharpe_ratio_z_statisticssharpe_ratio: float, t: int, true_sharpe_ratio: float=0, skew: float=0, kurt: int=3: -``` - -> Calculate z statistics for the estimated Sharpe Ratios. - -Uses the formula: -.. math:: - z = \frac{(sharpe\_ratio - true\_sharpe\_ratio) \times \sqrt{t - 1}}{\sqrt{1 - skew \times sharpe\_ratio + \frac{kurt - 1}{4} \times sharpe\_ratio^2}} - -:param sharpe_ratio: Estimated Sharpe Ratio. -:param t: Number of observations. -:param true_sharpe_ratio: True Sharpe Ratio. -:param skew: Skewness of returns. -:param kurt: Kurtosis of returns. - -:return: Calculated z statistics. 
- -#### `function strategy_type1_error_probability` - -```python -def strategy_type1_error_probability(z: float, k: int=1): -``` - -> Calculate type I error probability of strategies. - -.. math:: - \alpha_k = 1 - (1 - \alpha)^k - -:param z: Z statistic for the estimated Sharpe Ratios. -:param k: Number of tests. - -:return: Calculated type I error probability. - -#### `function theta_for_type2_error` - -```python -def theta_for_type2_error(sharpe_ratio: float, t: int, true_sharpe_ratio: float=0, skew: float=0, kurt: int=3): -``` - -> Calculate θ parameter for type II error probability. - -.. math:: - \theta = \frac{\text{true\_sharpe\_ratio} \cdot \sqrt{t - 1}}{\sqrt{1 - \text{skew} \cdot \text{sharpe\_ratio} + \frac{\text{kurt} - 1}{4} \cdot \text{sharpe\_ratio}^2}} - -:param sharpe_ratio: Estimated Sharpe Ratio. -:param t: Number of observations. -:param true_sharpe_ratio: True Sharpe Ratio. -:param skew: Skewness of returns. -:param kurt: Kurtosis of returns. - -:return: Calculated θ parameter. - -#### `function strategy_type2_error_probability` - -```python -def strategy_type2_error_probability(α_k: float, k: int, θ: float): -``` - -> Calculate type II error probability of strategies. - -.. math:: - z = \text{ss.norm.ppf}((1 - \alpha_k)^{1.0 / k}) - \beta = \text{ss.norm.cdf}(z - \theta) - -:param α_k: Type I error. -:param k: Number of tests. -:param θ: Calculated θ parameter. - -:return: Calculated type II error probability. - - -### 📄 `RiskLabAI\backtest\validation\adaptive_combinatorial_purged.py` - -#### `class AdaptiveCombinatorialPurged` - -##### `method __init__` - -```python -def __init__(self, n_splits: int, n_test_groups: int, times: Union[pd.Series, Dict[str, pd.Series]], embargo: float=0, n_subsplits: int=3, external_feature: Union[pd.Series, Dict[str, pd.Series]]=None, lower_quantile: float=0.25, upper_quantile: float=0.75, subtract_border_adjustments: bool=True): -``` - -> Initialize the AdaptiveCombinatorialPurged class. 
- -Parameters ----------- -n_splits : int - Number of splits/groups to partition the data into. -n_test_groups : int - Size of the testing set in terms of groups. -times : Union[pd.Series, Dict[str, pd.Series]] - The timestamp series associated with the labels. -embargo : float - The embargo rate for purging. -n_subsplits : int - Number of subsplits within each split segment. -external_feature : Union[pd.Series, Dict[str, pd.Series]] - The external feature based on which the adaptive splitting is performed. -lower_quantile : float - The lower quantile threshold for adjusting the split segments. -upper_quantile : float - The upper quantile threshold for adjusting the split segments. -subtract_border_adjustments : bool - Flag to determine whether to subtract border adjustments instead of adding. - -##### `method _validate_input` - -```python -def _validate_inputself, single_times: pd.Series, single_data: pd.DataFrame, single_external_feature: pd.Series: -``` - -> Validate that the input data, times, and external feature share the same index. - -This function checks if the provided data, times, and external feature have the same index. -If they do not match, it raises a `ValueError`. - -:param single_times: Time series data to be validated. -:type single_times: pd.Series -:param single_data: Dataset with which the times should align. -:type single_data: pd.DataFrame -:param single_external_feature: External feature series to be validated. -:type single_external_feature: pd.Series -:raises ValueError: If the indices of the data, times, and external feature do not match. -:return: None - -##### `method _single_adaptive_split_segments` - -```python -def _single_adaptive_split_segmentsself, indices: np.ndarray, single_external_feature: pd.Series: -``` - -> Adaptively split data indices based on the external feature's values and quantile thresholds. - -Parameters ----------- -indices : np.ndarray - Array of data indices to be split. 
-single_external_feature : pd.Series - The external feature based on which the adaptive splitting is performed. - -Returns -------- -split_segments : List[np.ndarray] - List of adaptively split data indices. - -##### `method _single_split` - -```python -def _single_splitself, single_times: pd.Series, single_data: pd.DataFrame, single_external_feature: pd.Series: -``` - -> Splits data into train and test indices based on the defined combinatorial splits. - -This function is used to generate multiple train-test splits based on the combinatorial -cross-validation method. It ensures that each train-test split is properly purged and -embargoed to prevent data leakage. - -:param single_times: Timestamp series associated with the labels. -:param single_data: The input data to be split. -:param single_external_feature: External feature series used for adaptive splitting. - -:return: Generator that yields tuples of (train indices, test indices). - -.. note:: The function validates the input, and uses combinatorial cross-validation method to - produce the train-test splits. - -##### `method split` - -```python -def splitself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None: -``` - -> Split multiple datasets into train and test sets. - -This function either splits a single dataset or multiple datasets considering -purging and embargo. - -:param data: Dataset or dictionary of datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Labels corresponding to the datasets, if available. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] -:param groups: Group information, if available. -:type groups: Optional[np.ndarray] -:return: Train and test indices or key with train and test indices for multiple datasets. 
-:rtype: Union[Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None]] - -##### `method _combinations_and_path_locations_and_split_segments` - -```python -def _combinations_and_path_locations_and_split_segmentsself, data: pd.DataFrame, single_external_feature: pd.Series: -``` - -> Generate combinations, path locations, and split segments for the data. - -This function is a helper that computes necessary components for combinatorial cross-validation. - -:param data: The input dataframe to generate combinations, path locations, and split segments. -:param single_external_feature: External feature series used for adaptive splitting. - -:return: Tuple containing combinations, path locations, and split segments. - -.. math:: -\text{combinations} = \binom{n}{k} - -##### `method _single_backtest_paths` - -```python -def _single_backtest_pathsself, single_times: pd.Series, single_data: pd.DataFrame, single_external_feature: pd.Series: -``` - -> Generate the backtest paths for given input data. - -This function creates multiple backtest paths based on combinatorial splits, where -each path represents a sequence of train-test splits. It ensures that data leakage -is prevented by purging and applying embargo to the train-test splits. - -:param single_times: Timestamp series associated with the data. -:param single_data: Input data on which the backtest paths are based. -:param single_external_feature: External feature series used for adaptive splitting. - -:return: A dictionary where each key is a backtest path name, and the value is - a list of dictionaries with train and test index arrays. - -.. note:: This function relies on combinatorial cross-validation for backtesting to - generate multiple paths of train-test splits. - -##### `method backtest_paths` - -```python -def backtest_pathsself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]: -``` - -> Generate backtest paths for single or multiple datasets. 
- -This function checks whether multiple datasets are being used. If so, it iterates through each -dataset, generating backtest paths using the `_single_backtest_paths` method. Otherwise, it directly -returns the backtest paths for the single dataset. - -:param data: Input data on which the backtest paths are based. - Can be either a single DataFrame or a dictionary of DataFrames for multiple datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] - -:return: A dictionary where each key is a backtest path name, and the value is - a list of dictionaries with train and test index arrays. For multiple datasets, - a nested dictionary structure is returned. -:rtype: Union[ - Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, List[np.ndarray]]]]] -] - -##### `method _single_backtest_predictions` - -```python -def _single_backtest_predictionsself, single_estimator: Any, single_times: pd.Series, single_data: pd.DataFrame, single_labels: pd.Series, single_weights: np.ndarray, single_external_feature: pd.Series, predict_probability: bool=False, n_jobs: int=1: -``` - -> Generate predictions for a single backtest using combinatorial splits. - -This method calculates predictions across various paths created by combinatorial splits -of the data. For each combinatorial split, a separate estimator is trained and then used -to predict on the corresponding test set. - -:param single_estimator: The machine learning model or estimator to be trained. -:param single_times: Timestamps corresponding to the data points. -:param single_data: Input data on which the model is trained and predictions are made. -:param single_labels: Labels corresponding to the input data. -:param single_weights: Weights for each data point. -:param single_external_feature: External feature series used for adaptive splitting. -:param predict_probability: If True, predict the probability of forecasts. 
-:type predict_probability: bool -:param n_jobs: Number of CPU cores to use for parallelization. Default is 1. - -:return: A dictionary where keys are path names and values are arrays of predictions. - -.. note:: This function relies on internal methods (e.g., `_get_train_indices`) - to manage data splits and training. - -.. note:: Parallelization is used to speed up the training of models for different splits. - -##### `method backtest_predictions` - -```python -def backtest_predictionsself, estimator: Union[Any, Dict[str, Any]], data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]=None, predict_probability: bool=False, n_jobs: int=1: -``` - -> Generate backtest predictions for single or multiple datasets. - -For each dataset, this function leverages the `_single_backtest_predictions` method to obtain -predictions for different train-test splits using the given estimator. - -:param estimator: Model or estimator to be trained and used for predictions. - Can be a single estimator or a dictionary of estimators for multiple datasets. -:type estimator: Union[Any, Dict[str, Any]] -:param data: Input data for training and testing. Can be a single dataset or - a dictionary of datasets for multiple datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Target labels for training and testing. Can be a single series or - a dictionary of series for multiple datasets. -:type labels: Union[pd.Series, Dict[str, pd.Series]] -:param sample_weights: Weights for the observations in the dataset(s). - Can be a single array or a dictionary of arrays for multiple datasets. - Defaults to None, which means equal weights for all observations. -:type sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] -:param predict_probability: If True, predict the probability of forecasts. 
-:type predict_probability: bool -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional -:return: Backtest predictions structured in a dictionary (or nested dictionaries for multiple datasets). -:rtype: Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]] - - -### πŸ“„ `RiskLabAI\backtest\validation\bagged_combinatorial_purged.py` - -#### `class BaggedCombinatorialPurged` - -##### `method __init__` - -```python -def __init__self, n_splits: int, n_test_groups: int, times: Union[pd.Series, Dict[str, pd.Series]], embargo: float=0, classifier: bool=True, n_estimators: int=10, max_samples: float=1.0, max_features: float=1.0, bootstrap: bool=True, bootstrap_features: bool=False, random_state: int=None: -``` - -> Initialize the BaggedCombinatorialPurged class. - -Parameters ----------- -n_splits : int - Number of splits/groups to partition the data into. -n_test_groups : int - Size of the testing set in terms of groups. -times : Union[pd.Series, Dict[str, pd.Series]] - The timestamp series associated with the labels. -embargo : float - The embargo rate for purging. -classifier : bool - Determines whether to use a BaggingClassifier or BaggingRegressor. -n_estimators : int - The number of base estimators in the ensemble. -max_samples : float - The number of samples to draw from X to train each base estimator. -max_features : float - The number of features to draw from X to train each base estimator. -bootstrap : bool - Whether samples are drawn with replacement. -bootstrap_features : bool - Whether features are drawn with replacement. -random_state : int - The seed used by the random number generator. 
- -##### `method _single_backtest_predictions` - -```python -def _single_backtest_predictionsself, single_estimator: Any, single_times: pd.Series, single_data: pd.DataFrame, single_labels: pd.Series, single_weights: np.ndarray, predict_probability: bool=False, n_jobs: int=1: -``` - -> Generate predictions for a single backtest using combinatorial splits with bagging. - -This method calculates predictions across various paths created by combinatorial splits -of the data. For each combinatorial split, a bagged estimator is trained and then used -to predict on the corresponding test set. - -:param single_estimator: The machine learning model or estimator to be trained. -:param single_times: Timestamps corresponding to the data points. -:param single_data: Input data on which the model is trained and predictions are made. -:param single_labels: Labels corresponding to the input data. -:param single_weights: Weights for each data point. -:param predict_probability: If True, predict the probability of forecasts. -:type predict_probability: bool -:param n_jobs: Number of CPU cores to use for parallelization. Default is 1. - -:return: A dictionary where keys are path names and values are arrays of predictions. - -.. note:: This function relies on internal methods (e.g., `_get_train_indices`) - to manage data splits and training. - -.. note:: Parallelization is used to speed up the training of models for different splits. - - -### πŸ“„ `RiskLabAI\backtest\validation\combinatorial_purged.py` - -#### `class CombinatorialPurged` - -> Combinatorial Purged Cross-Validation (CPCV) implementation based on Marcos Lopez de Prado's method. - -This class provides a cross-validation scheme that aims to address the main drawback of the Walk Forward -and traditional Cross-Validation methods by testing multiple paths. 
Given a number of backtest paths, -CPCV generates the precise number of combinations of training/testing sets needed to generate those paths, -while purging training observations that might contain leaked information. - -Parameters ----------- -n_splits : int - Number of splits/groups to partition the data into. -n_test_groups : int - Size of the testing set in terms of groups. -times : Union[pd.Series, Dict[str, pd.Series]] - The timestamp series associated with the labels. -embargo : float - The embargo rate for purging. - -##### `method _path_locations` - -```python -def _path_locationsn_splits: int, combinations_: List[Tuple[int]]: -``` - -> Generate a labeled path matrix and return path locations for N choose K. - -This method generates a matrix where each entry corresponds to a specific combination of -training/testing sets, and helps in mapping these combinations to specific backtest paths. - -Parameters ----------- -n_splits : int - Number of splits/groups to partition the data into. -combinations_ : list - List of combinations for training/testing sets. - -Returns -------- -dict - A dictionary mapping each backtest path to its corresponding train/test combination. - -##### `method _combinatorial_splits` - -```python -def _combinatorial_splitscombinations_: List[Tuple[int]], split_segments: np.ndarray: -``` - -> Generate combinatorial test sets based on the number of test groups (n_test_groups). - -This method creates test sets by considering all possible combinations of group splits, allowing -for the creation of multiple test paths, as described in the CPCV methodology. - -Parameters ----------- -combinations_ : list - List of combinations for training/testing sets. -split_segments : np.ndarray - Array of data split segments. - -Returns -------- -Generator[np.ndarray] - A generator yielding the combinatorial test sets. 
- -##### `method __init__` - -```python -def __init__self, n_splits: int, n_test_groups: int, times: Union[pd.Series, Dict[str, pd.Series]], embargo: float=0: -``` - -> Initialize the CombinatorialPurged class. - -Parameters ----------- -n_splits : int - Number of splits/groups to partition the data into. -n_test_groups : int - Size of the testing set in terms of groups. -times : Union[pd.Series, Dict[str, pd.Series]] - The timestamp series associated with the labels. -embargo : float - The embargo rate for purging. - -##### `method get_n_splits` - -```python -def get_n_splitsself, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]=None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None: -``` - -> Return number of splits. - -:param data: Dataset or dictionary of datasets. -:type data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]] - -:param labels: Labels or dictionary of labels. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] - -:param groups: Group labels for the samples. -:type groups: Optional[np.ndarray] - -:return: Number of splits. -:rtype: int - -##### `method _single_split` - -```python -def _single_splitself, single_times: np.ndarray, single_data: np.ndarray: -``` - -> Splits data into train and test indices based on the defined combinatorial splits. - -This function is used to generate multiple train-test splits based on the combinatorial -cross-validation method. It ensures that each train-test split is properly purged and -embargoed to prevent data leakage. - -:param single_times: Timestamp series associated with the labels. -:param single_data: The input data to be split. - -:return: Generator that yields tuples of (train indices, test indices). - -.. note:: The function validates the input, and uses combinatorial cross-validation method to - produce the train-test splits. 
- -##### `method _combinations_and_path_locations_and_split_segments` - -```python -def _combinations_and_path_locations_and_split_segmentsself, data: pd.DataFrame: -``` - -> Generate combinations, path locations, and split segments for the data. - -This function is a helper that computes necessary components for combinatorial cross-validation. - -:param data: The input dataframe to generate combinations, path locations, and split segments. - -:return: Tuple containing combinations, path locations, and split segments. - -.. math:: -\text{combinations} = \binom{n}{k} - -##### `method _single_backtest_paths` - -```python -def _single_backtest_pathsself, single_times: pd.Series, single_data: pd.DataFrame: -``` - -> Generate the backtest paths for given input data. - -This function creates multiple backtest paths based on combinatorial splits, where -each path represents a sequence of train-test splits. It ensures that data leakage -is prevented by purging and applying embargo to the train-test splits. - -:param single_times: Timestamp series associated with the data. -:param single_data: Input data on which the backtest paths are based. - -:return: A dictionary where each key is a backtest path name, and the value is - a list of dictionaries with train and test index arrays. - -.. note:: This function relies on combinatorial cross-validation for backtesting to - generate multiple paths of train-test splits. - -##### `method _single_backtest_predictions` - -```python -def _single_backtest_predictionsself, single_estimator: Any, single_times: pd.Series, single_data: pd.DataFrame, single_labels: pd.Series, single_weights: np.ndarray, predict_probability: bool=False, n_jobs: int=1: -``` - -> Generate predictions for a single backtest using combinatorial splits. - -This method calculates predictions across various paths created by combinatorial splits -of the data. 
For each combinatorial split, a separate estimator is trained and then used -to predict on the corresponding test set. - -:param single_estimator: The machine learning model or estimator to be trained. -:param single_times: Timestamps corresponding to the data points. -:param single_data: Input data on which the model is trained and predictions are made. -:param single_labels: Labels corresponding to the input data. -:param single_weights: Weights for each data point. -:param predict_probability: If True, predict the probability of forecasts. -:type predict_probability: bool -:param n_jobs: Number of CPU cores to use for parallelization. Default is 1. - -:return: A dictionary where keys are path names and values are arrays of predictions. - -.. note:: This function relies on internal methods (e.g., `_get_train_indices`) - to manage data splits and training. - -.. note:: Parallelization is used to speed up the training of models for different splits. - - -### πŸ“„ `RiskLabAI\backtest\validation\cross_validator_controller.py` - -#### `class CrossValidatorController` - -> Controller class to handle the cross-validation process. - -##### `method __init__` - -```python -def __init__self, validator_type: str, **kwargs: -``` - -> Initializes the CrossValidatorController. - -:param validator_type: Type of cross-validator to create and use. - This is passed to the factory to instantiate the appropriate cross-validator. -:type validator_type: str - -:param kwargs: Additional keyword arguments to be passed to the cross-validator's constructor. -:type kwargs: Type - - -### πŸ“„ `RiskLabAI\backtest\validation\cross_validator_factory.py` - -#### `class CrossValidatorFactory` - -> Factory class for creating cross-validator objects. - -##### `method create_cross_validator` - -```python -def create_cross_validatorvalidator_type: str, **kwargs: -``` - -> Factory method to create and return an instance of a cross-validator -based on the provided type. 
- -:param validator_type: Type of cross-validator to create. Options include - 'kfold', 'walkforward', 'purgedkfold', and 'combinatorialpurged'. -:type validator_type: str - -:param kwargs: Additional keyword arguments to be passed to the cross-validator's constructor. -:type kwargs: Type - -:return: An instance of the specified cross-validator. -:rtype: CrossValidator - -:raises ValueError: If an invalid validator type is provided. - - -### 📄 `RiskLabAI\backtest\validation\cross_validator_interface.py` - -#### `class CrossValidator` - -> Abstract Base Class (ABC) for cross-validation strategies. -Handles both single data inputs and dictionary inputs. - -:param data: The input data, either as a single DataFrame or a dictionary of DataFrames. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] - -:param labels: The labels corresponding to the data, either as a single Series or a dictionary of Series. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] - -:param groups: Optional group labels for stratified splitting. -:type groups: Optional[np.ndarray] - -##### `method get_n_splits` - -```python -def get_n_splits(self, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]=None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None): -``` - -> Return number of splits. - -:param data: Dataset or dictionary of datasets. -:type data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]] - -:param labels: Labels or dictionary of labels. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] - -:param groups: Group labels for the samples. -:type groups: Optional[np.ndarray] - -:return: Number of splits. -:rtype: int - -##### `method _single_split` - -```python -def _single_split(self, single_data: pd.DataFrame): -``` - -> Splits a single data set into train-test indices. 
- -This function provides train-test indices to split the data into train/test sets -by respecting the time order (if applicable) and the specified number of splits. - -:param single_data: Input dataset. -:type single_data: pd.DataFrame - -:return: Generator yielding train-test indices. -:rtype: Generator[Tuple[np.ndarray, np.ndarray], None, None] - -##### `method split` - -```python -def splitself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None: -``` - -> Splits data or a dictionary of data into train-test indices. - -This function returns a generator that yields train-test indices. If a dictionary -of data is provided, the generator yields a key followed by the train-test indices. - -:param data: Dataset or dictionary of datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Labels or dictionary of labels. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] -:param groups: Group labels for the samples. -:type groups: Optional[np.ndarray] - -:return: Generator yielding either train-test indices directly or a key - followed by train-test indices. -:rtype: Union[ - Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None] -] - -##### `method _single_backtest_paths` - -```python -def _single_backtest_pathsself, single_data: pd.DataFrame: -``` - -> Generates backtest paths for a single dataset. - -This function creates and returns backtest paths (i.e., combinations of training and test sets) -for a single dataset by applying k-fold splitting or any other splitting strategy defined -by the `_single_split` function. - -:param single_data: Input dataset. -:type single_data: pd.DataFrame - -:return: Dictionary of backtest paths. 
-:rtype: Dict[str, List[Dict[str, List[np.ndarray]]]] - -##### `method backtest_paths` - -```python -def backtest_pathsself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]: -``` - -> Generates backtest paths for data. - -This function returns backtest paths for either a single dataset or a dictionary -of datasets. Each backtest path consists of combinations of training and test sets. - -:param data: Dataset or dictionary of datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Labels or dictionary of labels. -:type labels: Union[pd.Series, Dict[str, pd.Series]] - -:return: Dictionary of backtest paths or dictionary of dictionaries for multiple datasets. -:rtype: Union[ - Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, List[np.ndarray]]]]] -] - -##### `method _single_backtest_predictions` - -```python -def _single_backtest_predictionsself, single_estimator: Any, single_data: pd.DataFrame, single_labels: pd.Series, single_weights: Optional[np.ndarray]=None, n_jobs: int=1: -``` - -> Obtain predictions for a single dataset during backtesting. - -This function leverages parallel computation to train and predict on different train-test splits -of a single dataset using a given estimator. It utilizes the `_single_split` method to generate -the train-test splits. - -:param single_estimator: Estimator or model to be trained and used for predictions. -:type single_estimator: Any -:param single_data: Data of the single dataset. -:type single_data: pd.DataFrame -:param single_labels: Labels corresponding to the single dataset. -:type single_labels: pd.Series -:param single_weights: Weights for the observations in the single dataset. - Defaults to equally weighted if not provided. -:type single_weights: np.ndarray, optional -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional -:return: Predictions structured in a dictionary for the backtest paths. 
-:rtype: Dict[str, np.ndarray] - -##### `method backtest_predictions` - -```python -def backtest_predictionsself, estimator: Union[Any, Dict[str, Any]], data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]=None, predict_probability: bool=False, n_jobs: int=1: -``` - -> Generate backtest predictions for single or multiple datasets. - -For each dataset, this function leverages the `_single_backtest_predictions` method to obtain -predictions for different train-test splits using the given estimator. - -:param estimator: Model or estimator to be trained and used for predictions. - Can be a single estimator or a dictionary of estimators for multiple datasets. -:type estimator: Union[Any, Dict[str, Any]] -:param data: Input data for training and testing. Can be a single dataset or - a dictionary of datasets for multiple datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Target labels for training and testing. Can be a single series or - a dictionary of series for multiple datasets. -:type labels: Union[pd.Series, Dict[str, pd.Series]] -:param sample_weights: Weights for the observations in the dataset(s). - Can be a single array or a dictionary of arrays for multiple datasets. - Defaults to None, which means equal weights for all observations. -:type sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional -:return: Backtest predictions structured in a dictionary (or nested dictionaries for multiple datasets). -:rtype: Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]] - - -### πŸ“„ `RiskLabAI\backtest\validation\kfold.py` - -#### `class KFold` - -> K-Fold cross-validator. - -This class implements the K-Fold cross-validation strategy, where the dataset is -divided into `k` consecutive folds. 
Each fold is then used once as a validation set -while the `k - 1` remaining folds form the training set. - -##### `method __init__` - -```python -def __init__self, n_splits: int, shuffle: bool=False, random_seed: int=None: -``` - -> Initialize the K-Fold cross-validator. - -:param n_splits: Number of splits or folds for the cross-validation. - The dataset will be divided into `n_splits` consecutive parts. -:type n_splits: int -:param shuffle: Whether to shuffle the data before splitting it into folds. - If `shuffle` is set to True, the data will be shuffled before splitting. -:type shuffle: bool, optional -:param random_seed: Seed used for random shuffling. Set this seed for reproducibility. - Only used when `shuffle` is True. -:type random_seed: int, optional - -##### `method get_n_splits` - -```python -def get_n_splitsself, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]=None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None: -``` - -> Return number of splits. - -:param data: Dataset or dictionary of datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] - -:param labels: Labels or dictionary of labels. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] - -:param groups: Group labels for the samples. -:type groups: Optional[np.ndarray] - -:return: Number of splits. -:rtype: int - -##### `method _single_split` - -```python -def _single_splitself, single_data: pd.DataFrame: -``` - -> Splits a single data set into train-test indices. - -This function provides train-test indices to split the data into train/test sets -by respecting the time order (if applicable) and the specified number of splits. - -:param single_data: Input dataset. -:type single_data: pd.DataFrame - -:return: Generator yielding train-test indices. 
-:rtype: Generator[Tuple[np.ndarray, np.ndarray], None, None] - -##### `method split` - -```python -def splitself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None: -``` - -> Splits data or a dictionary of data into train-test indices. - -This function returns a generator that yields train-test indices. If a dictionary -of data is provided, the generator yields a key followed by the train-test indices. - -:param data: Dataset or dictionary of datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Labels or dictionary of labels. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] -:param groups: Group labels for the samples. -:type groups: Optional[np.ndarray] - -:return: Generator yielding either train-test indices directly or a key - followed by train-test indices. -:rtype: Union[ - Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None] -] - -##### `method _single_backtest_paths` - -```python -def _single_backtest_pathsself, single_data: pd.DataFrame: -``` - -> Generates backtest paths for a single dataset. - -This function creates and returns backtest paths (i.e., combinations of training and test sets) -for a single dataset by applying k-fold splitting or any other splitting strategy defined -by the `_single_split` function. - -:param single_data: Input dataset. -:type single_data: pd.DataFrame - -:return: Dictionary of backtest paths. -:rtype: Dict[str, List[Dict[str, List[np.ndarray]]]] - -##### `method backtest_paths` - -```python -def backtest_pathsself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]: -``` - -> Generates backtest paths for data. - -This function returns backtest paths for either a single dataset or a dictionary -of datasets. Each backtest path consists of combinations of training and test sets. 
- -:param data: Dataset or dictionary of datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] - -:return: Dictionary of backtest paths or dictionary of dictionaries for multiple datasets. -:rtype: Union[ - Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, List[np.ndarray]]]]] -] - -##### `method _single_backtest_predictions` - -```python -def _single_backtest_predictionsself, single_estimator: Any, single_data: pd.DataFrame, single_labels: pd.Series, single_weights: Optional[np.ndarray]=None, predict_probability: bool=False, n_jobs: int=1: -``` - -> Obtain predictions for a single dataset during backtesting. - -This function leverages parallel computation to train and predict on different train-test splits -of a single dataset using a given estimator. It utilizes the `_single_split` method to generate -the train-test splits. - -:param single_estimator: Estimator or model to be trained and used for predictions. -:type single_estimator: Any -:param single_data: Data of the single dataset. -:type single_data: pd.DataFrame -:param single_labels: Labels corresponding to the single dataset. -:type single_labels: pd.Series -:param single_weights: Weights for the observations in the single dataset. - Defaults to equally weighted if not provided. -:type single_weights: np.ndarray, optional -:param predict_probability: If True, predict the probability of forecasts. -:type predict_probability: bool -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional -:return: Predictions structured in a dictionary for the backtest paths. 
-:rtype: Dict[str, np.ndarray] - -##### `method backtest_predictions` - -```python -def backtest_predictionsself, estimator: Union[Any, Dict[str, Any]], data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]=None, predict_probability: bool=False, n_jobs: int=1: -``` - -> Generate backtest predictions for single or multiple datasets. - -For each dataset, this function leverages the `_single_backtest_predictions` method to obtain -predictions for different train-test splits using the given estimator. - -:param estimator: Model or estimator to be trained and used for predictions. - Can be a single estimator or a dictionary of estimators for multiple datasets. -:type estimator: Union[Any, Dict[str, Any]] -:param data: Input data for training and testing. Can be a single dataset or - a dictionary of datasets for multiple datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Target labels for training and testing. Can be a single series or - a dictionary of series for multiple datasets. -:type labels: Union[pd.Series, Dict[str, pd.Series]] -:param sample_weights: Weights for the observations in the dataset(s). - Can be a single array or a dictionary of arrays for multiple datasets. - Defaults to None, which means equal weights for all observations. -:type sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] -:param predict_probability: If True, predict the probability of forecasts. -:type predict_probability: bool -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional - -:return: Backtest predictions structured in a dictionary (or nested dictionaries for multiple datasets). 
-:rtype: Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]] - - -### πŸ“„ `RiskLabAI\backtest\validation\purged_kfold.py` - -#### `class PurgedKFold` - -##### `method filtered_training_indices_with_embargo` - -```python -def filtered_training_indices_with_embargodata_info_range: pd.Series, test_time_range: pd.Series, embargo_fraction: float=0, continous_test_times: bool=False: -``` - -> Purge observations in the training set with embargo. - -Finds the training set indices based on the information on each record -and the test set range. It purges the training set of observations that -overlap with the test set in the time dimension and adds an embargo period -to further prevent potential information leakage. - -.. math:: - \text{embargo\_length} = \text{len(data\_info\_range)} \times \text{embargo\_fraction} - -:param data_info_range: Series detailing the information range for each record. - - *data_info_range.index*: Time when the information extraction started. - - *data_info_range.value*: Time when the information extraction ended. -:type data_info_range: pd.Series -:param test_time_range: Series containing times for the test dataset. -:type test_time_range: pd.Series -:param embargo_fraction: Fraction of the dataset trailing the test observations to exclude from training. -:type embargo_fraction: float -:param continuous_test_times: If set to True, considers the test time range as continuous. -:type continuous_test_times: bool - -:return: Series of filtered training data after applying embargo. -:rtype: pd.Series - -##### `method __init__` - -```python -def __init__self, n_splits: int, times: Union[pd.Series, Dict[str, pd.Series]], embargo: float=0: -``` - -> Purged k-fold cross-validation to prevent information leakage. - -Implements a cross-validation strategy where each fold is purged -of observations overlapping with the training set in the time dimension. -An embargo period is also introduced to further prevent potential -information leakage. 
- -Attributes: - n_splits (int): Number of splits/folds. - times (Union[pd.Series, Dict[str, pd.Series]]): Series or dict containing time data. - embargo (float): The embargo period. - is_multiple_datasets (bool): True if `times` is a dict, else False. - -:param n_splits: Number of splits or folds. -:type n_splits: int - -:param times: Series detailing the information range for each record. - - *times.index*: Time when the information extraction started. - - *times.value*: Time when the information extraction ended. -:type times: pd.Series - -:param embargo: The embargo period to further prevent potential - information leakage. -:type embargo: float - -##### `method get_n_splits` - -```python -def get_n_splitsself, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]]=None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None: -``` - -> Return number of splits. - -:param data: Dataset or dictionary of datasets. -:type data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]] - -:param labels: Labels or dictionary of labels. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] - -:param groups: Group labels for the samples. -:type groups: Optional[np.ndarray] - -:return: Number of splits. -:rtype: int - -##### `method _validate_input` - -```python -def _validate_inputself, single_times: pd.Series, single_data: pd.DataFrame: -``` - -> Validate that the input data and times share the same index. - -This function checks if the provided data and its corresponding times -have the same index. If they do not match, it raises a `ValueError`. - -:param single_times: Time series data to be validated. -:type single_times: pd.Series -:param single_data: Dataset with which the times should align. -:type single_data: pd.DataFrame -:raises ValueError: If the indices of the data and times do not match. 
-:return: None - -##### `method _get_train_indices` - -```python -def _get_train_indicesself, test_indices: np.ndarray, single_times: pd.Series, continous_test_times: bool=False: -``` - -> Obtain the training indices considering purging and embargo. - -This function retrieves the training set indices based on the given test indices -while considering the purging and embargo strategy. - -:param test_indices: Indices used for the test set. -:type test_indices: np.ndarray -:param single_times: Time series data used for purging and embargo. -:type single_times: pd.Series -:return: Training indices after applying purging and embargo. -:rtype: np.ndarray - -##### `method _single_split` - -```python -def _single_splitself, single_times: pd.Series, single_data: pd.DataFrame: -``` - -> Split the data into train and test sets. - -This function splits the data for a single dataset considering purging and embargo. - -:param single_times: Time series data used for purging and embargo. -:type single_times: pd.Series -:param single_data: Dataset to split. -:type single_data: pd.DataFrame -:return: Train and test indices. -:rtype: Generator[Tuple[np.ndarray, np.ndarray], None, None] - -##### `method split` - -```python -def splitself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]]=None, groups: Optional[np.ndarray]=None: -``` - -> Split multiple datasets into train and test sets. - -This function either splits a single dataset or multiple datasets considering -purging and embargo. - -:param data: Dataset or dictionary of datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Labels corresponding to the datasets, if available. -:type labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] -:param groups: Group information, if available. -:type groups: Optional[np.ndarray] -:return: Train and test indices or key with train and test indices for multiple datasets. 
-:rtype: Union[Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None]] - -##### `method _single_backtest_paths` - -```python -def _single_backtest_pathsself, single_times: pd.Series, single_data: pd.DataFrame: -``` - -> Generate backtest paths based on training and testing indices. - -This function first validates the input data and times. Then, it generates -the training and testing indices for backtesting. These paths are organized -into a dictionary with a designated name for each backtest path. - -:param single_times: Time series data for validation. -:type single_times: pd.Series -:param single_data: Dataset with which the times should align. -:type single_data: pd.DataFrame -:return: Dictionary containing the backtest paths with training and testing indices. -:rtype: Dict[str, List[Dict[str, np.ndarray]]] - -##### `method backtest_paths` - -```python -def backtest_pathsself, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]: -``` - -> Generate backtest paths for single or multiple datasets. - -This function checks whether multiple datasets are being used. If so, it iterates through each -dataset, generating backtest paths using the `_single_backtest_paths` method. Otherwise, it directly -returns the backtest paths for the single dataset. - -:param data: Input data on which the backtest paths are based. - Can be either a single DataFrame or a dictionary of DataFrames for multiple datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] - -:return: A dictionary where each key is a backtest path name, and the value is - a list of dictionaries with train and test index arrays. For multiple datasets, - a nested dictionary structure is returned. 
-:rtype: Union[ - Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, List[np.ndarray]]]]] -] - -##### `method _single_backtest_predictions` - -```python -def _single_backtest_predictionsself, single_estimator: Any, single_times: pd.Series, single_data: pd.DataFrame, single_labels: pd.Series, single_weights: Optional[np.ndarray]=None, predict_probability: bool=False, n_jobs: int=1: -``` - -> Obtain predictions for a single dataset during backtesting. - -This function leverages parallel computation to train and predict on different train-test splits -of a single dataset using a given estimator. It utilizes the `_single_split` method to generate -the train-test splits. - -:param single_estimator: Estimator or model to be trained and used for predictions. -:type single_estimator: Any -:param single_times: Timestamps for the single dataset. -:type single_times: pd.Series -:param single_data: Data of the single dataset. -:type single_data: pd.DataFrame -:param single_labels: Labels corresponding to the single dataset. -:type single_labels: pd.Series -:param single_weights: Weights for the observations in the single dataset. - Defaults to equally weighted if not provided. -:type single_weights: np.ndarray, optional -:param predict_probability: If True, predict the probability of forecasts. -:type predict_probability: bool -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional -:return: Predictions structured in a dictionary for the backtest paths. -:rtype: Dict[str, np.ndarray] - -##### `method backtest_predictions` - -```python -def backtest_predictionsself, estimator: Union[Any, Dict[str, Any]], data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]=None, predict_probability: bool=False, n_jobs: int=1: -``` - -> Generate backtest predictions for single or multiple datasets. 
- -For each dataset, this function leverages the `_single_backtest_predictions` method to obtain -predictions for different train-test splits using the given estimator. - -:param estimator: Model or estimator to be trained and used for predictions. - Can be a single estimator or a dictionary of estimators for multiple datasets. -:type estimator: Union[Any, Dict[str, Any]] -:param data: Input data for training and testing. Can be a single dataset or - a dictionary of datasets for multiple datasets. -:type data: Union[pd.DataFrame, Dict[str, pd.DataFrame]] -:param labels: Target labels for training and testing. Can be a single series or - a dictionary of series for multiple datasets. -:type labels: Union[pd.Series, Dict[str, pd.Series]] -:param sample_weights: Weights for the observations in the dataset(s). - Can be a single array or a dictionary of arrays for multiple datasets. - Defaults to None, which means equal weights for all observations. -:type sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] -:param predict_probability: If True, predict the probability of forecasts. -:type predict_probability: bool -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional -:return: Backtest predictions structured in a dictionary (or nested dictionaries for multiple datasets). -:rtype: Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]] - - -### πŸ“„ `RiskLabAI\backtest\validation\walk_forward.py` - -#### `class WalkForward` - -> WalkForward Cross-Validator for Time Series Data. - -This cross-validator provides train/test indices meant to split time series data -in a "walk-forward" manner, which is suitable for time series forecasting tasks. -In each split, the training set progressively grows in size (subject to the optional -maximum size constraint) while the test set remains roughly constant in size. 
-A gap can be optionally introduced between the training and test set to simulate -forecasting on unseen future data after a certain interval. - -The WalkForward cross-validator is inherently different from traditional K-Fold -cross-validation which shuffles and splits the dataset into train/test without -considering the time order. In time series tasks, ensuring that the model is -trained on past data and validated on future data is crucial. This cross-validator -achieves that by progressively walking forward in time through the dataset. - -##### `method __init__` - -```python -def __init__self, n_splits: int=5, max_train_size: int=None, gap: int=0: -``` - -> Initialize the TimeSeriesWalkForward cross-validator. - -Parameters: ------------ -n_splits : int, default=5 - Number of splits/folds. Must be at least 2. - -max_train_size : int, optional - Maximum number of observations allowed in the training dataset. - If provided, the most recent `max_train_size` observations are used - for training. - -gap : int, default=0 - Number of observations to skip between the end of the training data - and the start of the test data. Useful for simulating forecasting - scenarios where the test data is not immediately after the training data. - -##### `method _single_split` - -```python -def _single_splitself, single_data: pd.DataFrame: -``` - -> Splits a single data set into train-test indices. - -This function provides train-test indices to split the data into train/test sets -by respecting the time order (if applicable) and the specified number of splits. - -:param single_data: Input dataset. -:type single_data: pd.DataFrame - -:return: Generator yielding train-test indices. 
-:rtype: Generator[Tuple[np.ndarray, np.ndarray], None, None] - -##### `method _single_backtest_predictions` - -```python -def _single_backtest_predictionsself, single_estimator: Any, single_data: pd.DataFrame, single_labels: pd.Series, single_weights: Optional[np.ndarray]=None, predict_probability: bool=False, n_jobs: int=1: -``` - -> Obtain predictions for a single dataset during backtesting. - -This function leverages parallel computation to train and predict on different train-test splits -of a single dataset using a given estimator. It utilizes the `_single_split` method to generate -the train-test splits. - -:param single_estimator: Estimator or model to be trained and used for predictions. -:type single_estimator: Any -:param single_data: Data of the single dataset. -:type single_data: pd.DataFrame -:param single_labels: Labels corresponding to the single dataset. -:type single_labels: pd.Series -:param single_weights: Weights for the observations in the single dataset. - Defaults to equally weighted if not provided. -:type single_weights: np.ndarray, optional -:param predict_probability: If True, predict the probability of forecasts. -:type predict_probability: bool -:param n_jobs: The number of jobs to run in parallel. Default is 1. -:type n_jobs: int, optional -:return: Predictions structured in a dictionary for the backtest paths. -:rtype: Dict[str, np.ndarray] - - -### πŸ“„ `RiskLabAI\cluster\clustering.py` - -#### `function covariance_to_correlation` - -```python -def covariance_to_correlationcovariance: np.ndarray: -``` - -> Derive the correlation matrix from a covariance matrix. - -.. math:: - \\text{correlation}_{ij} = \\frac{\\text{covariance}_{ij}}{\\sqrt{\\text{covariance}_{ii} \\text{covariance}_{jj}}} - -Reference: - De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Snippet 2.3, Page 27 - -:param covariance: Covariance matrix. - -:return: Correlation matrix. 
- -#### `function cluster_k_means_base` - -```python -def cluster_k_means_basecorrelation: pd.DataFrame, max_clusters: int=10, iterations: int=10: -``` - -> Clustering using K-Means. - -Reference: - De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Snippet 4.1, Page 56 - -:param correlation: Correlation matrix. -:param max_clusters: Maximum number of clusters. -:param iterations: Number of iterations for clustering. - -:return: Tuple containing the sorted correlation matrix, clusters, and silhouette scores. - -#### `function make_new_outputs` - -```python -def make_new_outputscorrelation: pd.DataFrame, clusters_1: dict, clusters_2: dict: -``` - -> Merge two clusters and produce new correlation matrix and silhouette scores. -Clusters are disjoint. - -Reference: - De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Snippet 4.2, Page 58 - -:param correlation: Correlation matrix. -:param clusters_1: First cluster. -:param clusters_2: Second cluster. - -:return: Tuple containing the new correlation matrix, new clusters, and new silhouette scores. - -#### `function cluster_k_means_top` - -```python -def cluster_k_means_topcorrelation: pd.DataFrame, max_clusters: int=None, iterations: int=10: -``` - -> Clustering using ONC method. - -Reference: - De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Snippet 4.2, Page 58 - -:param correlation: Correlation matrix. -:param max_clusters: Maximum Number of clusters. -:param iterations: Number of iterations. - -:return: Tuple containing the sorted correlation matrix, clusters, and silhouette scores. - -#### `function random_covariance_sub` - -```python -def random_covariance_subn_observations: int, n_columns: int, sigma: float, random_state: int=None: -``` - -> Generates covariance matrix for n_columns same normal random variables with a nomral noise scaled by sigma. -Variables have n_observations observations. - -Reference: - De Prado, M. 
(2020) Advances in financial machine learning. John Wiley & Sons. - Snippet 4.3, Page 61 - -:param n_observations: Number of observations. -:param n_columns: Number of columns. -:param sigma: Sigma for normal distribution. -:param random_state: Random state for reproducibility. - -:return: Sub covariance matrix. - -#### `function random_block_covariance` - -```python -def random_block_covariancen_columns: int, n_blocks: int, block_size_min: int=1, sigma: float=1.0, random_state: int=None: -``` - -> Compute random block covariance matrix. - -Reference: - De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Snippet 4.3, Page 61 - -:param n_columns: Number of columns. -:param n_blocks: Number of blocks. -:param block_size_min: Minimum size of block. -:param sigma: Sigma for normal distribution. -:param random_state: Random state for reproducibility. - -:return: Random block covariance matrix. - -#### `function random_block_correlation` - -```python -def random_block_correlationn_columns: int, n_blocks: int, random_state: int=None, block_size_min: int=1: -``` - -> Compute random block correlation matrix. - -Reference: - De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Snippet 4.3, Page 61 - -:param n_columns: Number of columns. -:param n_blocks: Number of blocks. -:param random_state: Random state for reproducibility. -:param block_size_min: Minimum size of block. - -:return: Random block correlation matrix. - - -### πŸ“„ `RiskLabAI\controller\bars_initializer.py` - -#### `class BarsInitializerController` - -> Controller for initializing various types of bars. 
- -##### `method __init__` - -```python -def __init__self: -``` -##### `method initialize_expected_dollar_imbalance_bars` - -```python -def initialize_expected_dollar_imbalance_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, expected_ticks_number_bounds: Tuple[float]=None, analyze_thresholds: bool=False: -``` - -> Initialize expected dollar imbalance bars. - -:param window_size_for_expected_n_ticks_estimation: The window size for estimating the expected number of ticks. -:param window_size_for_expected_imbalance_estimation: The window size for estimating the expected imbalance. -:param initial_estimate_of_expected_n_ticks_in_bar: The initial estimate for the expected number of ticks in a bar. -:param expected_ticks_number_bounds: Bounds for the expected number of ticks in a bar. -:param analyze_thresholds: Flag indicating whether to analyze thresholds. - -:return: An instance of ExpectedImbalanceBars. 
- -##### `method initialize_expected_volume_imbalance_bars` - -```python -def initialize_expected_volume_imbalance_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, expected_ticks_number_bounds: Tuple[float]=None, analyse_thresholds: bool=False: -``` -##### `method initialize_expected_tick_imbalance_bars` - -```python -def initialize_expected_tick_imbalance_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, expected_ticks_number_bounds: Tuple[float]=None, analyse_thresholds: bool=False: -``` -##### `method initialize_fixed_dollar_imbalance_bars` - -```python -def initialize_fixed_dollar_imbalance_barswindow_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, analyse_thresholds: bool=False: -``` -##### `method initialize_fixed_volume_imbalance_bars` - -```python -def initialize_fixed_volume_imbalance_barswindow_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, analyse_thresholds: bool=False: -``` -##### `method initialize_fixed_tick_imbalance_bars` - -```python -def initialize_fixed_tick_imbalance_barswindow_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, analyse_thresholds: bool=False: -``` -##### `method initialize_expected_dollar_run_bars` - -```python -def initialize_expected_dollar_run_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, expected_ticks_number_bounds: Tuple[float]=None, analyse_thresholds: bool=False: -``` -##### `method initialize_expected_volume_run_bars` - -```python -def 
initialize_expected_volume_run_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, expected_ticks_number_bounds: Tuple[float]=None, analyse_thresholds: bool=False: -``` -##### `method initialize_expected_tick_run_bars` - -```python -def initialize_expected_tick_run_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, expected_ticks_number_bounds: Tuple[float]=None, analyse_thresholds: bool=False: -``` -##### `method initialize_fixed_dollar_run_bars` - -```python -def initialize_fixed_dollar_run_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, analyse_thresholds: bool=False: -``` -##### `method initialize_fixed_volume_run_bars` - -```python -def initialize_fixed_volume_run_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, analyse_thresholds: bool=False: -``` -##### `method initialize_fixed_tick_run_bars` - -```python -def initialize_fixed_tick_run_barswindow_size_for_expected_n_ticks_estimation: int=3, window_size_for_expected_imbalance_estimation: int=10000, initial_estimate_of_expected_n_ticks_in_bar: int=20000, analyse_thresholds: bool=False: -``` -##### `method initialize_dollar_standard_bars` - -```python -def initialize_dollar_standard_barsthreshold: Union[float, pd.Series]=70000000: -``` -##### `method initialize_volume_standard_bars` - -```python -def initialize_volume_standard_barsthreshold: Union[float, pd.Series]=30000: -``` -##### `method initialize_tick_standard_bars` - -```python -def initialize_tick_standard_barsthreshold: Union[float, pd.Series]=6000: -``` -##### `method 
initialize_time_bars` - -```python -def initialize_time_barsresolution_type: str='D', resolution_units: int=1: -``` - -### πŸ“„ `RiskLabAI\controller\data_structure_controller.py` - -#### `class Controller` - -##### `method __init__` - -```python -def __init__self: -``` -##### `method handle_input_command` - -```python -def handle_input_commandself, method_name: str, method_arguments: dict, input_data: Union[str, pd.DataFrame], output_path: Optional[str]=None, batch_size: int=1000000: -``` - -> Handles the input command to initialize bars and run on batches. - -:param method_name: Name of the method to call -:param method_arguments: Arguments for the method -:param input_data: Input data as a DataFrame or string path -:param output_path: Optional path to save results as CSV -:param batch_size: Size of each batch to process -:return: DataFrame of aggregated bars - -##### `method run_on_batches` - -```python -def run_on_batchesself, initialized_bars: AbstractBars, input_data: Union[str, pd.DataFrame], batch_size: int, output_path: Optional[str]=None: -``` - -> Runs the initialized bars on batches of data. - -:param initialized_bars: Initialized bars object -:param input_data: Input data as DataFrame or string path -:param batch_size: Size of each batch to process -:param output_path: Optional path to save results as CSV -:return: DataFrame of aggregated bars - -##### `method construct_bars_from_batch` - -```python -def construct_bars_from_batchbars: AbstractBars, data: pd.DataFrame: -``` - -> Construct bars from a single batch of data. - -:param bars: Initialized bars object -:param data: Data for this batch as a DataFrame -:return: List of constructed bars - -##### `method read_batches_from_string` - -```python -def read_batches_from_stringinput_path: str, batch_size: int: -``` - -> Reads data in batches from a CSV file. 
- -:param input_path: File path to read from -:param batch_size: Size of each batch -:return: Generator yielding batches of data - -##### `method read_batches_from_dataframe` - -```python -def read_batches_from_dataframeinput_data: pd.DataFrame, batch_size: int: -``` - -> Reads data in batches from a DataFrame. - -:param input_data: DataFrame to read from -:param batch_size: Size of each batch -:return: Generator yielding batches of data - - -### πŸ“„ `RiskLabAI\data\denoise\denoising.py` - -#### `function marcenko_pastur_pdf` - -```python -def marcenko_pastur_pdfvariance: float, q: float, num_points: int: -``` - -> Computes the Marcenko-Pastur probability density function (pdf). - -:param variance: Variance of the observations -:type variance: float -:param q: Ratio T/N -:type q: float -:param num_points: Number of points in the pdf -:type num_points: int -:return: The Marcenko-Pastur pdf as a pandas Series -:rtype: pd.Series - -The Marcenko-Pastur pdf is given by the formula: -.. math:: - \frac{q}{{2 \pi \sigma \lambda}} \sqrt{(\lambda_{max} - \lambda)(\lambda - \lambda_{min})} - -where: -- :math:`\lambda` is the eigenvalue -- :math:`\sigma` is the variance of the observations -- :math:`q` is the ratio T/N -- :math:`\lambda_{max}` and :math:`\lambda_{min}` are the maximum and minimum eigenvalues respectively - -#### `function pca` - -```python -def pcamatrix: np.ndarray: -``` - -> Computes the principal component analysis of a Hermitian matrix. - -:param matrix: Hermitian matrix -:type matrix: np.ndarray -:return: Eigenvalues and eigenvectors -:rtype: Tuple[np.ndarray, np.ndarray] - -The principal component analysis is computed using the eigen decomposition of the Hermitian matrix. - -#### `function fit_kde` - -```python -def fit_kdeobservations: Union[np.ndarray, pd.Series], bandwidth: float=0.25, kernel: str='gaussian', x: Optional[Union[np.ndarray, pd.Series]]=None: -``` - -> Fit a kernel density estimator to a series of observations. 
- -:param observations: Series of observations -:type observations: Union[np.ndarray, pd.Series] -:param bandwidth: Bandwidth of the kernel -:type bandwidth: float -:param kernel: Type of kernel to use (e.g., 'gaussian') -:type kernel: str -:param x: Array of values on which the fit KDE will be evaluated -:type x: Optional[Union[np.ndarray, pd.Series]] -:return: Kernel density estimate as a pandas Series -:rtype: pd.Series - -#### `function random_cov` - -```python -def random_covnum_columns: int, num_factors: int: -``` - -> Generate a random covariance matrix. - -:param num_columns: Number of columns in the covariance matrix -:type num_columns: int -:param num_factors: Number of factors for random covariance matrix -:type num_factors: int -:return: Random covariance matrix -:rtype: np.ndarray - -#### `function cov_to_corr` - -```python -def cov_to_corrcov: np.ndarray: -``` - -> Convert a covariance matrix to a correlation matrix. - -:param cov: Covariance matrix -:type cov: np.ndarray -:return: Correlation matrix -:rtype: np.ndarray - -#### `function error_pdfs` - -```python -def error_pdfsvariance: float, eigenvalues: np.ndarray, q: float, bandwidth: float, num_points: int=1000: -``` - -> Computes the sum of squared errors between the theoretical and empirical PDFs. - -:param variance: Variance of the observations -:type variance: float -:param eigenvalues: Eigenvalues of the correlation matrix -:type eigenvalues: np.ndarray -:param q: Ratio T/N -:type q: float -:param bandwidth: Bandwidth of the kernel -:type bandwidth: float -:param num_points: Number of points in the PDF -:type num_points: int -:return: Sum of squared errors between the theoretical and empirical PDFs -:rtype: float - -#### `function find_max_eval` - -```python -def find_max_evaleigenvalues: np.ndarray, q: float, bandwidth: float: -``` - -> Find the maximum random eigenvalue by fitting the Marcenko-Pastur distribution. 
- -:param eigenvalues: Eigenvalues of the correlation matrix -:type eigenvalues: np.ndarray -:param q: Ratio T/N -:type q: float -:param bandwidth: Bandwidth of the kernel -:type bandwidth: float -:return: Maximum random eigenvalue and its variance -:rtype: Tuple[float, float] - -#### `function denoised_corr` - -```python -def denoised_correigenvalues: np.ndarray, eigenvectors: np.ndarray, num_factors: int: -``` - -> Remove noise from the correlation matrix by fixing random eigenvalues. - -:param eigenvalues: Eigenvalues of the correlation matrix -:type eigenvalues: np.ndarray -:param eigenvectors: Eigenvectors of the correlation matrix -:type eigenvectors: np.ndarray -:param num_factors: Number of factors for the correlation matrix -:type num_factors: int -:return: Denoised correlation matrix -:rtype: np.ndarray - -#### `function denoised_corr2` - -```python -def denoised_corr2eigenvalues: np.ndarray, eigenvectors: np.ndarray, num_factors: int, alpha: float=0: -``` - -> Remove noise from the correlation matrix through targeted shrinkage. - -:param eigenvalues: Eigenvalues of the correlation matrix -:type eigenvalues: np.ndarray -:param eigenvectors: Eigenvectors of the correlation matrix -:type eigenvectors: np.ndarray -:param num_factors: Number of factors for the correlation matrix -:type num_factors: int -:param alpha: Shrinkage parameter -:type alpha: float -:return: Denoised correlation matrix -:rtype: np.ndarray - -#### `function form_block_matrix` - -```python -def form_block_matrixn_blocks: int, block_size: int, block_correlation: float: -``` - -> Forms a block diagonal correlation matrix. 
- -:param n_blocks: Number of blocks -:type n_blocks: int -:param block_size: Size of each block -:type block_size: int -:param block_correlation: Correlation within each block -:type block_correlation: float -:return: Block diagonal correlation matrix -:rtype: np.ndarray - -#### `function form_true_matrix` - -```python -def form_true_matrixn_blocks: int, block_size: int, block_correlation: float: -``` - -> Forms a shuffled block diagonal correlation matrix and the corresponding covariance matrix. - -:param n_blocks: Number of blocks -:type n_blocks: int -:param block_size: Size of each block -:type block_size: int -:param block_correlation: Correlation within each block -:type block_correlation: float -:return: Mean and covariance matrix -:rtype: Tuple[np.ndarray, np.ndarray] - -#### `function simulates_cov_mu` - -```python -def simulates_cov_mumu0: np.ndarray, cov0: np.ndarray, n_obs: int, shrink: bool=False: -``` - -> Simulates multivariate normal observations and computes the sample mean and covariance. - -:param mu0: True mean -:type mu0: np.ndarray -:param cov0: True covariance matrix -:type cov0: np.ndarray -:param n_obs: Number of observations -:type n_obs: int -:param shrink: Whether to use Ledoit-Wolf shrinkage -:type shrink: bool -:return: Sample mean and covariance matrix -:rtype: Tuple[np.ndarray, np.ndarray] - -#### `function corr_to_cov` - -```python -def corr_to_covcorr: np.ndarray, std: np.ndarray: -``` - -> Converts a correlation matrix to a covariance matrix. - -:param corr: Correlation matrix -:type corr: np.ndarray -:param std: Standard deviations -:type std: np.ndarray -:return: Covariance matrix -:rtype: np.ndarray - -#### `function denoise_cov` - -```python -def denoise_covcov0: np.ndarray, q: float, bandwidth: float: -``` - -> De-noises the covariance matrix. 
- -:param cov0: Covariance matrix -:type cov0: np.ndarray -:param q: Ratio of number of observations to number of variables -:type q: float -:param bandwidth: Bandwidth parameter -:type bandwidth: float -:return: De-noised covariance matrix -:rtype: np.ndarray - -#### `function optimal_portfolio` - -```python -def optimal_portfoliocov: np.ndarray, mu: np.ndarray=None: -``` - -> Computes the optimal portfolio weights. - -:param cov: Covariance matrix -:type cov: np.ndarray -:param mu: Expected returns -:type mu: np.ndarray -:return: Optimal portfolio weights -:rtype: np.ndarray - - -### πŸ“„ `RiskLabAI\data\differentiation\differentiation.py` - -#### `function calculate_weights` - -```python -def calculate_weightsdegree: float, size: int: -``` - -> Compute the weights for fractionally differentiated series. - -:param degree: Degree of the binomial series. -:param size: Length of the time series. -:return: Array of weights. - -Formula: - .. math:: - w(k) = -w(k-1) / k * (degree - k + 1) - -#### `function plot_weights` - -```python -def plot_weightsdegree_range: tuple[float, float], number_degrees: int, size: int: -``` - -> Plot the weights of fractionally differentiated series. - -:param degree_range: Tuple containing the minimum and maximum degree values. -:param number_degrees: Number of degrees to plot. -:param size: Length of the time series. - -#### `function fractional_difference` - -```python -def fractional_differenceseries: pd.DataFrame, degree: float, threshold: float=0.01: -``` - -> Compute the standard fractionally differentiated series. - -:param series: Dataframe of dates and prices. -:param degree: Degree of the binomial series. -:param threshold: Threshold for weight-loss. -:return: Dataframe of fractionally differentiated series. - -Methodology reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons, p. 82. 
- -#### `function calculate_weights_ffd` - -```python -def calculate_weights_ffddegree: float, threshold: float: -``` - -> Compute the weights for fixed-width window fractionally differentiated method. - -:param degree: Degree of the binomial series. -:param threshold: Threshold for weight-loss. -:return: Array of weights. - -Methodology reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons, p. 83. - -#### `function fractional_difference_fixed` - -```python -def fractional_difference_fixedseries: pd.DataFrame, degree: float, threshold: float=1e-05: -``` - -> Compute the fixed-width window fractionally differentiated series. - -:param series: Dataframe of dates and prices. -:param degree: Degree of the binomial series. -:param threshold: Threshold for weight-loss. -:return: Dataframe of fractionally differentiated series. - -Methodology reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons, p. 83. - -#### `function fractional_difference_fixed_single` - -```python -def fractional_difference_fixed_singleseries: pd.Series, degree: float, threshold: float=1e-05: -``` - -> Compute the fixed-width window fractionally differentiated series. - -:param series: Series of dates and prices. -:param degree: Degree of the binomial series. -:param threshold: Threshold for weight-loss. -:return: Fractionally differentiated series. - -Methodology reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons, p. 83. - -#### `function minimum_ffd` - -```python -def minimum_ffdinput_series: pd.DataFrame: -``` - -> Find the minimum degree value that passes the ADF test. - -:param input_series: Dataframe of input data. -:return: Dataframe of ADF test results. - -Methodology reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons, p. 85. 
- -#### `function get_weights` - -```python -def get_weightsdegree: float, length: int: -``` - -> Calculate the weights for the fractional differentiation method. - -:param degree: Degree of binomial series. -:param length: Length of the series. -:return: Array of calculated weights. - -Related mathematical formula: -.. math:: - w_i = -w_{i-1}/i*(degree - i + 1) - -#### `function fractional_difference` - -```python -def fractional_differenceseries: pd.DataFrame, degree: float, threshold: float=0.01: -``` - -> Calculate the fractionally differentiated series using the fixed-width window method. - -:param series: DataFrame of dates and prices. -:param degree: Degree of binomial series. -:param threshold: Threshold for weight-loss. -:return: DataFrame of fractionally differentiated series. - -Related mathematical formula: -.. math:: - F_t^{(d)} = \sum_{i=0}^{t} w_i F_{t-i} - -#### `function minimum_adf_degree` - -```python -def minimum_adf_degreeinput_series: pd.DataFrame: -``` - -> Find the minimum degree value that passes the ADF test. - -:param input_series: DataFrame of input series. -:return: DataFrame of output results with ADF statistics. - -Related mathematical formula: -.. math:: - F_t^{(d)} = \sum_{i=0}^{t} w_i F_{t-i} - -#### `function fractionally_differentiated_log_price` - -```python -def fractionally_differentiated_log_priceinput_series: pd.Series, threshold=0.01, step=0.1, base_p_value=0.05: -``` - -> Calculate the fractionally differentiated log price with the minimum degree differentiation -that passes the Augmented Dickey-Fuller (ADF) test. - -:param input_series: Time series of input data. -:param threshold: The threshold for fractionally differentiating the log price. -:param step: The increment step for adjusting the differentiation degree. -:param base_p_value: The significance level for the ADF test. -:return: Fractionally differentiated log price series. - -Methodology reference: - De Prado, M. (2018) Advances in financial machine learning. 
John Wiley & Sons, p. 85. - - -### πŸ“„ `RiskLabAI\data\distance\distance_metric.py` - -#### `function calculate_variation_of_information` - -```python -def calculate_variation_of_informationx: np.ndarray, y: np.ndarray, bins: int, norm: bool=False: -``` - -> Calculates Variation of Information. - -:param x: First data array. -:param y: Second data array. -:param bins: Number of bins for the histogram. -:param norm: If True, the result will be normalized. - -:return: Variation of Information. - -Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 3.2, Page 44 - -#### `function calculate_number_of_bins` - -```python -def calculate_number_of_binsnum_observations: int, correlation: float=None: -``` - -> Calculates the optimal number of bins for discretization. - -:param num_observations: Number of observations. -:param correlation: Correlation value. If None, the function will use the univariate case. - -:return: Optimal number of bins. - -Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 3.3, Page 46 - -#### `function calculate_variation_of_information_extended` - -```python -def calculate_variation_of_information_extendedx: np.ndarray, y: np.ndarray, norm: bool=False: -``` - -> Calculates Variation of Information with calculating number of bins. - -:param x: First data array. -:param y: Second data array. -:param norm: If True, the result will be normalized. - -:return: Variation of Information. - -Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 3.3, Page 46 - -#### `function calculate_mutual_information` - -```python -def calculate_mutual_informationx: np.ndarray, y: np.ndarray, norm: bool=False: -``` - -> Calculates Mutual Information with calculating number of bins. - -:param x: First data array. -:param y: Second data array. -:param norm: If True, the result will be normalized. 
- -:return: Mutual Information. - -Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 3.4, Page 48 - -#### `function calculate_distance` - -```python -def calculate_distancedependence: np.ndarray, metric: str='angular': -``` - -> Calculates distance from a dependence matrix. - -:param dependence: Dependence matrix. -:param metric: Metric used to calculate distance. Available options are "angular" and "absolute_angular". - -:return: Distance matrix. - -Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - -#### `function calculate_kullback_leibler_divergence` - -```python -def calculate_kullback_leibler_divergencep: np.ndarray, q: np.ndarray: -``` - -> Calculates Kullback-Leibler divergence from two discrete probability distributions defined on the same probability space. - -:param p: First distribution. -:param q: Second distribution. - -:return: Kullback-Leibler divergence. - -Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - -#### `function calculate_cross_entropy` - -```python -def calculate_cross_entropyp: np.ndarray, q: np.ndarray: -``` - -> Calculates cross-entropy from two discrete probability distributions defined on the same probability space. - -:param p: First distribution. -:param q: Second distribution. - -:return: Cross-entropy. - -Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - - -### πŸ“„ `RiskLabAI\data\labeling\financial_labels.py` - -#### `function calculate_t_value_linear_regression` - -```python -def calculate_t_value_linear_regressionprice: pd.Series: -``` - -> Calculate the t-value of a linear trend. - -This function computes the t-value of a linear trend in a time series of prices. -The t-value is calculated as the ratio of the slope to the standard error of the regression. - -:param price: Time series of prices as a Pandas Series. 
-:return: Calculated t-value as a float. - -#### `function find_trend_using_trend_scanning` - -```python -def find_trend_using_trend_scanningmolecule: pd.Index, close: pd.Series, span: Tuple[int, int]: -``` - -> Implement the trend scanning method to find trends. - -This function identifies trends in a time series of prices using the trend scanning method. -It calculates the t-value for linear regression of price over a range of spans and -identifies the span with the maximum absolute t-value as the trend. The sign of the t-value -indicates the direction of the trend. - -:param molecule: Index of observations to label as a Pandas Index. -:param close: Time series of prices as a Pandas Series. -:param span: Range of span lengths to evaluate for the maximum absolute t-value as a tuple. -:return: DataFrame containing trend information with columns ['End Time', 't-Value', 'Trend']. - - -### πŸ“„ `RiskLabAI\data\labeling\labeling.py` - -#### `function cusum_filter_events_dynamic_threshold` - -```python -def cusum_filter_events_dynamic_thresholdprices: pd.Series, threshold: pd.Series: -``` - -> Detect events using the Symmetric Cumulative Sum (CUSUM) filter. - -The Symmetric CUSUM filter is a change-point detection algorithm used to identify events where the price difference -exceeds a predefined threshold. - -:param prices: A pandas Series of prices. -:param threshold: A pandas Series containing the predefined threshold values for event detection. -:return: A pandas DatetimeIndex containing timestamps of detected events. - -References: -- De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. (Methodology: 39) - -#### `function symmetric_cusum_filter` - -```python -def symmetric_cusum_filterprices: pd.Series, threshold: float: -``` - -> Implements the symmetric CUSUM filter. - -The symmetric CUSUM filter is a change-point detection algorithm used to identify events where the price difference exceeds a predefined threshold. 
- -:param prices: A pandas Series of prices. -:param threshold: The predefined threshold for detecting events. -:return: A pandas DatetimeIndex of event timestamps. - -References: -- De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. (Methodology: 39) - -#### `function aggregate_ohlcv` - -```python -def aggregate_ohlcvtick_data_grouped: -``` - -> Aggregates tick data into OHLCV bars. - -:param tick_data_grouped: A pandas GroupBy object of tick data. -:return: A pandas DataFrame with OHLCV bars. - -#### `function generate_time_bars` - -```python -def generate_time_barstick_data: pd.DataFrame, frequency: str='5Min': -``` - -> Generates time bars from tick data. - -:param tick_data: A pandas DataFrame of tick data. -:param frequency: The frequency for time bar aggregation. -:return: A pandas DataFrame with time bars. - -#### `function compute_daily_volatility` - -```python -def compute_daily_volatilityclose: pd.Series, span: int=63: -``` - -> Computes the daily volatility at intraday estimation points. - -:param close: A pandas Series of close prices. -:param span: The span parameter for the EWMA. -:return: A pandas DataFrame with returns and volatilities. - -References: -- De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. (Methodology: Page 44) - -#### `function daily_volatility_with_log_returns` - -```python -def daily_volatility_with_log_returnsclose: pd.Series, span: int=100: -``` - -> Calculate the daily volatility at intraday estimation points using Exponentially Weighted Moving Average (EWMA). - -:param close: A pandas Series of daily close prices. -:param span: The span parameter for the Exponentially Weighted Moving Average (EWMA). -:return: A pandas Series containing daily volatilities. - -References: -- De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. 
(Methodology: Page 44) - -#### `function triple_barrier` - -```python -def triple_barrierclose: pd.Series, events: pd.DataFrame, profit_taking_stop_loss: list[float, float], molecule: list: -``` -#### `function get_barrier_touch_time` - -```python -def get_barrier_touch_timeclose: pd.Series, time_events: pd.DatetimeIndex, ptsl: float, target: pd.Series, return_min: float, num_threads: int, timestamp: pd.Series=False: -``` - -> Finds the time of the first barrier touch. - -:param close: A dataframe of dates and close prices. -:param time_events: A pandas time index containing the timestamps that will seed every triple barrier. -:param ptsl: A non-negative float that sets the width of the two barriers. -:param target: A pandas series of targets, expressed in terms of absolute returns. -:param return_min: The minimum target return required for running a triple barrier search. -:param num_threads: The number of threads. -:param timestamp: A pandas series with the timestamps of the vertical barriers (False when disabled). -:return: A dataframe with timestamp of the vertical barrier and unit width of the horizontal barriers. - -#### `function vertical_barrier` - -```python -def vertical_barrierclose: pd.Series, time_events: pd.DatetimeIndex, number_days: int: -``` - -> Shows one way to define a vertical barrier. - -:param close: A dataframe of prices and dates. -:param time_events: A vector of timestamps. -:param number_days: A number of days for the vertical barrier. -:return: A pandas series with the timestamps of the vertical barriers. - -#### `function get_labels` - -```python -def get_labelsevents: pd.DataFrame, close: pd.Series: -``` - -> Label the observations. - -:param events: A dataframe with timestamp of the vertical barrier and unit width of the horizontal barriers. -:param close: A dataframe of dates and close prices. -:return: A dataframe with the return realized at the time of the first touched barrier and the label. 
- -#### `function meta_events` - -```python -def meta_eventsclose: pd.Series, time_events: pd.DatetimeIndex, ptsl: List[float], target: pd.Series, return_min: float, num_threads: int, timestamp: pd.Series=False, side: pd.Series=None: -``` -#### `function meta_labeling` - -```python -def meta_labelingevents: pd.DataFrame, close: pd.Series: -``` - -> Expands label to incorporate meta-labeling. - -:param events: DataFrame with timestamp of vertical barrier and unit width of the horizontal barriers. -:param close: Series of close prices with date indices. -:return: DataFrame containing the return and binary labels for each event. - -Reference: -De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. -Methodology: 51 - -#### `function drop_label` - -```python -def drop_labelevents: pd.DataFrame, percent_min: float=0.05: -``` - -> Presents a procedure that recursively drops observations associated with extremely rare labels. - -:param events: DataFrame with columns: Dates, ret, and bin. -:param percent_min: Minimum percentage. -:return: DataFrame with the updated events. - -Reference: -De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. -Methodology: 54 - -#### `function lin_parts` - -```python -def lin_partsnum_atoms: int, num_threads: int: -``` - -> Partition of atoms with a single loop. - -:param num_atoms: Total number of atoms. -:param num_threads: Number of threads for parallel processing. -:return: Numpy array with partition indices. - -#### `function nested_parts` - -```python -def nested_partsnum_atoms: int, num_threads: int, upper_triang: bool=False: -``` - -> Partition of atoms with an inner loop. - -:param num_atoms: Total number of atoms. -:param num_threads: Number of threads for parallel processing. -:param upper_triang: Whether the first rows are the heaviest. -:return: Numpy array with partition indices. 
- -#### `function mp_pandas_obj` - -```python -def mp_pandas_objfunc, pd_obj, num_threads: int=24, mp_batches: int=1, lin_mols: bool=True, **kargs: -``` - -> Parallelize jobs, return a DataFrame or Series. - -:param func: Function to be parallelized. -:param pd_obj: Tuple with argument name for the molecule and list of atoms grouped into molecules. -:param num_threads: Number of threads for parallel processing. -:param mp_batches: Number of multi-processing batches. -:param lin_mols: Whether to use linear molecule partitioning. -:param kargs: Any other arguments needed by func. -:return: DataFrame with the results of the parallelized function. - -Example: -df1 = mp_pandas_obj(func, ('molecule', df0.index), 24, **kargs) - -#### `function process_jobs_` - -```python -def process_jobs_jobs: list: -``` - -> Run jobs sequentially, for debugging. - -:param jobs: List of jobs to be processed. -:return: List of job results. - -#### `function report_progress` - -```python -def report_progressjob_num: int, num_jobs: int, time0: float, task: str: -``` - -> Report progress as asynchronous jobs are completed. - -:param job_num: Current job number. -:param num_jobs: Total number of jobs. -:param time0: Start time. -:param task: Task name. -:return: None - -#### `function process_jobs` - -```python -def process_jobsjobs: list, task: str=None, num_threads: int=24: -``` - -> Run jobs in parallel in multiple threads. - -:param jobs: List of jobs to be processed. -:param task: Task name for progress reporting. -:param num_threads: Number of threads for parallel processing. -:return: List of job results. - -#### `function expand_call` - -```python -def expand_callkargs: -``` - -> Expand the arguments of a callback function, kargs['func']. - -:param kargs: Dictionary with the function to call and the arguments to pass. -:return: Result of the function call. - - -### πŸ“„ `RiskLabAI\data\structures\abstract_bars.py` - -> A base class for the various bar types. 
Includes the logic shared between classes, to minimise the amount of -duplicated code. - -#### `class AbstractBars` - -> Abstract class that contains the base properties which are shared between the subtypes. -This class subtypes are as follows: - 1- AbstractImbalanceBars - 2- AbstractRunBars - 3- StandardBars - 4- TimeBars - -##### `method __init__` - -```python -def __init__self, bar_type: str: -``` - -> AbstractBars constructor function -:param bar_type: type of bar. e.g. time_bars, expected_dollar_imbalance_bars, fixed_tick_run_bars, volume_standard_bars etc. - -##### `method construct_bars_from_data` - -```python -def construct_bars_from_dataself, data: Union[list, tuple, np.ndarray]: -``` - -> This function are implemented by all concrete or abstract subtypes. The function is used to construct bars from -input ticks data. -:param data: tabular data that contains date_time, price, and volume columns -:return: constructed bars - -##### `method update_base_fields` - -```python -def update_base_fieldsself, price: float, tick_rule: int, volume: float: -``` - -> Update the base fields (that all bars have them.) with price, tick rule and volume of current tick -:param price: price of current tick -:param tick_rule: tick rule of current tick computed before -:param volume: volume of current tick -:return: - -##### `method _bar_construction_condition` - -```python -def _bar_construction_conditionself, threshold: -``` - -> Compute the condition of whether next bar should sample with current and previous tick datas or not. -:return: whether next bar should form with current and previous tick datas or not. - -##### `method _reset_cached_fields` - -```python -def _reset_cached_fieldsself: -``` - -> This function are used (directly or override) by all concrete or abstract subtypes. The function is used to reset cached fields in bars construction process when next bar is sampled. 
-:return: - -##### `method _tick_rule` - -```python -def _tick_ruleself, price: float=0: -``` - -> Compute the tick rule term as explained on page 29 of Advances in Financial Machine Learning -:param price: price of current tick -:return: tick rule - -##### `method _high_and_low_price_update` - -```python -def _high_and_low_price_updateself, price: float: -``` - -> Update the high and low prices using the current tick price. -:param price: price of current tick -:return: updated high and low prices - -##### `method _construct_next_bar` - -```python -def _construct_next_barself, date_time: str, tick_index: int, price: float, high_price: float, low_price: float, threshold: float: -``` - -> sample next bar, given ticks data. the bar's fields are as follows: - 1- date_time - 2- open - 3- high - 4- low - 5- close - 6- cumulative_volume: total cumulative volume of to be constructed bar ticks - 7- cumulative_buy_volume: total cumulative buy volume of to be constructed bar ticks - 8- cumulative_ticks total cumulative ticks number of to be constructed bar ticks - 9- cumulative_dollar_value total cumulative dollar value (price * volume) of to be constructed bar ticks - -the bar will have appended to the total list of sampled bars. - -:param date_time: timestamp of the to be constructed bar -:param tick_index: -:param price: price of last tick of to be constructed bar (used as close price) -:param high_price: highest price of ticks in the period of bar sampling process -:param low_price: lowest price of ticks in the period of bar sampling process -:return: sampled bar - - -### πŸ“„ `RiskLabAI\data\structures\abstract_imbalance_bars.py` - -#### `class AbstractImbalanceBars` - -> Abstract class that contains the imbalance properties which are shared between the subtypes. 
-This class subtypes are as follows: - 1- ExpectedImbalanceBars - 2- FixedImbalanceBars - -The class implements imbalance bars sampling logic as explained on page 29,30 of Advances in Financial Machine Learning. - -##### `method __init__` - -```python -def __init__self, bar_type: str, window_size_for_expected_n_ticks_estimation: int, window_size_for_expected_imbalance_estimation: int, initial_estimate_of_expected_n_ticks_in_bar: int, analyse_thresholds: bool: -``` - -> AbstractImbalanceBars constructor function -:param bar_type: type of bar. e.g. expected_dollar_imbalance_bars, fixed_tick_imbalance_bars etc. -:param window_size_for_expected_n_ticks_estimation: window size used to estimate number of ticks expectation -:param initial_estimate_of_expected_n_ticks_in_bar: initial estimate of number of ticks expectation window size -:param window_size_for_expected_imbalance_estimation: window size used to estimate imbalance expectation -:param analyse_thresholds: whether return thresholds values (ΞΈ, number of ticks expectation, imbalance expectation) in a tabular format - -##### `method construct_bars_from_data` - -```python -def construct_bars_from_dataself, data: Union[list, tuple, np.ndarray]: -``` - -> The function is used to construct bars from input ticks data. -:param data: tabular data that contains date_time, price, and volume columns -:return: constructed bars - -##### `method _bar_construction_condition` - -```python -def _bar_construction_conditionself, threshold: -``` - -> Compute the condition of whether next bar should sample with current and previous tick datas or not. -:return: whether next bar should form with current and previous tick datas or not. - -##### `method _reset_cached_fields` - -```python -def _reset_cached_fieldsself: -``` - -> This function are used (directly or override) by all concrete or abstract subtypes. The function is used to reset cached fields in bars construction process when next bar is sampled. 
-:return: - -##### `method _expected_number_of_ticks` - -```python -def _expected_number_of_ticksself: -``` - -> Calculate number of ticks expectation when new imbalance bar is sampled. - -:return: number of ticks expectation. - - -### πŸ“„ `RiskLabAI\data\structures\abstract_information_driven_bars.py` - -> A base class for the various bar types. Includes the logic shared between classes, to minimise the amount of -duplicated code. - -#### `class AbstractInformationDrivenBars` - -> Abstract class that contains the information driven properties which are shared between the subtypes. -This class subtypes are as follows: - 1- AbstractImbalanceBars - 2- AbstractRunBars - -The class implements imbalance bars sampling logic as explained on page 29,30,31,32 of Advances in Financial Machine Learning. - -##### `method __init__` - -```python -def __init__self, bar_type: str, window_size_for_expected_n_ticks_estimation: int, initial_estimate_of_expected_n_ticks_in_bar: int, window_size_for_expected_imbalance_estimation: int: -``` - -> AbstractInformationDrivenBars constructor function -:param bar_type: type of bar. e.g. expected_dollar_imbalance_bars, fixed_tick_run_bars etc. -:param window_size_for_expected_n_ticks_estimation: window size used to estimate number of ticks expectation -:param initial_estimate_of_expected_n_ticks_in_bar: initial estimate of number of ticks expectation window size -:param window_size_for_expected_imbalance_estimation: window size used to estimate imbalance expectation - -##### `method _ewma_expected_imbalance` - -```python -def _ewma_expected_imbalanceself, array: list, window: int, warm_up: bool=False: -``` - -> Calculates expected imbalance (2P[b_t=1]-1) using EWMA as defined on page 29 of Advances in Financial Machine Learning. 
-:param array: imbalances list -:param window: EWMA window for expectation calculation -:param warm_up: whether warm up period passed or not -:return: expected_imbalance: 2P[b_t=1]-1 which approximated using EWMA expectation - -##### `method _imbalance_at_tick` - -```python -def _imbalance_at_tickself, price: float, signed_tick: int, volume: float: -``` - -> Calculate the imbalance at tick t (current tick) (ΞΈ_t) using tick data as defined on page 29 of Advances in Financial Machine Learning -:param price: price of tick -:param signed_tick: tick rule of current tick computed before -:param volume: volume of current tick -:return: imbalance: imbalance of current tick - -##### `method _expected_number_of_ticks` - -```python -def _expected_number_of_ticksself: -``` - -> Calculate number of ticks expectation when new imbalance bar is sampled. - - -### πŸ“„ `RiskLabAI\data\structures\abstract_run_bars.py` - -#### `class AbstractRunBars` - -> Abstract class that contains the run properties which are shared between the subtypes. -This class subtypes are as follows: - 1- ExpectedRunBars - 2- FixedRunBars - -The class implements run bars sampling logic as explained on page 31,32 of Advances in Financial Machine Learning. - -##### `method __init__` - -```python -def __init__self, bar_type: str, window_size_for_expected_n_ticks_estimation: int, window_size_for_expected_imbalance_estimation: int, initial_estimate_of_expected_n_ticks_in_bar: int, analyse_thresholds: bool: -``` - -> AbstractRunBars constructor function -:param bar_type: type of bar. e.g. expected_dollar_run_bars, fixed_tick_run_bars etc. 
-:param window_size_for_expected_n_ticks_estimation: window size used to estimate number of ticks expectation -:param initial_estimate_of_expected_n_ticks_in_bar: initial estimate of number of ticks expectation window size -:param window_size_for_expected_imbalance_estimation: window size used to estimate imbalance expectation -:param analyse_thresholds: whether return thresholds values (ΞΈ, number of ticks expectation, imbalance expectation) in a tabular format - -##### `method construct_bars_from_data` - -```python -def construct_bars_from_dataself, data: Union[list, tuple, np.ndarray]: -``` - -> The function is used to construct bars from input ticks data. -:param data: tabular data that contains date_time, price, and volume columns -:return: constructed bars - -##### `method _bar_construction_condition` - -```python -def _bar_construction_conditionself, threshold: -``` - -> Compute the condition of whether next bar should sample with current and previous tick datas or not. -:return: whether next bar should form with current and previous tick datas or not. - -##### `method _reset_cached_fields` - -```python -def _reset_cached_fieldsself: -``` - -> This function are used (directly or override) by all concrete or abstract subtypes. The function is used to reset cached fields in bars construction process when next bar is sampled. -:return: - -##### `method _expected_number_of_ticks` - -```python -def _expected_number_of_ticksself: -``` - -> Calculate number of ticks expectation when new imbalance bar is sampled. - - -### πŸ“„ `RiskLabAI\data\structures\data_structures_lopez.py` - -#### `function progress_bar` - -```python -def progress_barvalue: int, end_value: int, start_time: float, bar_length: int=20: -``` - -> Display a progress bar in the console. - -:param value: Current progress value. -:param end_value: The end value indicating 100% progress. -:param start_time: Time when the event started. -:param bar_length: The length of the progress bar in characters. 
Default is 20. - -#### `function ewma` - -```python -def ewmainput_array: np.ndarray, window_length: int: -``` - -> Computes the Exponentially Weighted Moving Average (EWMA). - -:param input_array: The input time series array. -:param window_length: Window length for the EWMA. -:return: The EWMA values. - -#### `function compute_grouping` - -```python -def compute_groupingtarget_col: pd.Series, initial_expected_ticks: int, bar_size: float: -``` - -> Group a DataFrame based on a feature and calculates thresholds. - -:param target_col: Target column of tick dataframe. -:param initial_expected_ticks: Initial expected ticks. -:param bar_size: Initial expected size in each tick. -:return: Arrays of times_delta, thetas_absolute, thresholds, times, thetas, grouping_id. - -#### `function generate_information_driven_bars` - -```python -def generate_information_driven_barstick_data: pd.DataFrame, bar_type: str='volume', tick_expected_initial: int=2000: -``` - -> Implements Information-Driven Bars as per the methodology described in -"Advances in financial machine learning" by De Prado (2018). - -:param tick_data: DataFrame of tick data. -:param bar_type: Type of the bars, options: "tick", "volume", "dollar". -:param tick_expected_initial: Initial expected ticks value. -:return: A tuple containing the OHLCV DataFrame, thetas absolute array, - and thresholds array. - -#### `function ohlcv` - -```python -def ohlcvtick_data_grouped: pd.core.groupby.generic.DataFrameGroupBy: -``` - -> Computes various statistics for the grouped tick data. - -Takes a grouped dataframe, combines the data, and creates a new one with -information about prices, volume, and other statistics. This is typically -used in the context of financial tick data to generate OHLCV data -(Open, High, Low, Close, Volume). - -:param tick_data_grouped: Grouped DataFrame containing tick data. -:return: A DataFrame containing OHLCV data and other derived statistics. 
- -#### `function generate_time_bar` - -```python -def generate_time_bartick_data: pd.DataFrame, frequency: str='5Min': -``` - -> Generates time bars for tick data. - -This function groups tick data by a specified time frequency and then -computes OHLCV (Open, High, Low, Close, Volume) statistics. - -:param tick_data: DataFrame containing tick data. -:param frequency: Time frequency for rounding datetime. -:return: A DataFrame containing OHLCV data grouped by time. - -#### `function generate_tick_bar` - -```python -def generate_tick_bartick_data: pd.DataFrame, ticks_per_bar: int=10, number_bars: int=None: -``` - -> Generates tick bars for tick data. - -This function groups tick data by a specified number of ticks and then -computes OHLCV statistics. - -:param tick_data: DataFrame containing tick data. -:param ticks_per_bar: Number of ticks in each bar. -:param number_bars: Number of bars to generate. -:return: A DataFrame containing OHLCV data grouped by tick count. - -#### `function generate_volume_bar` - -```python -def generate_volume_bartick_data: pd.DataFrame, volume_per_bar: int=10000, number_bars: int=None: -``` - -> Generates volume bars for tick data. - -This function groups tick data by a specified volume size and then computes OHLCV statistics. - -:param tick_data: DataFrame containing tick data. -:param volume_per_bar: Volume size for each bar. -:param number_bars: Number of bars to generate. - -:return: A DataFrame containing OHLCV data grouped by volume. - -#### `function generate_dollar_bar` - -```python -def generate_dollar_bartick_data: pd.DataFrame, dollar_per_bar: float=100000, number_bars: int=None: -``` - -> Generates dollar bars for tick data. - -This function groups tick data by a specified dollar amount and then computes OHLCV statistics. - -:param tick_data: DataFrame containing tick data. -:param dollar_per_bar: Dollar amount for each bar. -:param number_bars: Number of bars to generate. 
- -:return: A DataFrame containing OHLCV data grouped by dollar amount. - -#### `function calculate_pca_weights` - -```python -def calculate_pca_weightscovariance_matrix: np.ndarray, risk_distribution: np.ndarray=None, risk_target: float=1.0: -``` - -> Calculates hedging weights using the covariance matrix, risk distribution, and risk target. - -:param covariance_matrix: Covariance matrix. -:param risk_distribution: Risk distribution vector. -:param risk_target: Risk target value. - -:return: Weights. - -#### `function events` - -```python -def eventsinput_data: pd.DataFrame, threshold: float: -``` - -> Implementation of the symmetric CUSUM filter. - -This function computes time events when certain price change thresholds are met. - -:param input_data: DataFrame of prices and dates. -:param threshold: Threshold for price change. - -:return: DatetimeIndex containing events. - - -### πŸ“„ `RiskLabAI\data\structures\filtering_lopez.py` - -#### `function symmetric_cusum_filter` - -```python -def symmetric_cusum_filterinput_data: pd.DataFrame, threshold: float: -``` - -> Implementation of the symmetric CUSUM filter. - -This method is used to detect changes in a time series data. - -:param input_data: DataFrame containing price data. -:param threshold: Threshold value for the CUSUM filter. - -:return: Datetime index of events based on the symmetric CUSUM filter. - -.. note:: - Reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. - Methodology 39. - -.. 
math:: - S_t^+ = max(0, S_{t-1}^+ + \Delta p_t) - S_t^- = min(0, S_{t-1}^- + \Delta p_t) - - where: - - :math:`S_t^+` is the positive CUSUM at time :math:`t` - - :math:`S_t^-` is the negative CUSUM at time :math:`t` - - :math:`\Delta p_t` is the price change at time :math:`t` - - -### πŸ“„ `RiskLabAI\data\structures\hedging.py` - -#### `function pca_weights` - -```python -def pca_weightscov: np.ndarray, risk_distribution: Optional[np.ndarray]=None, risk_target: float=1.0: -``` - -> Calculates hedging weights using covariance, risk distribution, and risk target. - -The function uses Principal Component Analysis (PCA) to determine the weights. -If the risk distribution is not provided, all risk is allocated to the principal -component with the smallest eigenvalue. - -:param cov: Covariance matrix. -:param risk_distribution: Risk distribution, defaults to None. -:param risk_target: Risk target, defaults to 1.0. - -:return: Weights calculated based on PCA. - -.. note:: - Reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. - Methodology 36. - -.. math:: - w = EV . \sqrt{\frac{\rho T}{\lambda}} - - where: - - :math:`w` are the weights. - - :math:`EV` is the matrix of eigenvectors. - - :math:`\rho` is the risk distribution. - - :math:`T` is the risk target. - - :math:`\lambda` is the eigenvalues. - - -### πŸ“„ `RiskLabAI\data\structures\imbalance_bars.py` - -#### `class ExpectedImbalanceBars` - -> Concrete class that contains the properties which are shared between all various type of ewma imbalance bars (dollar, volume, tick). - -##### `method __init__` - -```python -def __init__self, bar_type: str, window_size_for_expected_n_ticks_estimation: int, initial_estimate_of_expected_n_ticks_in_bar: int, window_size_for_expected_imbalance_estimation: int, expected_ticks_number_bounds: Tuple[float, float], analyse_thresholds: bool: -``` - -> ExpectedImbalanceBars constructor function -:param bar_type: type of bar. e.g. 
expected_dollar_imbalance_bars, fixed_tick_imbalance_bars etc. -:param window_size_for_expected_n_ticks_estimation: window size used to estimate number of ticks expectation -:param initial_estimate_of_expected_n_ticks_in_bar: initial estimate of number of ticks expectation window size -:param window_size_for_expected_imbalance_estimation: window size used to estimate imbalance expectation -:param expected_ticks_number_bounds: lower and upper bound of possible number of expected ticks that are used to force bars sampling convergence. -:param analyse_thresholds: whether return thresholds values (θ, number of ticks expectation, imbalance expectation) in a tabular format - -##### `method _expected_number_of_ticks` - -```python -def _expected_number_of_ticksself: -``` - -> Calculate number of ticks expectation when new imbalance bar is sampled. - -:return: number of ticks expectation. - -#### `class FixedImbalanceBars` - -> Concrete class that contains the properties which are shared between all various type of const imbalance bars (dollar, volume, tick). - -##### `method __init__` - -```python -def __init__self, bar_type: str, window_size_for_expected_n_ticks_estimation: int, initial_estimate_of_expected_n_ticks_in_bar: int, window_size_for_expected_imbalance_estimation: int, analyse_thresholds: bool: -``` - -> FixedImbalanceBars constructor function -:param bar_type: type of bar. e.g. expected_dollar_imbalance_bars, fixed_tick_imbalance_bars etc.
-:param window_size_for_expected_n_ticks_estimation: window size used to estimate number of ticks expectation -:param initial_estimate_of_expected_n_ticks_in_bar: initial estimate of number of ticks expectation window size -:param window_size_for_expected_imbalance_estimation: window size used to estimate imbalance expectation -:param analyse_thresholds: whether return thresholds values (θ, number of ticks expectation, imbalance expectation) in a tabular format - -##### `method _expected_number_of_ticks` - -```python -def _expected_number_of_ticksself: -``` - -> Calculate number of ticks expectation when new imbalance bar is sampled. - -:return: number of ticks expectation. - - -### 📄 `RiskLabAI\data\structures\infomation_driven_bars.py` - -#### `function generate_information_driven_bars` - -```python -def generate_information_driven_barstick_data: pd.DataFrame, bar_type: str='volume', initial_expected_ticks: int=2000: -``` - -> Implements Information-Driven Bars. - -This function computes the Information-Driven Bars based on tick data and the chosen bar type. - -:param tick_data: DataFrame of tick data. -:type tick_data: pd.DataFrame -:param bar_type: Type of the bar. Can be "tick", "volume", or "dollar". -:type bar_type: str, default "volume" -:param initial_expected_ticks: The initial value of expected ticks. -:type initial_expected_ticks: int, default 2000 - -:return: Tuple containing the OHLCV DataFrame, absolute thetas, and thresholds. -:rtype: Tuple[pd.DataFrame, np.ndarray, np.ndarray] - -.. note:: - Reference: - De Prado, M. (2018) Advances in Financial Machine Learning. John Wiley & Sons. - -.. math:: - E_b = |\bar{x}| - - where: - - :math:`E_b` is the expected value of the bars. - - :math:`\bar{x}` is the mean of the input data. - -The compute_thresholds function is called to compute times_delta, thetas_absolute, thresholds, -times, thetas, and grouping_id.
- - -### πŸ“„ `RiskLabAI\data\structures\run_bars.py` - -#### `class ExpectedRunBars` - -> Concrete class that contains the properties which are shared between all various type of ewma run bars (dollar, volume, tick). - -##### `method __init__` - -```python -def __init__self, bar_type: str, window_size_for_expected_n_ticks_estimation: int, initial_estimate_of_expected_n_ticks_in_bar: int, window_size_for_expected_imbalance_estimation: int, expected_ticks_number_bounds: Tuple[float], analyse_thresholds: bool: -``` - -> ExpectedRunBars constructor function -:param bar_type: type of bar. e.g. expected_dollar_imbalance_bars, fixed_tick_imbalance_bars etc. -:param window_size_for_expected_n_ticks_estimation: window size used to estimate number of ticks expectation -:param initial_estimate_of_expected_n_ticks_in_bar: initial estimate of number of ticks expectation window size -:param window_size_for_expected_imbalance_estimation: window size used to estimate imbalance expectation -:param expected_ticks_number_bounds lower and upper bound of possible number of expected ticks that used to force bars sampling convergence. -:param analyse_thresholds: whether return thresholds values (ΞΈ, number of ticks expectation, imbalance expectation) in a tabular format - -##### `method _expected_number_of_ticks` - -```python -def _expected_number_of_ticksself: -``` - -> Calculate number of ticks expectation when new imbalance bar is sampled. - -:return: number of ticks expectation. - -#### `class FixedRunBars` - -> Concrete class that contains the properties which are shared between all various type of const run bars (dollar, volume, tick). - -##### `method __init__` - -```python -def __init__self, bar_type: str, window_size_for_expected_n_ticks_estimation: int, window_size_for_expected_imbalance_estimation: int, initial_estimate_of_expected_n_ticks_in_bar: int, analyse_thresholds: bool: -``` - -> Constructor. - -:param bar_type: (str) Type of run bar to create. Example: "dollar_run". 
-:param window_size_for_expected_n_ticks_estimation: (int) Window size for E[T]s (number of previous bars to use for expected number of ticks estimation). -:param window_size_for_expected_imbalance_estimation: (int) Expected window used to estimate expected run. -:param initial_estimate_of_expected_n_ticks_in_bar: (int) Initial number of expected ticks. -:param batch_size: (int) Number of rows to read in from the csv, per batch. -:param analyse_thresholds: (bool) Flag to save and return thresholds used to sample run bars. - -##### `method _expected_number_of_ticks` - -```python -def _expected_number_of_ticksself: -``` - -> Calculate number of ticks expectation when new imbalance bar is sampled. - -:return: number of ticks expectation. - - -### πŸ“„ `RiskLabAI\data\structures\standard_bars.py` - -#### `class StandardBars` - -> Concrete class that contains the properties which are shared between all various type of standard bars (dollar, volume, tick). - -##### `method __init__` - -```python -def __init__self, bar_type: str, threshold: float=50000: -``` - -> StandardBars constructor function -:param bar_type: type of bar. e.g. dollar_standard_bars, tick_standard_bars etc. -:param threshold: threshold that used to sampling process - -##### `method construct_bars_from_data` - -```python -def construct_bars_from_dataself, data: Union[list, tuple, np.ndarray]: -``` - -> The function is used to construct bars from input ticks data. -:param data: tabular data that contains date_time, price, and volume columns -:return: constructed bars - -##### `method _bar_construction_condition` - -```python -def _bar_construction_conditionself, threshold: -``` - -> Compute the condition of whether next bar should sample with current and previous tick datas or not. -:return: whether next bar should form with current and previous tick datas or not. 
- - -### πŸ“„ `RiskLabAI\data\structures\standard_bars_lopez.py` - -#### `function generate_dollar_bar_dataframe` - -```python -def generate_dollar_bar_dataframetick_data: pd.DataFrame, dollar_per_bar: int=100000, number_bars: Optional[int]=None: -``` - -> Generates a dollar bar dataframe. - -:param tick_data: DataFrame of tick data. -:type tick_data: pd.DataFrame -:param dollar_per_bar: Dollars in each bar, defaults to 100000. -:type dollar_per_bar: int, optional -:param number_bars: Number of bars, defaults to None. -:type number_bars: Optional[int], optional -:return: A dataframe containing OHLCV data and other relevant information based on dollar bars. -:rtype: pd.DataFrame - -#### `function generate_tick_bar_dataframe` - -```python -def generate_tick_bar_dataframetick_data: pd.DataFrame, tick_per_bar: int=10, number_bars: Optional[int]=None: -``` - -> Generates a tick bar dataframe. - -:param tick_data: DataFrame of tick data. -:type tick_data: pd.DataFrame -:param tick_per_bar: Number of ticks in each bar, defaults to 10. -:type tick_per_bar: int, optional -:param number_bars: Number of bars, defaults to None. -:type number_bars: Optional[int], optional -:return: A dataframe containing OHLCV data and other relevant information based on tick bars. -:rtype: pd.DataFrame - -#### `function generate_time_bar_dataframe` - -```python -def generate_time_bar_dataframetick_data: pd.DataFrame, frequency: str='5Min': -``` - -> Generates a time bar dataframe. - -:param tick_data: DataFrame of tick data. -:type tick_data: pd.DataFrame -:param frequency: Frequency for rounding date time, defaults to "5Min". -:type frequency: str, optional -:return: A dataframe containing OHLCV data and other relevant information based on time bars with the specified frequency. 
-:rtype: pd.DataFrame - -#### `function generate_volume_bar_dataframe` - -```python -def generate_volume_bar_dataframetick_data: pd.DataFrame, volume_per_bar: int=10000, number_bars: Optional[int]=None: -``` - -> Generates a volume bar dataframe. - -:param tick_data: DataFrame of tick data. -:type tick_data: pd.DataFrame -:param volume_per_bar: Volumes in each bar, defaults to 10000. -:type volume_per_bar: int, optional -:param number_bars: Number of bars, defaults to None. -:type number_bars: Optional[int], optional -:return: A dataframe containing OHLCV data and other relevant information based on volume bars. -:rtype: pd.DataFrame - - -### πŸ“„ `RiskLabAI\data\structures\time_bars.py` - -#### `class TimeBars` - -> Concrete class of TimeBars logic - -##### `method __init__` - -```python -def __init__self, resolution_type: str, resolution_units: int: -``` - -> TimeBars constructor function - -:param resolution_type: (str) Type of bar resolution: ['D', 'H', 'MIN', 'S']. -:param resolution_units: (int) Number of days, minutes, etc. - -##### `method construct_bars_from_data` - -```python -def construct_bars_from_dataself, data: Union[list, tuple, np.ndarray]: -``` - -> The function is used to construct bars from input ticks data. -:param data: tabular data that contains date_time, price, and volume columns -:return: constructed bars - -##### `method _bar_construction_condition` - -```python -def _bar_construction_conditionself, threshold: -``` - -> Compute the condition of whether next bar should sample with current and previous tick datas or not. -:return: whether next bar should form with current and previous tick datas or not. - - -### πŸ“„ `RiskLabAI\data\structures\utilities_lopez.py` - -#### `function compute_thresholds` - -```python -def compute_thresholdstarget_column: np.ndarray, initial_expected_ticks: int, initial_bar_size: float: -``` - -> Groups the target_column DataFrame based on a feature and calculates thresholds. 
- -This function groups the target_column DataFrame based on a feature -and calculates the thresholds, which can be used in financial machine learning -applications such as dynamic time warping. - -:param target_column: Target column of the DataFrame. -:type target_column: np.ndarray -:param initial_expected_ticks: Initial expected number of ticks. -:type initial_expected_ticks: int -:param initial_bar_size: Initial expected size of each tick. -:type initial_bar_size: float -:return: A tuple containing the time deltas, absolute theta values, thresholds, - times, theta values, and grouping IDs. -:rtype: Tuple[List[float], np.ndarray, np.ndarray, List[int], np.ndarray, np.ndarray] - -#### `function create_ohlcv_dataframe` - -```python -def create_ohlcv_dataframetick_data_grouped: pd.core.groupby.DataFrameGroupBy: -``` - -> Takes a grouped DataFrame and creates a new one with OHLCV data and other relevant information. - -:param tick_data_grouped: Grouped DataFrame based on some criteria (e.g., time). -:type tick_data_grouped: pd.core.groupby.DataFrameGroupBy -:return: A DataFrame containing OHLCV data and other relevant information. -:rtype: pd.DataFrame - - -### 📄 `RiskLabAI\data\synthetic_data\drift_burst_hypothesis.py` - -#### `function drift_volatility_burst` - -```python -def drift_volatility_burstbubble_length: int, a_before: float, a_after: float, b_before: float, b_after: float, alpha: float, beta: float, explosion_filter_width: float=0.1: -``` - -> Compute the drift and volatility for a burst scenario. - -The drift and volatility are calculated based on: -.. math:: - drift = \frac{a_{value}}{denominator^\alpha} - volatility = \frac{b_{value}}{denominator^\beta} - -where: -.. math:: - denominator = |step - 0.5| - -:param bubble_length: The length of the bubble. -:param a_before: 'a' value before the mid-point. -:param a_after: 'a' value after the mid-point. -:param b_before: 'b' value before the mid-point. -:param b_after: 'b' value after the mid-point.
-:param alpha: Exponent for the drift calculation. -:param beta: Exponent for the volatility calculation. -:param explosion_filter_width: Width of the area around the explosion that denominators won't exceed. -:return: A tuple containing the drift and volatility arrays. - - -### πŸ“„ `RiskLabAI\data\synthetic_data\synthetic_controlled_environment.py` - -#### `function compute_log_returns` - -```python -def compute_log_returnsN: int, mu_vector: np.ndarray, kappa_vector: np.ndarray, theta_vector: np.ndarray, xi_vector: np.ndarray, dwS: np.ndarray, dwV: np.ndarray, Y: np.ndarray, n: np.ndarray, dt: float, sqrt_dt: float, lambda_vector: np.ndarray, m_vector: np.ndarray, v_vector: np.ndarray, regime_change: np.ndarray: -``` - -> Computes the log returns based on the Heston-Merton model. - -:param N: Number of steps -:param mu_vector: Drift vector of length N -:param kappa_vector: Mean-reversion speed vector of length N -:param theta_vector: Long-term mean vector of length N -:param xi_vector: Volatility of volatility vector of length N -:param dwS: Wiener process for stock -:param dwV: Wiener process for volatility -:param Y: Jump component -:param n: Poisson random variable vector -:param dt: Time step -:param sqrt_dt: Square root of the time step -:param lambda_vector: Intensity of the jump vector -:param m_vector: Mean of jump size vector -:param v_vector: Variance of jump size vector -:param regime_change: Regime change booleans -:return: Log returns based on the Heston-Merton model - -The Heston Merton model formulae for log returns are: -.. 
math:: - v_{i+1} = v_i + \kappa_i (\theta_i - \max(v_i, 0)) dt + \xi_i \sqrt{\max(v_i, 0)} dwV_i \sqrt{dt} - log\_returns_i = (\mu_i - 0.5 v_i - \lambda_i (m_i + \frac{v^2_i}{2})) dt + \sqrt{v_i} dwS_i \sqrt{dt} + dJ_i - -#### `function heston_merton_log_returns` - -```python -def heston_merton_log_returnsT: float, N: int, mu_vector: np.ndarray, kappa_vector: np.ndarray, theta_vector: np.ndarray, xi_vector: np.ndarray, rho_vector: np.ndarray, lambda_vector: np.ndarray, m_vector: np.ndarray, v_vector: np.ndarray, regime_change: np.ndarray, random_state=None: -``` - -> Computes the log returns based on the Heston-Merton model using Gaussian random numbers. - -:param T: Total time -:param N: Number of steps -:param mu_vector: Drift vector of length N -:param kappa_vector: Mean-reversion speed vector of length N -:param theta_vector: Long-term mean vector of length N -:param xi_vector: Volatility of volatility vector of length N -:param rho_vector: Correlation coefficient vector of length N -:param lambda_vector: Intensity of the jump vector -:param m_vector: Mean of jump size vector -:param v_vector: Variance of jump size vector -:param random_state: Random state for reproducibility -:param regime_change: Regime change booleans -:return: Log returns based on the Heston-Merton model - -#### `function align_params_length` - -```python -def align_params_lengthregime_params: Dict[str, Union[float, List[float]]]: -``` - -> Align the parameters' length within the provided regime parameters. - -:param regime_params: Dictionary of regime parameters. Values can be floats or lists. -:return: A tuple containing the regime parameters with aligned lengths and the max length. - -#### `function generate_prices_from_regimes` - -```python -def generate_prices_from_regimesregimes: Dict[str, Dict[str, Union[float, List[float]]]], transition_matrix: np.ndarray, total_time: float, n_steps: int, random_state: int=None: -``` - -> Generate prices based on provided regimes and a Markov Chain. 
- -:param regimes: Dictionary containing regime names and their respective parameters. -:param transition_matrix: Markov Chain transition matrix. -:param total_time: Total time for the simulation. -:param n_steps: Number of discrete steps in the simulation. -:param random_state: Seed for random number generation. -:return: A tuple containing the generated prices as a pandas Series and the simulated regimes. - -#### `function parallel_generate_prices` - -```python -def parallel_generate_pricesnumber_of_paths: int, regimes: Dict[str, Dict[str, Union[float, List[float]]]], transition_matrix: np.ndarray, total_time: float, number_of_steps: int, random_state: Union[int, None]=None, n_jobs: int=1: -``` - -> Parallel generation of prices using the provided regimes. - -:param number_of_paths: The number of paths to generate. -:param regimes: Dictionary containing regime names and their respective parameters. -:param transition_matrix: Markov Chain transition matrix. -:param total_time: Total time for the simulation. -:param number_of_steps: Number of discrete steps in the simulation. -:param random_state: Seed for random number generation. -:param n_jobs: Number of parallel jobs to run. -:return: A tuple containing the generated prices and simulated regimes as pandas DataFrames. - - -### πŸ“„ `RiskLabAI\data\weights\sample_weights.py` - -#### `function expand_label_for_meta_labeling` - -```python -def expand_label_for_meta_labelingclose_index: pd.Index, timestamp: pd.Series, molecule: pd.Index: -``` - -> Expand labels for meta-labeling. - -This function expands labels to incorporate meta-labeling by taking -an event Index, a Series with the return and label of each period, -and an Index specifying the molecules to apply the function to. It then returns a Series with the count -of events spanning a bar for each molecule. - -:param event_index: Index of events. -:param return_label_dataframe: Series containing returns and labels of each period. 
-:param molecule_index: Index specifying molecules to apply the function on. -:return: Series with the count of events spanning a bar for each molecule. - -#### `function calculate_sample_weight` - -```python -def calculate_sample_weighttimestamp: pd.DataFrame, concurrency_events: pd.DataFrame, molecule: pd.Index: -``` - -> Calculate sample weight using triple barrier method. - -:param timestamp: DataFrame of events start and end for labelling. -:param concurrency_events: Data frame of concurrent events for each event. -:param molecule: Index that function must apply on it. -:return: Series of sample weights. - -#### `function create_index_matgrix` - -```python -def create_index_matgrixbar_index: pd.Index, timestamp: pd.DataFrame: -``` - -> Create an indicator matrix. - -:param bar_index: Index of all data. -:param timestamp: DataFrame with starting and ending times of events. -:return: Indicator matrix. - -#### `function calculate_average_uniqueness` - -```python -def calculate_average_uniquenessindex_matrix: pd.DataFrame: -``` - -> Calculate average uniqueness from indicator matrix. - -:param index_matrix: Indicator matrix. -:return: Series of average uniqueness values. - -#### `function perform_sequential_bootstrap` - -```python -def perform_sequential_bootstrapindex_matrix: pd.DataFrame, sample_length: int: -``` - -> Perform sequential bootstrap to generate a sample. - -:param index_matrix: Matrix of indicators for events. -:param sample_length: Number of samples. -:return: List of indices representing the sample. - -#### `function calculate_sample_weight_absolute_return` - -```python -def calculate_sample_weight_absolute_returntimestamp: pd.DataFrame, concurrency_events: pd.DataFrame, returns: pd.DataFrame, molecule: pd.Index: -``` - -> Calculate sample weight using absolute returns. - -:param timestamp: DataFrame for events. -:param concurrency_events: DataFrame that contains number of concurrent events for each event. 
-:param returns: DataFrame that contains returns. -:param molecule: Index for the calculation. -:return: Series of sample weights. - -#### `function sample_weight_absolute_return_meta_labeling` - -```python -def sample_weight_absolute_return_meta_labelingtimestamp: pd.Series, price: pd.Series, molecule: pd.Index: -``` - -> Calculate sample weights using absolute returns. - -:param event_timestamps: Series containing event timestamps. -:param price_series: Series containing prices. -:param molecule_index: Index for the calculation. -:return: Series of sample weights. - -#### `function calculate_time_decay` - -```python -def calculate_time_decayweight: pd.Series, clf_last_weight: float=1.0: -``` - -> Calculate time decay on weight. - -:param weight: Weight computed for each event. -:param clf_last_weight: Weight of oldest observation. -:return: Series of weights after applying time decay. - - -### πŸ“„ `RiskLabAI\ensemble\bagging_classifier_accuracy.py` - -#### `function bagging_classifier_accuracy` - -```python -def bagging_classifier_accuracyN: int, p: float, k: int=2: -``` - -> Calculate the accuracy of a bagging classifier. - -The function calculates the accuracy of a bagging classifier based on the given -parameters and according to the formula: - -.. math:: - 1 - \sum_{i=0}^{N/k} \binom{N}{i} p^i (1-p)^{N-i} - -:param N: Number of independent classifiers. -:param p: Probability of a classifier labeling a prediction as 1. -:param k: Number of classes (default is 2). -:return: Bagging classifier accuracy. - -Reference: - De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. - Methodology: page 96, "Improved Accuracy" section. - - -### πŸ“„ `RiskLabAI\features\entropy_features\entropy.py` - - -### πŸ“„ `RiskLabAI\features\entropy_features\kontoyiannis.py` - -#### `function longest_match_length` - -```python -def longest_match_lengthmessage: str, i: int, n: int: -``` - -> Calculate the length of the longest match. 
- -:param message: Input encoded message -:type message: str -:param i: Index value -:type i: int -:param n: Length parameter -:type n: int -:return: Tuple containing matched length and substring -:rtype: tuple - -#### `function kontoyiannis_entropy` - -```python -def kontoyiannis_entropymessage: str, window: int=None: -``` - -> Calculate Kontoyiannis Entropy. - -:param message: Input encoded message -:type message: str -:param window: Length of expanding window, default is None -:type window: int or None -:return: Calculated Kontoyiannis Entropy -:rtype: float - - -### πŸ“„ `RiskLabAI\features\entropy_features\lempel_ziv.py` - -#### `function lempel_ziv_entropy` - -```python -def lempel_ziv_entropymessage: str: -``` - -> Calculate Lempel-Ziv Entropy. - -:param message: Input encoded message -:type message: str -:return: Calculated Lempel-Ziv Entropy -:rtype: float - - -### πŸ“„ `RiskLabAI\features\entropy_features\plug_in.py` - -#### `function plug_in_entropy_estimator` - -```python -def plug_in_entropy_estimatormessage: str, approximate_word_length: int=1: -``` - -> Calculate Plug-in Entropy Estimator. - -:param message: Input encoded message -:type message: str -:param approximate_word_length: Approximation of word length, default is 1 -:type approximate_word_length: int -:return: Calculated Plug-in Entropy Estimator -:rtype: float - - -### πŸ“„ `RiskLabAI\features\entropy_features\pmf.py` - -#### `function probability_mass_function` - -```python -def probability_mass_functionmessage: str, approximate_word_length: int: -``` - -> Calculate Probability Mass Function. - -:param message: Input encoded message -:type message: str -:param approximate_word_length: Approximation of word length -:type approximate_word_length: int -:return: Probability Mass Function -:rtype: dict - - -### πŸ“„ `RiskLabAI\features\entropy_features\shannon.py` - -#### `function shannon_entropy` - -```python -def shannon_entropymessage: str: -``` - -> Calculate Shannon Entropy. 
- -:param message: Input encoded message -:type message: str -:return: Calculated Shannon Entropy -:rtype: float - - -### πŸ“„ `RiskLabAI\features\feature_importance\clustered_feature_importance_mda.py` - -#### `class ClusteredFeatureImportanceMDA` - -##### `method __init__` - -```python -def __init__: -``` -##### `method compute` - -```python -def computeself, classifier: RandomForestClassifier, x: pd.DataFrame, y: pd.Series, clusters: Dict[str, List[str]], n_splits: int=10, score_sample_weights: List[float]=None, train_sample_weights: List[float]=None: -``` - -> Compute clustered feature importance using MDA. - -The feature importance is computed by comparing the performance -(log loss) of a trained classifier on shuffled data to its -performance on non-shuffled data. - -:param classifier: The Random Forest classifier to be trained. -:param x: The features DataFrame. -:param y: The target Series. -:param clusters: A dictionary where the keys are the cluster names - and the values are lists of features in each cluster. -:param n_splits: The number of splits for KFold cross-validation. -:param score_sample_weights: Sample weights to be used when computing the score. -:param train_sample_weights: Sample weights to be used during training. - -:return: A DataFrame with feature importances and their standard deviations. - -The related mathematical formulae: - -.. math:: - - \text{{importance}} = \frac{{-1 \times \text{{score with shuffled data}}}} - {{\text{{score without shuffled data}}}} - -Using Central Limit Theorem for calculating the standard deviation: - -.. 
math:: - - \text{{StandardDeviation}} = \text{{std}} \times n^{-0.5} - - -### πŸ“„ `RiskLabAI\features\feature_importance\clustered_feature_importance_mdi.py` - -#### `class ClusteredFeatureImportanceMDI` - -##### `method __init__` - -```python -def __init__self, classifier: RandomForestClassifier, clusters: Dict[str, List[str]], x: pd.DataFrame, y: pd.Series: -``` - -> Initialize the ClusteredFeatureImportanceMDI class. - -:param classifier: The Random Forest classifier. -:param clusters: A dictionary where the keys are the cluster names - and the values are lists of features in each cluster. -:param x: The features DataFrame. -:param y: The target Series. - -##### `method group_mean_std` - -```python -def group_mean_stdself, dataframe: pd.DataFrame, clusters: Dict[str, List[str]]: -``` - -> Calculate the mean and standard deviation for clusters. - -:param dataframe: A DataFrame of importances. -:param clusters: A dictionary of cluster definitions. - -:return: A DataFrame with mean and standard deviation for each cluster. - -Using Central Limit Theorem for standard deviation: - -.. math:: - - \text{{StandardDeviation}} = \text{{std}} \times n^{-0.5} - -##### `method compute` - -```python -def computeself: -``` - -> Compute aggregated feature importances for clusters. - -:return: A DataFrame with aggregated importances for clusters. - - -### πŸ“„ `RiskLabAI\features\feature_importance\clustering.py` - -#### `function covariance_to_correlation` - -```python -def covariance_to_correlationcovariance: np.ndarray: -``` - -> Derive the correlation matrix from a covariance matrix. - -:param covariance: numpy ndarray - The covariance matrix to convert to a correlation matrix. -:return: numpy ndarray - The correlation matrix derived from the covariance matrix. - -The conversion is done based on the following mathematical formula: -correlation = covariance / (std_i * std_j) -where std_i and std_j are the standard deviations of the i-th and j-th elements. 
- -#### `function cluster_kmeans_base` - -```python -def cluster_kmeans_basecorrelation: pd.DataFrame, number_clusters: int=10, iterations: int=10: -``` - -> Apply the K-means clustering algorithm. - -:param correlation: pandas DataFrame - The correlation matrix. -:param number_clusters: int, optional - The maximum number of clusters. Default is 10. -:param iterations: int, optional - The number of iterations to run the clustering. Default is 10. -:return: tuple - A tuple containing the sorted correlation matrix, cluster membership, and silhouette scores. - -This function is based on Snippet 4.1 from De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - -#### `function make_new_outputs` - -```python -def make_new_outputscorrelation: pd.DataFrame, clusters: dict, clusters2: dict: -``` - -> Merge two sets of clusters and derive new outputs. - -:param correlation: pandas DataFrame - The correlation matrix. -:param clusters: dict - The first set of clusters. -:param clusters2: dict - The second set of clusters. -:return: tuple - A tuple containing the new correlation matrix, new cluster membership, and new silhouette scores. - -#### `function cluster_kmeans_top` - -```python -def cluster_kmeans_topcorrelation: pd.DataFrame, number_clusters: int=None, iterations: int=10: -``` - -> Apply the K-means clustering algorithm with hierarchical re-clustering. - -:param correlation: pandas DataFrame - The correlation matrix. -:param number_clusters: int, optional - The maximum number of clusters. Default is None. -:param iterations: int, optional - The number of iterations to run the clustering. Default is 10. -:return: tuple - A tuple containing the sorted correlation matrix, cluster membership, and silhouette scores. - -This function is based on Snippet 4.2 from De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. 
- -#### `function random_covariance_sub` - -```python -def random_covariance_subnumber_observations: int, number_columns: int, sigma: float, random_state=None: -``` - -> Compute a sub covariance matrix. - -Generates a covariance matrix based on random data. - -:param number_observations: Number of observations. -:param number_columns: Number of columns. -:param sigma: Sigma for normal distribution. -:param random_state: Random state for reproducibility. -:return: Sub covariance matrix. - -.. note:: Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Methodology: Snippet 4.3, Page 61. - -#### `function random_block_covariance` - -```python -def random_block_covariancenumber_columns: int, number_blocks: int, block_size_min: int=1, sigma: float=1.0, random_state=None: -``` - -> Compute a random block covariance matrix. - -Generates a block random covariance matrix by combining multiple sub covariance matrices. - -:param number_columns: Number of columns. -:param number_blocks: Number of blocks. -:param block_size_min: Minimum size of block. -:param sigma: Sigma for normal distribution. -:param random_state: Random state for reproducibility. -:return: Block random covariance matrix. - -.. note:: Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Methodology: Snippet 4.3, Page 61. - -#### `function random_block_correlation` - -```python -def random_block_correlationnumber_columns: int, number_blocks: int, random_state=None, block_size_min: int=1: -``` - -> Compute a random block correlation matrix. - -Generates a block random correlation matrix by adding two block random covariance matrices -and converting them to a correlation matrix. - -:param number_columns: Number of columns. -:param number_blocks: Number of blocks. -:param random_state: Random state for reproducibility. -:param block_size_min: Minimum size of block. -:return: Block random correlation matrix. - -.. 
note:: Reference: De Prado, M. (2020) Advances in financial machine learning. John Wiley & Sons. - Methodology: Snippet 4.3, Page 61. - -#### `function cov_to_corr` - -```python -def cov_to_corrcovariance: np.ndarray: -``` - -> Convert a covariance matrix to a correlation matrix. - -:param covariance: Covariance matrix. -:return: Correlation matrix. - -.. math:: - correlation_{ij} = \frac{covariance_{ij}}{\sqrt{covariance_{ii} \cdot covariance_{jj}}} - -#### `function cluster_kmeans_base` - -```python -def cluster_kmeans_basecorrelation: pd.DataFrame, number_clusters: int=10, iterations: int=10: -``` - -> Perform KMeans clustering on a correlation matrix. - -:param correlation: Correlation matrix. -:param number_clusters: Number of clusters, default is 10. -:param iterations: Number of iterations, default is 10. -:return: Sorted correlation matrix, clusters, silhouette scores. - -.. note:: - The silhouette score is a measure of how similar an object is to its own cluster compared to other clusters. - - -### 📄 `RiskLabAI\features\feature_importance\feature_importance_controller.py` - -#### `class FeatureImportanceController` - -> Controller class to manage various feature importance strategies. - -To use this controller class: - -1. Initialize it with the type of feature importance strategy - you want to use, along with any required parameters for that strategy. -2. Call the `calculate_importance` method to perform the - feature importance calculation. - -For example: - -.. code-block:: python - - # Initialize the controller with a 'ClusteredMDA' strategy - controller = FeatureImportanceController('ClusteredMDA', - classifier=my_classifier, - clusters=my_clusters) - - # Calculate feature importance - result = controller.calculate_importance(my_x, my_y) - -##### `method __init__` - -```python -def __init__self, strategy_type: str, **kwargs: -``` - -> Initialize the controller with a specific feature importance strategy. 
- -:param strategy_type: The type of feature importance strategy to use. -:param kwargs: Additional arguments to pass to the strategy class. - -##### `method calculate_importance` - -```python -def calculate_importanceself, x, y, **kwargs: -``` - -> Calculate feature importance based on the initialized strategy. - -:param x: Feature data. -:param y: Target data. -:param kwargs: Additional arguments to pass to the calculation method. - -:return: Feature importance results. - - -### πŸ“„ `RiskLabAI\features\feature_importance\feature_importance_factory.py` - -#### `class FeatureImportanceFactory` - -> Factory class for building and fetching feature importance computation results. - -Usage: - -.. code-block:: python - - factory = FeatureImportanceFactory() - factory.build(my_feature_importance_strategy_instance) - results = factory.get_results() - -##### `method __init__` - -```python -def __init__self: -``` - -> Initialize the FeatureImportanceFactory class. - -##### `method build` - -```python -def buildself, feature_importance_strategy: FeatureImportanceStrategy: -``` - -> Build the feature importance based on the provided strategy. - -:param feature_importance_strategy: An instance of a strategy - inheriting from FeatureImportanceStrategy. - -:return: Current instance of the FeatureImportanceFactory. - -##### `method get_results` - -```python -def get_resultsself: -``` - -> Fetch the computed feature importance results. - -:return: Dataframe containing the feature importance results. - - -### πŸ“„ `RiskLabAI\features\feature_importance\feature_importance_mda.py` - -#### `class FeatureImportanceMDA` - -> Computes the feature importance using the Mean Decrease Accuracy (MDA) method. - -The method shuffles each feature one by one and measures how much the performance -(log loss in this context) decreases due to the shuffling. - -.. 
math:: - - \text{importance}_{j} = \frac{\text{score without shuffling} - \text{score with shuffling}_{j}} - {\text{score without shuffling}} - -##### `method __init__` - -```python -def __init__self, classifier: object, x: pd.DataFrame, y: pd.Series, n_splits: int=10, score_sample_weights: Optional[List[float]]=None, train_sample_weights: Optional[List[float]]=None: -``` - -> Initialize the class with parameters. - -:param classifier: The classifier object. -:param x: The feature data. -:param y: The target data. -:param n_splits: Number of splits for cross-validation. -:param score_sample_weights: Weights for scoring samples. -:param train_sample_weights: Weights for training samples. - -##### `method compute` - -```python -def computeself: -``` - -> Compute the feature importances. - -:return: Feature importances as a dataframe with "Mean" and "StandardDeviation" columns. - - -### πŸ“„ `RiskLabAI\features\feature_importance\feature_importance_mdi.py` - -#### `class FeatureImportanceMDI` - -> Computes the feature importance using the Mean Decrease Impurity (MDI) method. - -The method calculates the importance of a feature by measuring the average impurity -decrease across all the trees in the forest, where impurity is calculated -using metrics like Gini impurity or entropy. - -.. math:: - - \text{importance}_{j} = \frac{\text{average impurity decrease for feature j}}{\text{total impurity decrease}} - -##### `method __init__` - -```python -def __init__self, classifier: object, x: pd.DataFrame, y: Union[pd.Series, List[Optional[float]]]: -``` - -> Initialize the class with parameters. - -:param classifier: The classifier object. -:param x: The feature data. -:param y: The target data. - -##### `method compute` - -```python -def computeself: -``` - -> Compute the feature importances. - -:return: Feature importances as a dataframe with "Mean" and "StandardDeviation" columns. 
- - -### πŸ“„ `RiskLabAI\features\feature_importance\feature_importance_sfi.py` - -#### `class FeatureImportanceSFI` - -> Computes the Single Feature Importance (SFI). - -The method calculates the importance of each feature by evaluating its performance -individually in the classifier. - -##### `method __init__` - -```python -def __init__self, classifier: object, x: pd.DataFrame, y: Union[pd.Series, List[Optional[float]]], n_splits: int=10, score_sample_weights: Optional[List[float]]=None, train_sample_weights: Optional[List[float]]=None, scoring: str='log_loss': -``` - -> Initialize the class with parameters. - -:param classifier: The classifier object. -:param x: The feature data. -:param y: The target data. -:param n_splits: The number of splits for cross-validation. -:param score_sample_weights: Sample weights for scoring. -:param train_sample_weights: Sample weights for training. -:param scoring: Scoring method ("log_loss" or "accuracy"). - -##### `method compute` - -```python -def computeself: -``` - -> Compute the Single Feature Importance. - -:return: Feature importances as a dataframe with "FeatureName", "Mean", and "StandardDeviation" columns. - - -### πŸ“„ `RiskLabAI\features\feature_importance\feature_importance_strategy.py` - -#### `class FeatureImportanceStrategy` - -> Abstract Base Class for computing feature importance. - -Derived classes must implement the `compute` method to -provide their own logic for computing feature importance. - -##### `method compute` - -```python -def computeself, *args, **kwargs: -``` - -> Abstract method to compute feature importance. - -:param args: Positional arguments. -:param kwargs: Keyword arguments. -:return: A pandas DataFrame containing feature importances. - -Note: Derived classes should provide a concrete implementation -of this method with specific parameters and docstrings relevant -to their implementation. 
- - -### πŸ“„ `RiskLabAI\features\feature_importance\generate_synthetic_data.py` - -#### `function get_test_dataset` - -```python -def get_test_datasetn_features: int=100, n_informative: int=25, n_redundant: int=25, n_samples: int=10000, random_state: int=0, sigma_std: float=0.0: -``` - -> Generate a synthetic dataset with informative, redundant, and explanatory variables. - -:param n_features: Total number of features -:type n_features: int -:param n_informative: Number of informative features -:type n_informative: int -:param n_redundant: Number of redundant features -:type n_redundant: int -:param n_samples: Number of samples to generate -:type n_samples: int -:param random_state: Random state for reproducibility -:type random_state: int -:param sigma_std: Standard deviation for generating redundant features, default is 0.0 -:type sigma_std: float -:return: Tuple containing generated X (features) and y (labels) -:rtype: tuple - - -### πŸ“„ `RiskLabAI\features\feature_importance\orthogonal_features.py` - -#### `function compute_eigenvectors` - -```python -def compute_eigenvectorsdot_product: np.ndarray, explained_variance_threshold: float: -``` - -> Compute eigenvalues and eigenvectors for orthogonal features. - -:param dot_product: Input dot product matrix. -:type dot_product: np.ndarray -:param explained_variance_threshold: Threshold for cumulative explained variance. -:type explained_variance_threshold: float -:return: DataFrame containing eigenvalues, eigenvectors, and cumulative explained variance. -:rtype: pd.DataFrame - -#### `function orthogonal_features` - -```python -def orthogonal_featuresfeatures: np.ndarray, variance_threshold: float=0.95: -``` - -> Compute orthogonal features using eigenvalues and eigenvectors. - -:param features: Features matrix. -:type features: np.ndarray -:param variance_threshold: Threshold for cumulative explained variance, default is 0.95. 
-:type variance_threshold: float -:return: Tuple containing orthogonal features and eigenvalues information. -:rtype: tuple - - -### 📄 `RiskLabAI\features\feature_importance\weighted_tau.py` - -#### `function calculate_weighted_tau` - -```python -def calculate_weighted_taufeature_importances: np.ndarray, principal_component_ranks: np.ndarray: -``` - -> Calculate the weighted Kendall's tau (τ) using feature importances and principal component ranks. - -Kendall's tau is a measure of correlation between two rankings. The weighted version of -Kendall's tau takes into account the weights of the rankings. In this case, the weights -are the inverse of the principal component ranks. - -:param feature_importances: Vector of feature importances. -:type feature_importances: np.ndarray -:param principal_component_ranks: Vector of principal component ranks. -:type principal_component_ranks: np.ndarray -:return: Weighted τ value. -:rtype: float - -.. math:: - - \tau_B = \frac{(P - Q)}{\sqrt{(P + Q + T) (P + Q + U)}} - -where: - - P is the number of concordant pairs - - Q is the number of discordant pairs - - T is the number of ties only in the first ranking - - U is the number of ties only in the second ranking - - -### 📄 `RiskLabAI\features\microstructural_features\bekker_parkinson_volatility_estimator.py` - -#### `function sigma_estimates` - -```python -def sigma_estimatesbeta: pd.Series, gamma: pd.Series: -``` - -> Compute Bekker-Parkinson volatility σ estimates. - -This function calculates the Bekker-Parkinson volatility estimates based on the provided -beta and gamma values. The mathematical formula used is: - -.. math:: - \sigma = \frac{(2^{0.5} - 1) \cdot (\beta ^ {0.5})}{3 - 2 \cdot (2^{0.5})} - + \left(\frac{\gamma}{\left(\frac{8}{\pi}\right)^{0.5} \cdot (3 - 2 \cdot (2^{0.5}))}\right)^{0.5} - -Negative resulting values are set to 0. - -:param beta: β Estimates vector. -:param gamma: γ Estimates vector. -:return: Bekker-Parkinson volatility σ estimates. 
- -Reference: - De Prado, M. (2018) Advances in Financial Machine Learning, page 286, snippet 19.2. - -#### `function bekker_parkinson_volatility_estimates` - -```python -def bekker_parkinson_volatility_estimateshigh_prices: pd.Series, low_prices: pd.Series, window_span: int=20: -``` - -> Compute Bekker-Parkinson volatility estimates based on high and low prices. - -Utilizes Corwin and Schultz estimation techniques to calculate the Bekker-Parkinson -volatility. The function first determines the beta and gamma values and then -uses them to compute the volatility estimates. - -:param high_prices: High prices vector. -:param low_prices: Low prices vector. -:param window_span: Rolling window span for beta estimation. -:return: Bekker-Parkinson volatility estimates. - -Reference: - De Prado, M. (2018) Advances in Financial Machine Learning, page 286, "Corwin and Schultz" section. - - -### 📄 `RiskLabAI\features\microstructural_features\corwin_schultz.py` - -#### `function beta_estimates` - -```python -def beta_estimateshigh_prices: pd.Series, low_prices: pd.Series, window_span: int: -``` - -> Estimate β using Corwin and Schultz methodology. - -:param high_prices: High prices vector. -:param low_prices: Low prices vector. -:param window_span: Rolling window span. -:return: Estimated β vector. - -.. note:: Reference: Corwin, S. A., & Schultz, P. (2012). A simple way to estimate bid-ask spreads from daily high and low prices. The Journal of Finance, 67(2), 719-760. - -#### `function gamma_estimates` - -```python -def gamma_estimateshigh_prices: pd.Series, low_prices: pd.Series: -``` - -> Estimate γ using Corwin and Schultz methodology. - -:param high_prices: High prices vector. -:param low_prices: Low prices vector. -:return: Estimated γ vector. - -.. note:: Reference: Corwin, S. A., & Schultz, P. (2012). A simple way to estimate bid-ask spreads from daily high and low prices. The Journal of Finance, 67(2), 719-760. 
- -#### `function alpha_estimates` - -```python -def alpha_estimatesbeta: pd.Series, gamma: pd.Series: -``` - -> Estimate α using Corwin and Schultz methodology. - -:param beta: β Estimates vector. -:param gamma: γ Estimates vector. -:return: Estimated α vector. - -.. note:: Reference: Corwin, S. A., & Schultz, P. (2012). A simple way to estimate bid-ask spreads from daily high and low prices. The Journal of Finance, 67(2), 719-760. - -#### `function corwin_schultz_estimator` - -```python -def corwin_schultz_estimatorhigh_prices: pd.Series, low_prices: pd.Series, window_span: int=20: -``` - -> Estimate spread using Corwin and Schultz methodology. - -:param high_prices: High prices vector. -:param low_prices: Low prices vector. -:param window_span: Rolling window span, default is 20. -:return: Estimated spread vector. - -.. note:: Reference: Corwin, S. A., & Schultz, P. (2012). A simple way to estimate bid-ask spreads from daily high and low prices. The Journal of Finance, 67(2), 719-760. - - -### 📄 `RiskLabAI\features\structural_breaks\structural_breaks.py` - -#### `function lag_dataframe` - -```python -def lag_dataframemarket_data: pd.DataFrame, lags: int: -``` - -> Apply lags to DataFrame. - -Reference: De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 17.3 - -:param market_data: Data of price or log price. -:param lags: Arrays of lag or integer that shows number of lags. -:return: DataFrame with lagged data. - -#### `function prepare_data` - -```python -def prepare_dataseries: pd.DataFrame, constant: str, lags: int: -``` - -> Prepare the datasets. - -Reference: De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 17.2 - -:param series: Data of price or log price. -:param constant: String that must be "nc" or "ct" or "ctt". -:param lags: Arrays of lag or integer that shows number of lags. -:return: Tuple of y and x arrays. 
- -#### `function compute_beta` - -```python -def compute_betay: np.ndarray, x: np.ndarray: -``` - -> Fit the ADF specification. - -Reference: De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 17.4 - -:param y: Dependent variable. -:param x: Matrix of independent variable. -:return: Tuple of beta_mean and beta_variance. - -#### `function adf` - -```python -def adflog_price: pd.DataFrame, min_sample_length: int, constant: str, lags: int: -``` - -> SADF's inner loop. - -Reference: De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons. -Methodology: Snippet 17.1 - -:param log_price: Pandas DataFrame of log price. -:param min_sample_length: Minimum sample length. -:param constant: String that must be "nc" or "ct" or "ctt". -:param lags: Arrays of lag or integer that shows number of lags. -:return: Dictionary with Time and gsadf values. - - -### πŸ“„ `RiskLabAI\hpc\hpc.py` - -#### `function report_progress` - -```python -def report_progressjob_number: int, total_jobs: int, start_time: float, task: str: -``` - -> Report the progress of a computing task. - -:param job_number: The current job number. -:type job_number: int -:param total_jobs: The total number of jobs. -:type total_jobs: int -:param start_time: The start time of the computation. -:type start_time: float -:param task: The task being performed. -:type task: str -:return: None - -#### `function process_jobs` - -```python -def process_jobsjobs: list, task: str=None, num_threads: int=24: -``` - -> Process multiple jobs in parallel. - -:param jobs: A list of jobs to be processed. -:type jobs: list -:param task: The task being performed. -:type task: str -:param num_threads: Number of threads to be used. -:type num_threads: int -:return: Outputs of the jobs. -:rtype: list - -#### `function expand_call` - -```python -def expand_callkargs: dict: -``` - -> Expand the arguments of a callback function, kargs['func']. 
- -:param kargs: Arguments for the callback function. -:type kargs: dict -:return: Output of the callback function. - -#### `function process_jobs_sequential` - -```python -def process_jobs_sequentialjobs: list: -``` - -> Single-thread execution, for debugging. - -:param jobs: A list of jobs to be processed. -:type jobs: list -:return: Outputs of the jobs. -:rtype: list - -#### `function linear_partitions` - -```python -def linear_partitionsnum_atoms: int, num_threads: int: -``` - -> Generate linear partitions for parallel computation. - -:param num_atoms: Number of atoms. -:type num_atoms: int -:param num_threads: Number of threads. -:type num_threads: int -:return: The partitions. -:rtype: list - -#### `function nested_partitions` - -```python -def nested_partitionsnum_atoms: int, num_threads: int, upper_triangle: bool=False: -``` - -> Generate nested partitions for parallel computation. - -:param num_atoms: Number of atoms. -:type num_atoms: int -:param num_threads: Number of threads. -:type num_threads: int -:param upper_triangle: Whether to generate partitions for the upper triangle. -:type upper_triangle: bool -:return: The partitions. -:rtype: list - -The formula for partition size is given by: - -.. math:: - - partitions = \frac{-1 + \sqrt{1 + 4 \cdot (partitions[-1]^2 + partitions[-1] + \frac{\text{num\_atoms} \cdot (\text{num\_atoms} + 1)}{\text{num\_threads}})}}{2} - -#### `function mp_pandas_obj` - -```python -def mp_pandas_objfunction, pandas_object: tuple, num_threads: int=2, mp_batches: int=1, linear_partition: bool=True, **kwargs: -``` - -> Parallelize jobs and return a DataFrame or Series. - -:param function: The function to be parallelized. -:param pandas_object: A tuple containing the name of the argument used to pass the molecule and a list of atoms - that will be grouped into molecules. -:type pandas_object: tuple -:param num_threads: Number of threads to be used. -:type num_threads: int -:param mp_batches: Number of batches for multiprocessing. 
-:type mp_batches: int -:param linear_partition: Whether to use linear partitioning or nested partitioning. -:type linear_partition: bool -:param kwargs: Other arguments needed by the function. -:return: The result of the function parallelized. -:rtype: DataFrame or Series - - -### πŸ“„ `RiskLabAI\optimization\hrp.py` - -#### `function inverse_variance_weights` - -```python -def inverse_variance_weightscovariance_matrix: pd.DataFrame: -``` - -> Compute the inverse-variance portfolio weights. - -:param covariance_matrix: Covariance matrix of asset returns. -:type covariance_matrix: pd.DataFrame -:return: Array of portfolio weights. -:rtype: np.ndarray - -#### `function cluster_variance` - -```python -def cluster_variancecovariance_matrix: pd.DataFrame, clustered_items: list: -``` - -> Compute the variance of a cluster. - -:param covariance_matrix: Covariance matrix of asset returns. -:type covariance_matrix: pd.DataFrame -:param clustered_items: List of indices of assets in the cluster. -:type clustered_items: list -:return: Variance of the cluster. -:rtype: float - -#### `function quasi_diagonal` - -```python -def quasi_diagonallinkage_matrix: np.ndarray: -``` - -> Return a sorted list of original items to reshape the correlation matrix. - -:param linkage_matrix: Linkage matrix obtained from hierarchical clustering. -:type linkage_matrix: np.ndarray -:return: Sorted list of original items. -:rtype: list - -#### `function recursive_bisection` - -```python -def recursive_bisectioncovariance_matrix: pd.DataFrame, sorted_items: list: -``` - -> Compute the Hierarchical Risk Parity (HRP) weights. - -:param covariance_matrix: Covariance matrix of asset returns. -:type covariance_matrix: pd.DataFrame -:param sorted_items: Sorted list of original items. -:type sorted_items: list -:return: DataFrame of asset weights. 
-:rtype: pd.Series - -#### `function distance_corr` - -```python -def distance_corrcorr_matrix: np.ndarray: -``` - -> Compute the distance matrix based on correlation. - -:param corr_matrix: Correlation matrix. -:type corr_matrix: np.ndarray -:return: Distance matrix based on correlation. -:rtype: np.ndarray - -#### `function plot_corr_matrix` - -```python -def plot_corr_matrixpath: str, corr_matrix: np.ndarray, labels: list=None: -``` - -> Plot a heatmap of the correlation matrix. - -:param path: Path to save the plot. -:type path: str -:param corr_matrix: Correlation matrix. -:type corr_matrix: np.ndarray -:param labels: List of labels for the assets (optional). -:type labels: list, optional - -#### `function random_data` - -```python -def random_datanum_observations: int, size_uncorr: int, size_corr: int, sigma_corr: float: -``` - -> Generate random data. - -:param num_observations: Number of observations. -:type num_observations: int -:param size_uncorr: Size for uncorrelated data. -:type size_uncorr: int -:param size_corr: Size for correlated data. -:type size_corr: int -:param sigma_corr: Standard deviation for correlated data. -:type sigma_corr: float -:return: DataFrame of randomly generated data and list of column indices for correlated data. -:rtype: pd.DataFrame, list - -#### `function random_data2` - -```python -def random_data2number_observations: int, length_sample: int, size_uncorrelated: int, size_correlated: int, mu_uncorrelated: float, sigma_uncorrelated: float, sigma_correlated: float: -``` - -> Generate random data for Monte Carlo simulation. - -:param number_observations: Number of observations. -:type number_observations: int -:param length_sample: Starting point for selecting random observations. -:type length_sample: int -:param size_uncorrelated: Size of uncorrelated data. -:type size_uncorrelated: int -:param size_correlated: Size of correlated data. -:type size_correlated: int -:param mu_uncorrelated: mu for uncorrelated data. 
-:type mu_uncorrelated: float -:param sigma_uncorrelated: sigma for uncorrelated data. -:type sigma_uncorrelated: float -:param sigma_correlated: sigma for correlated data. -:type sigma_correlated: float -:return: A tuple containing the generated data and the selected columns. -:rtype: np.ndarray, list - -#### `function hrp` - -```python -def hrpcov: np.ndarray, corr: np.ndarray: -``` - -> HRP method for constructing a hierarchical portfolio. - -:param cov: Covariance matrix. -:type cov: np.ndarray -:param corr: Correlation matrix. -:type corr: np.ndarray -:return: Pandas series containing weights of the hierarchical portfolio. -:rtype: pd.Series - -#### `function hrp_mc` - -```python -def hrp_mcnumber_iterations: int=5000, number_observations: int=520, size_uncorrelated: int=5, size_correlated: int=5, mu_uncorrelated: float=0, sigma_uncorrelated: float=0.01, sigma_correlated: float=0.25, length_sample: int=260, test_size: int=22: -``` - -> Monte Carlo simulation for out of sample comparison of HRP method. - -:param number_iterations: Number of iterations. -:type number_iterations: int -:param number_observations: Number of observations. -:type number_observations: int -:param size_uncorrelated: Size of uncorrelated data. -:type size_uncorrelated: int -:param size_correlated: Size of correlated data. -:type size_correlated: int -:param mu_uncorrelated: mu for uncorrelated data. -:type mu_uncorrelated: float -:param sigma_uncorrelated: sigma for uncorrelated data. -:type sigma_uncorrelated: float -:param sigma_correlated: sigma for correlated data. -:type sigma_correlated: float -:param length_sample: Length for in sample. -:type length_sample: int -:param test_size: Observation for test set. -:type test_size: int -:return: None - - -### πŸ“„ `RiskLabAI\optimization\hyper_parameter_tuning.py` - -#### `class MyPipeline` - -> Custom pipeline class to include sample_weight in fit_params. 
- -##### `method fit` - -```python -def fitself, X: pd.DataFrame, y: pd.DataFrame, sample_weight: list=None, **fit_params: -``` - -> Fit the pipeline while considering sample weights. - -:param X: Feature data. -:param y: Labels of data. -:param sample_weight: Sample weights for fit, defaults to None. -:param **fit_params: Additional fit parameters. -:return: Fitted pipeline. - -#### `function clf_hyper_fit` - -```python -def clf_hyper_fitfeature_data: pd.DataFrame, label: pd.DataFrame, times: pd.Series, pipe_clf: Pipeline, param_grid: dict, validator_type: str='purgedkfold', validator_params: dict=None, bagging: list=[0, -1, 1.0], rnd_search_iter: int=0, n_jobs: int=-1, **fit_params: -``` - -> Perform hyperparameter tuning and model fitting. - -:param feature_data: Data of features. -:param label: Labels of data. -:param times: The timestamp series associated with the labels. -:param pipe_clf: Our estimator. -:param param_grid: Parameter space. -:param validator_type: Type of cross-validator to create. -:param validator_params: Additional keyword arguments to be passed to the cross-validator's constructor. -:param bagging: Bagging type, defaults to [0, -1, 1.]. -:param rnd_search_iter: Number of iterations for randomized search, defaults to 0. -:param n_jobs: Number of jobs for parallel processing, defaults to -1. -:param **fit_params: Additional fit parameters. -:return: Fitted pipeline. - - -### πŸ“„ `RiskLabAI\optimization\nco.py` - -#### `function covariance_to_correlation_matrix` - -```python -def covariance_to_correlation_matrixcovariance: np.ndarray: -``` - -> Derive the correlation matrix from a covariance matrix. - -:param covariance: Covariance matrix. -:type covariance: numpy.ndarray -:return: Correlation matrix. -:rtype: numpy.ndarray - -#### `function get_optimal_portfolio_weights` - -```python -def get_optimal_portfolio_weightscovariance: np.ndarray, mu: np.ndarray=None: -``` - -> Compute the optimal portfolio weights. 
- -:param covariance: Covariance matrix. -:type covariance: numpy.ndarray -:param mu: Mean vector, defaults to None. -:type mu: numpy.ndarray, optional -:return: Portfolio weights. -:rtype: numpy.ndarray - -#### `function get_optimal_portfolio_weights_nco` - -```python -def get_optimal_portfolio_weights_ncocovariance: np.ndarray, mu: np.ndarray=None, number_clusters: int=None: -``` - -> Compute the optimal portfolio weights using the NCO algorithm. - -:param covariance: Covariance matrix. -:type covariance: numpy.ndarray -:param mu: Mean vector, defaults to None. -:type mu: numpy.ndarray, optional -:param number_clusters: Maximum number of clusters, defaults to None. -:type number_clusters: int, optional -:return: Optimal portfolio weights using NCO algorithm. -:rtype: numpy.ndarray - -#### `function cluster_k_means_base` - -```python -def cluster_k_means_basecorrelation: pd.DataFrame, number_clusters: int=10, iterations: int=10: -``` - -> Perform clustering using the K-means algorithm. - -:param correlation: Correlation matrix. -:type correlation: pd.DataFrame -:param number_clusters: Maximum number of clusters, defaults to 10. -:type number_clusters: int, optional -:param iterations: Number of iterations, defaults to 10. -:type iterations: int, optional -:return: Updated correlation matrix, cluster members, silhouette scores. -:rtype: tuple - - -### πŸ“„ `RiskLabAI\pde\equation.py` - -#### `class Equation` - -> Base class for defining PDE related function. 
- -Args: -eqn_config (dict): dictionary containing PDE configuration parameters - -Attributes: -dim (int): dimensionality of the problem -total_time (float): total time horizon -num_time_interval (int): number of time steps -delta_t (float): time step size -sqrt_delta_t (float): square root of time step size -y_init (None): initial value of the function - -##### `method __init__` - -```python -def __init__self, eqn_config: dict: -``` -##### `method sample` - -```python -def sampleself, num_sample: int: -``` - -> Sample forward SDE. - -Args: -num_sample (int): number of samples to generate - -Returns: -Tensor: tensor of size [num_sample, dim+1] containing samples - -##### `method r_u` - -```python -def r_uself, t: float, x: Tensor, y: Tensor, z: Tensor: -``` - -> Interest rate in the PDE. - -Args: -t (float): current time -x (Tensor): tensor of size [batch_size, dim] containing space coordinates -y (Tensor): tensor of size [batch_size, 1] containing function values -z (Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -Tensor: tensor of size [batch_size, 1] containing generator values - -##### `method h_z` - -```python -def h_zself, t, x, y, z: Tensor: -``` - -> Function to compute H(z) in the PDE. - -Args: -h (float): value of H function -z (Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -Tensor: tensor of size [batch_size, dim] containing H(z) - -##### `method terminal` - -```python -def terminalself, t: float, x: Tensor: -``` - -> Terminal condition of the PDE. 
- -Args: -t (float): current time -x (Tensor): tensor of size [batch_size, dim] containing space coordinates - -Returns: -Tensor: tensor of size [batch_size, 1] containing terminal values - -#### `class PricingDefaultRisk` - -> Args: -eqn_config (dict): dictionary containing PDE configuration parameters - -##### `method __init__` - -```python -def __init__self, eqn_config: -``` -##### `method sample` - -```python -def sampleself, num_sample: -``` - -> Sample forward SDE. - -Args: -num_sample (int): number of samples to generate - -Returns: -tuple: tuple of two tensors: dw_sample of size [num_sample, dim, num_time_interval] and -x_sample of size [num_sample, dim, num_time_interval+1] - -##### `method r_u` - -```python -def r_uself, t, x, y, z: -``` - -> Interest rate in the PDE. - -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates -y (torch.Tensor): tensor of size [batch_size, 1] containing function values -z (torch.Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing generator values - -##### `method h_z` - -```python -def h_zself, t, x, y, z: -``` - -> Function to compute $h^T Z$ in the PDE. - -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates -y (torch.Tensor): tensor of size [batch_size, 1] containing function value -z (torch.Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing H(z) - -##### `method sigma_matrix` - -```python -def sigma_matrixself, x: -``` -##### `method terminal` - -```python -def terminalself, t, x: -``` - -> Terminal condition of the PDE. 
- -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing terminal values - -##### `method terminal_for_sample` - -```python -def terminal_for_sampleself, x: -``` - -> Terminal condition of the PDE. - -Args: -x (torch.Tensor): tensor of size [num_sample,batch_size, dim] containing space coordinates - -Returns: -torch.Tensor: tensor of size [num_sample ,batch_size, 1] containing terminal values - -#### `class HJBLQ` - -> Args: -eqn_config (dict): dictionary containing PDE configuration parameters - -##### `method __init__` - -```python -def __init__self, eqn_config: dict: -``` -##### `method sample` - -```python -def sampleself, num_sample: int: -``` - -> Sample forward SDE. - -Args: -num_sample (int): number of samples to generate - -Returns: -tuple: tuple of two tensors: dw_sample of size [num_sample, dim, num_time_interval] and -x_sample of size [num_sample, dim, num_time_interval+1] - -##### `method r_u` - -```python -def r_uself, t: float, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor: -``` - -> Interest rate in the PDE. - -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates -y (torch.Tensor): tensor of size [batch_size, 1] containing function values -z (torch.Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing generator values - -##### `method h_z` - -```python -def h_zself, t: float, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor: -``` - -> Function to compute in the PDE. 
- -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates -y (torch.Tensor): tensor of size [batch_size, 1] containing function value -z (torch.Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing H(z) - -##### `method terminal` - -```python -def terminalself, t: float, x: torch.Tensor: -``` - -> Terminal condition of the PDE. - -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing terminal values - -##### `method sigma_matrix` - -```python -def sigma_matrixself, x: -``` -#### `function terminal_for_sample` - -```python -def terminal_for_sampleself, x: -``` - -> Terminal condition of the PDE. - -Args: -x (torch.Tensor): tensor of size [num_sample,batch_size, dim] containing space coordinates - -Returns: -torch.Tensor: tensor of size [num_sample ,batch_size, 1] containing terminal values - -#### `class BlackScholesBarenblatt` - -> Args: -eqn_config (dict): dictionary containing PDE configuration parameters - -##### `method __init__` - -```python -def __init__self, eqn_config: dict: -``` -##### `method sample` - -```python -def sampleself, num_sample: int: -``` - -> Sample forward SDE. - -Args: -num_sample (int): number of samples to generate - -Returns: -tuple: tuple of two tensors: dw_sample of size [num_sample, dim, num_time_interval] and -x_sample of size [num_sample, dim, num_time_interval+1] - -##### `method r_u` - -```python -def r_uself, t: float, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor: -``` - -> Interest rate in the PDE. 
- -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates -y (torch.Tensor): tensor of size [batch_size, 1] containing function values -z (torch.Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing generator values - -##### `method h_z` - -```python -def h_zself, t: float, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor: -``` - -> Function to compute in the PDE. - -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates -y (torch.Tensor): tensor of size [batch_size, 1] containing function value -z (torch.Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing H(z) - -##### `method terminal` - -```python -def terminalself, t: float, x: torch.Tensor: -``` - -> Terminal condition of the PDE. - -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing terminal values - -##### `method sigma_matrix` - -```python -def sigma_matrixself, x: -``` -##### `method terminal_for_sample` - -```python -def terminal_for_sampleself, x: -``` - -> Terminal condition of the PDE. - -Args: -x (torch.Tensor): tensor of size [num_sample,batch_size, dim] containing space coordinates - -Returns: -torch.Tensor: tensor of size [num_sample ,batch_size, 1] containing terminal values - -#### `class PricingDiffRate` - -> Nonlinear Black-Scholes equation with different interest rates for borrowing and lending -in Section 4.4 of Comm. Math. Stat. 
paper doi.org/10.1007/s40304-017-0117-6 - -##### `method __init__` - -```python -def __init__self, eqn_config: -``` -##### `method sample` - -```python -def sampleself, num_sample: -``` -##### `method r_u` - -```python -def r_uself, t: float, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor: -``` -##### `method h_z` - -```python -def h_zself, t: float, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor: -``` - -> Function to compute in the PDE. - -Args: -t (float): current time -x (torch.Tensor): tensor of size [batch_size, dim] containing space coordinates -y (torch.Tensor): tensor of size [batch_size, 1] containing function value -z (torch.Tensor): tensor of size [batch_size, dim] containing gradients - -Returns: -torch.Tensor: tensor of size [batch_size, 1] containing H(z) - -##### `method terminal` - -```python -def terminalself, t, x: -``` -##### `method sigma_matrix` - -```python -def sigma_matrixself, x: -``` -##### `method terminal_for_sample` - -```python -def terminal_for_sampleself, x: -``` - -> Terminal condition of the PDE. - -Args: -x (torch.Tensor): tensor of size [num_sample,batch_size, dim] containing space coordinates - -Returns: -torch.Tensor: tensor of size [num_sample ,batch_size, 1] containing terminal values - - -### πŸ“„ `RiskLabAI\pde\model.py` - -#### `class TimeNet` - -> Neural network model for time dimension - -##### `method __init__` - -```python -def __init__self, output_dim: int: -``` - -> Initialize the neural network model with layers - -:param output_dim: The output dimension of the neural network -:type output_dim: int - -##### `method forward` - -```python -def forwardself, x: torch.Tensor: -``` - -> Forward propagation through the network. - -:param x: Input tensor -:type x: torch.Tensor -:return: Output tensor -:rtype: torch.Tensor - -#### `class Net1` - -> A class for defining a neural network with a single linear layer. 
- -##### `method __init__` - -```python -def __init__self, input_dim: int, output_dim: int: -``` - -> Initialize the network with a single linear layer. - -:param input_dim: Number of input features -:type input_dim: int -:param output_dim: Number of output features -:type output_dim: int - -##### `method forward` - -```python -def forwardself, x: torch.Tensor: -``` - -> Forward propagation through the network. - -:param x: Input tensor of shape (batch_size, input_dim) -:type x: torch.Tensor -:return: Output tensor of shape (batch_size, output_dim) -:rtype: torch.Tensor - -#### `class MAB` - -##### `method __init__` - -```python -def __init__self, dim_q: int, dim_k: int, dim_v: int, num_heads: int, ln: bool=False: -``` - -> Multi-Head Self Attention Block. - -:param dim_q: Dimension of query -:param dim_k: Dimension of key -:param dim_v: Dimension of value -:param num_heads: Number of attention heads -:param ln: Whether to use Layer Normalization - -##### `method forward` - -```python -def forwardself, q: torch.Tensor, k: torch.Tensor: -``` - -> Forward propagation. - -:param q: Query tensor -:param k: Key tensor -:return: Output tensor - -#### `class SAB` - -##### `method __init__` - -```python -def __init__self, dim_in: int, dim_out: int, num_heads: int, ln: bool=False: -``` - -> Self Attention Block. - -:param dim_in: Input dimension -:param dim_out: Output dimension -:param num_heads: Number of attention heads -:param ln: Whether to use Layer Normalization - -##### `method forward` - -```python -def forwardself, x: torch.Tensor: -``` - -> Forward propagation. - -:param x: Input tensor -:return: Output tensor - -#### `class ISAB` - -##### `method __init__` - -```python -def __init__self, dim_in: int, dim_out: int, num_heads: int, num_inds: int, ln: bool=False: -``` - -> Induced Self Attention Block. 
- -:param dim_in: Input dimension -:param dim_out: Output dimension -:param num_heads: Number of attention heads -:param num_inds: Number of inducing points -:param ln: Whether to use Layer Normalization - -##### `method forward` - -```python -def forwardself, x: torch.Tensor: -``` - -> Forward propagation. - -:param x: Input tensor -:return: Output tensor - -#### `class PMA` - -##### `method __init__` - -```python -def __init__self, dim: int, num_heads: int, num_seeds: int, ln: bool=False: -``` - -> Pooling Multihead Attention. - -:param dim: Dimension of input and output -:param num_heads: Number of attention heads -:param num_seeds: Number of seed vectors -:param ln: Whether to use Layer Normalization - -##### `method forward` - -```python -def forwardself, x: torch.Tensor: -``` - -> Forward propagation. - -:param x: Input tensor -:return: Output tensor - -#### `class TimeNetForSet` - -> Neural network model for time dimension. - -Args: - in_features (int): The input features dimension. Default is 1. - out_features (int): The output features dimension. Default is 64. - -##### `method __init__` - -```python -def __init__self, in_features: int=1, out_features: int=64: -``` -##### `method forward` - -```python -def forwardself, t: torch.Tensor, x: torch.Tensor: -``` - -> Forward pass of the network. - -Args: - t (torch.Tensor): Input tensor for time dimension. - x (torch.Tensor): Input tensor for features. - -Returns: - torch.Tensor: Output tensor. - -##### `method freeze` - -```python -def freezeself: -``` - -> Freezes the feature parameters. - -#### `class DeepTimeSetTransformer` - -##### `method __init__` - -```python -def __init__self, input_dim: int: -``` -##### `method forward` - -```python -def forwardself, t: torch.Tensor, x: torch.Tensor: -``` - -> Forward pass of the network. - -Args: - t (torch.Tensor): Input tensor for time dimension. - x (torch.Tensor): Input tensor for features. - -Returns: - torch.Tensor: Output tensor. 
- -#### `class FBSNNNetwork` - -##### `method __init__` - -```python -def __init__self, layersize: list[int]: -``` - -> Initializes a neural network with multiple blocks. - -Args: -- indim (int): input dimension -- layersize (List[int]): list of sizes of hidden layers -- outdim (int): output dimension - -##### `method forward` - -```python -def forwardself, x: torch.Tensor: -``` - -> Passes the input through the neural network. - -Args: -- x (torch.Tensor): input tensor - -Returns: -- torch.Tensor: output tensor - -#### `class DeepBSDE` - -##### `method __init__` - -```python -def __init__self, layersize: list[int]: -``` - -> Initializes a neural network with multiple blocks. - -Args: -- indim (int): input dimension -- layersize (List[int]): list of sizes of hidden layers -- outdim (int): output dimension - -##### `method forward` - -```python -def forwardself, x: torch.Tensor: -``` - -> Passes the input through the neural network. - -Args: -- x (torch.Tensor): input tensor - -Returns: -- torch.Tensor: output tensor - -#### `class TimeDependentNetwork` - -##### `method __init__` - -```python -def __init__self, indim: int, layersize: list[int], outdim: int: -``` - -> Initializes a neural network with multiple blocks. - -Args: -- indim (int): input dimension -- layersize (List[int]): list of sizes of hidden layers -- outdim (int): output dimension - -##### `method forward` - -```python -def forwardself, t: torch.Tensor, x: torch.Tensor: -``` - -> Passes the input through the neural network. - -Args: -- t (torch.Tensor): tensor containing time information -- x (torch.Tensor): input tensor - -Returns: -- torch.Tensor: output tensor - -#### `class TimeDependentNetworkMonteCarlo` - -##### `method __init__` - -```python -def __init__self, indim: int, layersize: list[int], outdim: int, sigma: float: -``` - -> Initializes a neural network with multiple blocks. 
- -Args: -- indim (int): input dimension -- layersize (List[int]): list of sizes of hidden layers -- outdim (int): output dimension -- sigma (float) : volatility - -##### `method forward` - -```python -def forwardself, t: torch.Tensor, x: torch.Tensor, y: -``` - -> Passes the input through the neural network. - -Args: -- t (torch.Tensor): tensor containing time information -- x (torch.Tensor): input tensor - -Returns: -- torch.Tensor: output tensor - - -### πŸ“„ `RiskLabAI\pde\solver.py` - -#### `function initialize_weights` - -```python -def initialize_weightsm: nn.Module: -``` - -> Initializes the weights of the given module. - -Args: -- m (nn.Module): the module to initialize weights of - -Returns: -- None - -#### `class FBSDESolver` - -##### `method __init__` - -```python -def __init__self, pde, layer_sizes, learning_rate, solving_method, device: -``` - -> Initializes the FBSDESolver. - -Args: -- pde : the partial differential equation to solve -- layer_sizes (list[int]): list of sizes of hidden layers -- learning_rate (float): learning rate for optimization -- solving_method (str): method to solve the PDE ('Monte-Carlo', 'Deep-Time-SetTransformer', 'Basic') - -##### `method compute_loss` - -```python -def compute_lossself, y, dw, t, init, init_grad: -``` -##### `method solve` - -```python -def solveself, num_iterations, batch_size, init, sample_size=None: -``` - -> Solves the PDE. 
- -Args: -- num_iterations (int): number of iterations for optimization -- batch_size (int): batch size for training -- init (torch.Tensor): initial value -- device (torch.device): device to perform calculations on ('cpu', 'cuda') -- sample_size (int, optional): sample size for Monte-Carlo method - -Returns: -- list[torch.Tensor]: list of losses during optimization -- list[torch.Tensor]: list of initial values during optimization - -#### `class FBSNNolver` - -##### `method __init__` - -```python -def __init__self, pde, layer_sizes, learning_rate, device: -``` - -> Initializes the FBSDESolver. - -Args: -- pde : the partial differential equation to solve -- layer_sizes (list[int]): list of sizes of hidden layers -- learning_rate (float): learning rate for optimization -- solving_method (str): method to solve the PDE ('Monte-Carlo', 'Deep-Time-SetTransformer', 'Basic') - -##### `method compute_loss` - -```python -def compute_lossself, y, dw, t, init: -``` -##### `method solve` - -```python -def solveself, num_iterations, batch_size, init, sample_size=None: -``` - -> Solves the PDE. 
- -Args: -- num_iterations (int): number of iterations for optimization -- batch_size (int): batch size for training -- init (torch.Tensor): initial value -- device (torch.device): device to perform calculations on ('cpu', 'cuda') -- sample_size (int, optional): sample size for Monte-Carlo method - -Returns: -- list[torch.Tensor]: list of losses during optimization -- list[torch.Tensor]: list of initial values during optimization - - -### πŸ“„ `RiskLabAI\utils\constants.py` - - -### πŸ“„ `RiskLabAI\utils\ewma.py` - -#### `function ewma` - -```python -def ewmaarray, window: -``` - -> This function calculate Exponential Weighted Moving Average of array -:param array: input array -:param window: window size -:return: ewma array - - -### πŸ“„ `RiskLabAI\utils\momentum_mean_reverting_strategy_sides.py` - -#### `function determine_strategy_side` - -```python -def determine_strategy_sideprices: pd.Series, fast_window: int=20, slow_window: int=50, exponential: bool=False, mean_reversion: bool=False: -``` - -> Determines the trading side (long or short) based on moving average crossovers and -the nature of the strategy (momentum or mean reversion). - -This function computes the fast and slow moving averages of the provided price series. -The trading side is decided based on the relationship between these averages and -the chosen strategy type (momentum or mean reversion). - -.. math:: - \text{Momentum:} - \begin{cases} - 1 & \text{if } \text{fast\_moving\_average} \geq \text{slow\_moving\_average} \\ - -1 & \text{otherwise} - \end{cases} - - \text{Mean Reversion:} - \begin{cases} - 1 & \text{if } \text{fast\_moving\_average} < \text{slow\_moving\_average} \\ - -1 & \text{otherwise} - \end{cases} - -:param prices: Series containing the prices. -:param fast_window: Window size for the fast moving average. -:param slow_window: Window size for the slow moving average. -:param exponential: If True, compute exponential moving averages. Otherwise, compute simple moving averages. 
-:param mean_reversion: If True, strategy is mean reverting. If False, strategy is momentum-based. -:return: Series containing strategy sides. - - -### πŸ“„ `RiskLabAI\utils\progress.py` - -#### `function progress_bar` - -```python -def progress_barcurrent_progress: int, total_progress: int, start_time: float, bar_length: int=20: -``` - -> Display a terminal-style progress bar with completion percentage and estimated remaining time. - -:param current_progress: Current value indicating the progress made. -:param total_progress: Total value representing the completion of the task. -:param start_time: The time at which the task started, typically acquired via time.time(). -:param bar_length: Length of the progress bar in terminal characters, default is 20. - -The displayed progress bar uses the formula: - -.. math:: - \text{percentage} = \frac{\text{current\_progress}}{\text{total\_progress}} - -The estimated remaining time is calculated based on elapsed time and progress made: - -.. math:: - \text{remaining\_time} = \frac{\text{elapsed\_time} \times (\text{total\_progress} - \text{current\_progress})}{\text{current\_progress}} - -:return: None - - -### πŸ“„ `RiskLabAI\utils\smoothing_average.py` - -#### `function compute_exponential_weighted_moving_average` - -```python -def compute_exponential_weighted_moving_averageinput_series: np.ndarray, window_length: int: -``` - -> Compute the exponential weighted moving average (EWMA) of a time series array. - -The EWMA is calculated using the formula: - -.. math:: - EWMA_t = \\frac{x_t + (1 - \\alpha) x_{t-1} + (1 - \\alpha)^2 x_{t-2} + \\ldots}{\\omega_t} - -where: - -.. math:: - \\omega_t = 1 + (1 - \\alpha) + (1 - \\alpha)^2 + \\ldots + (1 - \\alpha)^t, - \\alpha = \\frac{2}{{window\_length + 1}} - -:param input_series: Input time series array. -:type input_series: np.ndarray -:param window_length: Window length for the exponential weighted moving average. 
-:type window_length: int -:return: An array containing the computed EWMA values. -:rtype: np.ndarray - - -### 📄 `RiskLabAI\utils\update_figure_layout.py` - -#### `function update_figure_layout` - -```python -def update_figure_layoutfig, title, xaxis_title, yaxis_title, legend_x=1, legend_y=1: -``` diff --git a/INSTALLATION.md b/INSTALLATION.md index 1919e6f..6c5bf5f 100644 --- a/INSTALLATION.md +++ b/INSTALLATION.md @@ -1,58 +1,65 @@ -### 1\. Create the New Environment +# Installation & Development Setup -Open your terminal or Anaconda Prompt. This command will create a new, empty environment named `risklab` using a stable Python version (e.g., 3.10). +## Install (users) ```bash -conda create -n risklab python=3.10 -y +pip install RiskLabAI ``` -### 2\. Activate the Environment - -You must activate the environment to install packages into it and use it. +Optional extras pull in heavier dependencies only when you need them: ```bash -conda activate risklab +pip install "RiskLabAI[pde]" # torch — the Deep-BSDE PDE solver +pip install "RiskLabAI[plot]" # matplotlib / seaborn / plotly — plotting helpers +pip install "RiskLabAI[synth]" # quantecon — synthetic-data utilities +pip install "RiskLabAI[all]" # everything above ``` -Your terminal prompt should now change to show `(risklab)` at the beginning. +The base install is intentionally lightweight: `import RiskLabAI` does not pull +in torch or plotting libraries — sub-packages that need them are imported lazily. -### 3\. Install Your Project's Dependencies +## Development setup (contributors) -Navigate to the root directory of your `RiskLabAI.py` project (the one containing `requirements.txt`). This command will read your `requirements.txt` file and install all the necessary packages. +### 1.
Create and activate an environment ```bash -# Navigate to your project folder first -cd /path/to/your/RiskLabAI.py - -# Install all packages from your requirements file -pip install -r requirements.txt +conda create -n risklab python=3.11 -y +conda activate risklab ``` -### 4\. Install Your Library in "Editable" Mode +(Any Python 3.9–3.12 works; a venv is fine too.) -This is a crucial step for development and testing. It links your `RiskLabAI` source code to the environment, which allows your test suite to import your library as if it were officially installed. +### 2. Install in editable mode with all extras and test tooling -From the same root directory (where your `pyproject.toml` is), run: +Dependencies are declared in `pyproject.toml` (there is **no** `requirements.txt`). +From the repository root: ```bash -pip install -e . +pip install -e ".[all]" pytest black ruff ``` -### 5\. Run Your Tests +The editable install (`-e`) links the package to your source tree so the test +suite imports your local code. -Now you are all set. The standard way to run your test suite is by using `pytest`. If `pytest` wasn't included in your `requirements.txt`, you can install it: +> On some setuptools versions the plain editable install does not expose all +> sub-modules. If `import RiskLabAI.backtest.bet_sizing` fails, reinstall with +> the compatibility mode: +> ```bash +> pip install -e . --config-settings editable_mode=compat +> ``` + +### 3. Run the tests ```bash -pip install pytest +pytest -q --ignore=test/pde ``` -Then, simply run the following command from your project's root directory: +`test/pde` is skipped unless you have a working `torch` runtime (install the +`[pde]` extra to include it). -```bash -pytest -``` -If you want to be fast, you can tell `pytest` to ignore that specific directory when you run your tests: +### 4. 
Lint and format before committing ```bash -pytest --ignore=RiskLabAI/pde -``` \ No newline at end of file +black RiskLabAI test +ruff check RiskLabAI test +``` diff --git a/README.md b/README.md index 12c808a..cbaa6a2 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,113 @@ # RiskLabAI.py + [![PyPI version](https://badge.fury.io/py/RiskLabAI.svg)](https://badge.fury.io/py/RiskLabAI) +[![CI](https://github.com/RiskLabAI/RiskLabAI.py/actions/workflows/ci.yml/badge.svg)](https://github.com/RiskLabAI/RiskLabAI.py/actions/workflows/ci.yml) + +A Python library for quantitative finance and financial machine learning, +implementing core methods from Marcos López de Prado's *Advances in Financial +Machine Learning* and *Machine Learning for Asset Managers*. -A Python library for quantitative finance and financial AI, implementing core concepts from Marcos López de Prado's books, "Advances in Financial Machine Learning" and "Machine Learning for Asset Managers." +The library provides implementations for: -This library provides production-ready implementations for: -* Advanced Financial Data Structures (Tick, Volume, Dollar, Imbalance, and Run Bars) -* Fractional Differentiation (FFD) -* The Triple-Barrier Method and Meta-Labeling -* Advanced Cross-Validation (Purged K-Fold, Combinatorial Purged CV) -* Feature Importance (MDI, MDA, SFI) and Clustered Feature Importance -* Portfolio Optimization (HRP, NCO) -* And many more... 
+- **Financial data structures** — tick, volume, dollar, imbalance, and run bars +- **Labeling** — the triple-barrier method, meta-labeling, trend-scanning +- **Fractional differentiation** — standard and fixed-width window (FFD) +- **Sample weights**, **denoising** (Marčenko–Pastur), **distance metrics** +- **Cross-validation** — Purged K-Fold, Combinatorial Purged CV (+ adaptive/bagged), walk-forward +- **Feature importance** — MDI, MDA, SFI, and clustered variants +- **Portfolio optimization** — HRP, NCO, hedging +- **Backtest statistics** — PSR/DSR, PBO, strategy risk +- **Microstructure & entropy features**, **structural breaks**, and a Deep-BSDE PDE solver -## 📦 Installation +There is a companion Julia package, +[RiskLabAI.jl](https://github.com/RiskLabAI/RiskLabAI.jl), which mirrors this +API. -Install the library directly from PyPI: +## Installation ```bash -pip install RiskLabAI \ No newline at end of file +pip install RiskLabAI +``` + +The base install is lightweight. Heavier, optional capabilities are available as +extras: + +| Extra | Installs | Enables | +|---|---|---| +| `RiskLabAI[pde]` | `torch` | the Deep-BSDE PDE solver (`RiskLabAI.pde`) | +| `RiskLabAI[plot]` | `matplotlib`, `seaborn`, `plotly` | plotting helpers | +| `RiskLabAI[synth]` | `quantecon` | synthetic-data utilities | +| `RiskLabAI[all]` | all of the above | everything | + +```bash +pip install "RiskLabAI[all]" +``` + +For development (editable install + tests), see +[`INSTALLATION.md`](INSTALLATION.md). 
+ +## Quickstart + +Sample dollar/volume/tick bars from raw ticks: + +```python +from RiskLabAI.data.structures.standard_bars import StandardBars +from RiskLabAI.utils.constants import CUMULATIVE_DOLLAR + +# ticks: an iterable of (datetime, price, volume) +ticks = [ + ("2020-01-01 10:00:00", 100.0, 10), + ("2020-01-01 10:00:01", 101.0, 5), + ("2020-01-01 10:00:02", 100.0, 20), +] + +bars = StandardBars(bar_type=CUMULATIVE_DOLLAR, threshold=3000) +bar_list = bars.construct_bars_from_data(ticks) +# each bar: [date_time, idx, open, high, low, close, volume, +# buy_volume, sell_volume, ticks, dollar, threshold] +``` + +Discover and construct components by name through the extension registry: + +```python +import RiskLabAI.core as core + +core.list_components() # {family: [available keys]} +cv = core.CROSS_VALIDATORS.create("purgedkfold", n_splits=5, times=event_times) +``` + +## Logging + +RiskLabAI logs under the `"RiskLabAI"` logger and is silent by default. To see +progress and diagnostics, configure logging in your application: + +```python +import logging +logging.basicConfig(level=logging.INFO) +``` + +## Extending the library + +RiskLabAI is built to be extended with new models. The `RiskLabAI.core` layer +provides a component registry and base interfaces so a new bar type, labeler, +cross-validator, etc. can be registered and discovered without editing central +code. See [`EXTENDING.md`](EXTENDING.md) for a step-by-step guide with worked +examples. + +## Contributing + +Contributions are welcome. The project uses `pytest` for tests and +`black` + `ruff` for formatting/linting (run before opening a PR): + +```bash +pip install -e ".[all]" pytest black ruff +pytest -q --ignore=test/pde +black RiskLabAI test +ruff check RiskLabAI test +``` + +Please branch from `main`, keep changes focused, and update `CHANGELOG.md`. + +## License + +See [`LICENSE.txt`](LICENSE.txt). 
diff --git a/RiskLabAI/__init__.py b/RiskLabAI/__init__.py index 4b15392..bb9a0d8 100644 --- a/RiskLabAI/__init__.py +++ b/RiskLabAI/__init__.py @@ -52,19 +52,21 @@ # and stays silent unless the application opts in by configuring logging. _logging.getLogger(__name__).addHandler(_logging.NullHandler()) -_SUBMODULES = frozenset({ - "backtest", - "cluster", - "controller", - "core", - "data", - "ensemble", - "features", - "hpc", - "optimization", - "pde", - "utils", -}) +_SUBMODULES = frozenset( + { + "backtest", + "cluster", + "controller", + "core", + "data", + "ensemble", + "features", + "hpc", + "optimization", + "pde", + "utils", + } +) # Single source of truth for the version is pyproject.toml. try: diff --git a/RiskLabAI/backtest/__init__.py b/RiskLabAI/backtest/__init__.py index 5b01d7b..fc8958d 100644 --- a/RiskLabAI/backtest/__init__.py +++ b/RiskLabAI/backtest/__init__.py @@ -139,4 +139,4 @@ "measure_cpcv_scalability", "get_cpu_info", "format_cpu_info", -] \ No newline at end of file +] diff --git a/RiskLabAI/backtest/backtest_overfitting_simulation.py b/RiskLabAI/backtest/backtest_overfitting_simulation.py index e292b92..2865c2d 100644 --- a/RiskLabAI/backtest/backtest_overfitting_simulation.py +++ b/RiskLabAI/backtest/backtest_overfitting_simulation.py @@ -30,18 +30,26 @@ import subprocess from RiskLabAI.data.differentiation import fractionally_differentiated_log_price -from RiskLabAI.data.labeling import daily_volatility_with_log_returns, cusum_filter_events_dynamic_threshold, vertical_barrier, meta_events, meta_labeling +from RiskLabAI.data.labeling import ( + daily_volatility_with_log_returns, + cusum_filter_events_dynamic_threshold, + vertical_barrier, + meta_events, + meta_labeling, +) from RiskLabAI.data.weights import sample_weight_absolute_return_meta_labeling from RiskLabAI.utils import determine_strategy_side from RiskLabAI.backtest.validation import CrossValidatorController from .probability_of_backtest_overfitting import 
probability_of_backtest_overfitting -from .probabilistic_sharpe_ratio import probabilistic_sharpe_ratio, benchmark_sharpe_ratio +from .probabilistic_sharpe_ratio import ( + probabilistic_sharpe_ratio, + benchmark_sharpe_ratio, +) from .bet_sizing import strategy_bet_sizing + def financial_features_backtest_overfitting_simulation( - prices: pd.Series, - noise_scale: float = 0.0, - random_state: Optional[int] = None + prices: pd.Series, noise_scale: float = 0.0, random_state: Optional[int] = None ) -> pd.DataFrame: """ Create a DataFrame of financial features from the given price series. @@ -55,26 +63,38 @@ def financial_features_backtest_overfitting_simulation( pd.DataFrame: DataFrame containing the computed financial features. """ rng = np.random.default_rng(random_state) - + with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) # Features features = pd.DataFrame() - features['FracDiff'] = fractionally_differentiated_log_price(prices) - features['Volatility'] = daily_volatility_with_log_returns(prices, 100) - features['Z-Score'] = (prices - prices.rolling(20).mean()) / prices.rolling(20).std() + features["FracDiff"] = fractionally_differentiated_log_price(prices) + features["Volatility"] = daily_volatility_with_log_returns(prices, 100) + features["Z-Score"] = (prices - prices.rolling(20).mean()) / prices.rolling( + 20 + ).std() macd_line = np.log(prices.ewm(span=12).mean() / prices.ewm(span=26).mean()) signal_line = macd_line.ewm(span=9).mean() features["Log MACD Histogram"] = macd_line - signal_line - features["ADX"] = ta.trend.ADXIndicator(prices, prices, prices, fillna=True).adx() + features["ADX"] = ta.trend.ADXIndicator( + prices, prices, prices, fillna=True + ).adx() features["RSI"] = ta.momentum.RSIIndicator(prices, fillna=True).rsi() - features["CCI"] = ta.trend.CCIIndicator(prices, prices, prices, fillna=True).cci() - stochastic = ta.momentum.StochasticOscillator(prices, prices, prices, fillna=True) + features["CCI"] = 
ta.trend.CCIIndicator( + prices, prices, prices, fillna=True + ).cci() + stochastic = ta.momentum.StochasticOscillator( + prices, prices, prices, fillna=True + ) features["Stochastic"] = stochastic.stoch() features["ROC"] = ta.momentum.ROCIndicator(prices, fillna=True).roc() - features["ATR"] = ta.volatility.AverageTrueRange(prices, prices, prices, fillna=True).average_true_range() - features["Log DPO"] = np.log(prices.rolling(11).mean() / prices.rolling(20).mean()) - + features["ATR"] = ta.volatility.AverageTrueRange( + prices, prices, prices, fillna=True + ).average_true_range() + features["Log DPO"] = np.log( + prices.rolling(11).mean() / prices.rolling(20).mean() + ) + # 1. MACD Crossovers: features["MACD Position"] = 0 # default to no crossover features.loc[features["Log MACD Histogram"] >= 0, "MACD Position"] = 1 @@ -111,8 +131,12 @@ def financial_features_backtest_overfitting_simulation( features["Kumo Breakout"] = 0 senkou_span_a = ichimoku.ichimoku_a() senkou_span_b = ichimoku.ichimoku_b() - features.loc[(prices > senkou_span_a) & (prices > senkou_span_b), "Kumo Breakout"] = 1 - features.loc[(prices < senkou_span_a) & (prices < senkou_span_b), "Kumo Breakout"] = -1 + features.loc[ + (prices > senkou_span_a) & (prices > senkou_span_b), "Kumo Breakout" + ] = 1 + features.loc[ + (prices < senkou_span_a) & (prices < senkou_span_b), "Kumo Breakout" + ] = -1 # 2. TK Crosses features["TK Position"] = 0 @@ -123,32 +147,45 @@ def financial_features_backtest_overfitting_simulation( # 4. Price Relative to Kumo features["Price Kumo Position"] = 0 - features.loc[(prices > senkou_span_a) & (prices > senkou_span_b), "Price Kumo Position"] = 1 - features.loc[(prices < senkou_span_a) & (prices < senkou_span_b), "Price Kumo Position"] = -1 + features.loc[ + (prices > senkou_span_a) & (prices > senkou_span_b), "Price Kumo Position" + ] = 1 + features.loc[ + (prices < senkou_span_a) & (prices < senkou_span_b), "Price Kumo Position" + ] = -1 # 5. 
Cloud Thickness features["Cloud Thickness"] = np.log(senkou_span_a / senkou_span_b) # 6. Momentum Confirmation features["Momentum Confirmation"] = 0 - features.loc[(tenkan_sen > senkou_span_a) & (prices > senkou_span_a), "Momentum Confirmation"] = 1 - features.loc[(tenkan_sen < senkou_span_a) & (prices < senkou_span_a), "Momentum Confirmation"] = -1 + features.loc[ + (tenkan_sen > senkou_span_a) & (prices > senkou_span_a), + "Momentum Confirmation", + ] = 1 + features.loc[ + (tenkan_sen < senkou_span_a) & (prices < senkou_span_a), + "Momentum Confirmation", + ] = -1 if noise_scale > 0.0: for col in features.columns: - noise = rng.normal(loc=0, scale=noise_scale * features[col].std(), size=features[col].shape) + noise = rng.normal( + loc=0, scale=noise_scale * features[col].std(), size=features[col].shape + ) features[col] += noise - + return features + def backtest_overfitting_simulation_results( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], cross_validators: Dict[str, Any], noise_scale: float = 0.0, random_state: int = None, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, List[Dict[str, Any]]]: """ Conducts a simulation to evaluate the performance of trading strategies and models. 
@@ -170,40 +207,80 @@ def backtest_overfitting_simulation_results( volatility = daily_volatility_with_log_returns(prices, 100) filter_threshold = 1.8 - moelcules = cusum_filter_events_dynamic_threshold(np.log(prices), filter_threshold * volatility) + moelcules = cusum_filter_events_dynamic_threshold( + np.log(prices), filter_threshold * volatility + ) vertical_barriers = vertical_barrier(prices, moelcules, 20) - features = financial_features_backtest_overfitting_simulation(prices, noise_scale=noise_scale, random_state=random_state) + features = financial_features_backtest_overfitting_simulation( + prices, noise_scale=noise_scale, random_state=random_state + ) results = {cv: [] for cv in cross_validators.keys()} # Iterate over each strategy parameter combination - strategy_parameters_keys, strategy_parameters_values = zip(*strategy_parameters.items()) + strategy_parameters_keys, strategy_parameters_values = zip( + *strategy_parameters.items() + ) for strategy_parameters_value in itertools.product(*strategy_parameters_values): strategy_params = dict(zip(strategy_parameters_keys, strategy_parameters_value)) - if (strategy_params['fast_window'] == strategy_parameters['fast_window'][0] and strategy_params['slow_window'] == strategy_parameters['slow_window'][0]) or \ - (strategy_params['fast_window'] == strategy_parameters['fast_window'][1] and strategy_params['slow_window'] == strategy_parameters['slow_window'][1]) or \ - (strategy_params['fast_window'] == strategy_parameters['fast_window'][2] and strategy_params['slow_window'] == strategy_parameters['slow_window'][2]) or \ - (strategy_params['fast_window'] == strategy_parameters['fast_window'][3] and strategy_params['slow_window'] == strategy_parameters['slow_window'][3]): - + if ( + ( + strategy_params["fast_window"] == strategy_parameters["fast_window"][0] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][0] + ) + or ( + strategy_params["fast_window"] == 
strategy_parameters["fast_window"][1] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][1] + ) + or ( + strategy_params["fast_window"] == strategy_parameters["fast_window"][2] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][2] + ) + or ( + strategy_params["fast_window"] == strategy_parameters["fast_window"][3] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][3] + ) + ): + strategy_sides = determine_strategy_side(prices, **strategy_params) else: - continue - - triple_barrier_events = meta_events(prices, moelcules, [0.5, 1.5], volatility, 0, 1, vertical_barriers, strategy_sides) + continue + + triple_barrier_events = meta_events( + prices, + moelcules, + [0.5, 1.5], + volatility, + 0, + 1, + vertical_barriers, + strategy_sides, + ) labels = meta_labeling(triple_barrier_events, prices) - sample_weights = sample_weight_absolute_return_meta_labeling(triple_barrier_events['End Time'], prices, moelcules) + sample_weights = sample_weight_absolute_return_meta_labeling( + triple_barrier_events["End Time"], prices, moelcules + ) - index = features.loc[moelcules].dropna().index.intersection(labels.dropna().index).intersection(sample_weights.dropna().index) + index = ( + features.loc[moelcules] + .dropna() + .index.intersection(labels.dropna().index) + .intersection(sample_weights.dropna().index) + ) data = features.loc[index] - target = labels.loc[index]['Label'] + target = labels.loc[index]["Label"] weights = sample_weights.loc[index] - times = labels.loc[index]['End Time'] + times = labels.loc[index]["End Time"] external_feature = volatility.loc[index] # Iterate over each model and hyperparameter configuration for model_name, model_details in models.items(): - model = model_details['Model'] - param_grid = model_details['Parameters'] + model = model_details["Model"] + param_grid = model_details["Parameters"] # Generate all combinations of hyperparameters model_keys, model_values = 
zip(*param_grid.items()) @@ -212,33 +289,55 @@ def backtest_overfitting_simulation_results( model.set_params(**params) for cross_validator_type, cross_validator in cross_validators.items(): - if 'times' in cross_validator.__dict__: + if "times" in cross_validator.__dict__: cross_validator.times = times - if 'external_feature' in cross_validator.__dict__: - cross_validator.external_feature = external_feature - predictions = cross_validator.backtest_predictions(model, data, target, weights, predict_probability=True, n_jobs=n_jobs) - probabilities = pd.Series(np.vstack(list(map(lambda x: x[:, 1], predictions.values()))).mean(axis=0), times.index).dropna() - positions = strategy_bet_sizing(prices.index, times.loc[probabilities.index], strategy_sides[probabilities.index], probabilities) - strategy_log_returns = (np.log(prices).diff() * positions.shift()).dropna() - results[cross_validator_type].append({ - 'Trial Info.' : { - 'Strategy Parameters' : strategy_params, - 'Model Name': model_name, - 'Model Parameters': params, - }, - 'Returns': strategy_log_returns - }) + if "external_feature" in cross_validator.__dict__: + cross_validator.external_feature = external_feature + predictions = cross_validator.backtest_predictions( + model, + data, + target, + weights, + predict_probability=True, + n_jobs=n_jobs, + ) + probabilities = pd.Series( + np.vstack( + list(map(lambda x: x[:, 1], predictions.values())) + ).mean(axis=0), + times.index, + ).dropna() + positions = strategy_bet_sizing( + prices.index, + times.loc[probabilities.index], + strategy_sides[probabilities.index], + probabilities, + ) + strategy_log_returns = ( + np.log(prices).diff() * positions.shift() + ).dropna() + results[cross_validator_type].append( + { + "Trial Info.": { + "Strategy Parameters": strategy_params, + "Model Name": model_name, + "Model Parameters": params, + }, + "Returns": strategy_log_returns, + } + ) return results + def overall_backtest_overfitting_simulation( - prices: pd.Series, + prices: 
pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, noise_scale: float = 0.0, random_state: int = None, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Tuple[Dict[str, float], Dict[str, float]]: """ Conducts an overall backtest overfitting simulation to calculate the metrics. @@ -257,61 +356,65 @@ def overall_backtest_overfitting_simulation( """ cross_validators = { - 'Walk-Forward' : CrossValidatorController( - 'walkforward', + "Walk-Forward": CrossValidatorController( + "walkforward", n_splits=4, ).cross_validator, - 'K-Fold' : CrossValidatorController( - 'kfold', + "K-Fold": CrossValidatorController( + "kfold", n_splits=4, ).cross_validator, - 'Purged K-Fold' : CrossValidatorController( - 'purgedkfold', - n_splits=4, - times=None, - embargo=0.02 + "Purged K-Fold": CrossValidatorController( + "purgedkfold", n_splits=4, times=None, embargo=0.02 ).cross_validator, - 'Combinatorial Purged' : CrossValidatorController( - 'combinatorialpurged', - n_splits=8, - n_test_groups=2, - times=None, - embargo=0.02 + "Combinatorial Purged": CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=None, embargo=0.02 ).cross_validator, } - results = backtest_overfitting_simulation_results(prices, strategy_parameters, models, cross_validators, noise_scale, random_state, n_jobs=n_jobs) - + results = backtest_overfitting_simulation_results( + prices, + strategy_parameters, + models, + cross_validators, + noise_scale, + random_state, + n_jobs=n_jobs, + ) cv_deflated_sr = {} cv_pbo = {} for cv, trials in results.items(): - performances = pd.concat([trial['Returns'] for trial in trials], axis=1) + performances = pd.concat([trial["Returns"] for trial in trials], axis=1) - sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) + sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) benchmark_sr = benchmark_sharpe_ratio(sharpe_ratios) 
best_strategy_index = sharpe_ratios.idxmax() cv_deflated_sr[cv] = probabilistic_sharpe_ratio( - sharpe_ratios.loc[best_strategy_index], - benchmark_sr, len(performances), - ss.skew(performances[best_strategy_index]), + sharpe_ratios.loc[best_strategy_index], + benchmark_sr, + len(performances), + ss.skew(performances[best_strategy_index]), ss.kurtosis(performances[best_strategy_index]), - return_test_statistic=True + return_test_statistic=True, ) - pbo, logit_values = probability_of_backtest_overfitting(performances.values, risk_free_return=step_risk_free_rate) + pbo, logit_values = probability_of_backtest_overfitting( + performances.values, risk_free_return=step_risk_free_rate + ) cv_pbo[cv] = pbo - return cv_pbo, cv_deflated_sr + return cv_pbo, cv_deflated_sr + def temporal_backtest_overfitting_simulation( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, overfitting_partitions_length: int, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]: """ Conducts a temporal backtest overfitting simulation to calculate the metrics in chunks. @@ -328,65 +431,64 @@ def temporal_backtest_overfitting_simulation( Tuple[Dict[str, List[float]], Dict[str, List[float]]]: A tuple containing two dictionaries, one for the Probability of Backtest Overfitting (PBO) and the other for the Deflated Sharpe Ratio (DSR), for each cross-validation method tested. 
""" cross_validators = { - 'Walk-Forward' : CrossValidatorController( - 'walkforward', + "Walk-Forward": CrossValidatorController( + "walkforward", n_splits=4, ).cross_validator, - 'K-Fold' : CrossValidatorController( - 'kfold', + "K-Fold": CrossValidatorController( + "kfold", n_splits=4, ).cross_validator, - 'Purged K-Fold' : CrossValidatorController( - 'purgedkfold', - n_splits=4, - times=None, - embargo=0.02 + "Purged K-Fold": CrossValidatorController( + "purgedkfold", n_splits=4, times=None, embargo=0.02 ).cross_validator, - 'Combinatorial Purged' : CrossValidatorController( - 'combinatorialpurged', - n_splits=8, - n_test_groups=2, - times=None, - embargo=0.02 + "Combinatorial Purged": CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=None, embargo=0.02 ).cross_validator, } - results = backtest_overfitting_simulation_results(prices, strategy_parameters, models, cross_validators, n_jobs=n_jobs) + results = backtest_overfitting_simulation_results( + prices, strategy_parameters, models, cross_validators, n_jobs=n_jobs + ) cv_deflated_sr = {cv: [] for cv in results.keys()} cv_pbo = {cv: [] for cv in results.keys()} for cv, trials in results.items(): - performances = pd.concat([trial['Returns'] for trial in trials], axis=1) - + performances = pd.concat([trial["Returns"] for trial in trials], axis=1) + # Calculate the number of chunks using ceil n_chunks = ceil(performances.shape[0] / overfitting_partitions_length) for chunk in np.array_split(performances, n_chunks): sharpe_ratios = chunk.apply(lambda y: sharpe_ratio(y.values)) benchmark_sr = benchmark_sharpe_ratio(sharpe_ratios) best_strategy_index = sharpe_ratios.idxmax() - + deflated_sr = probabilistic_sharpe_ratio( - sharpe_ratios.loc[best_strategy_index], - benchmark_sr, len(chunk), - ss.skew(chunk[best_strategy_index]), + sharpe_ratios.loc[best_strategy_index], + benchmark_sr, + len(chunk), + ss.skew(chunk[best_strategy_index]), ss.kurtosis(chunk[best_strategy_index]), - 
return_test_statistic=True + return_test_statistic=True, ) cv_deflated_sr[cv].append(deflated_sr) - - pbo, logit_values = probability_of_backtest_overfitting(chunk.values, risk_free_return=step_risk_free_rate, n_jobs=n_jobs) + + pbo, logit_values = probability_of_backtest_overfitting( + chunk.values, risk_free_return=step_risk_free_rate, n_jobs=n_jobs + ) cv_pbo[cv].append(pbo) return cv_pbo, cv_deflated_sr + def time_temporal_backtest_overfitting_simulation( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, - overfitting_partitions_duration: str = 'A', # Annual grouping by default - n_jobs: int = 1 + overfitting_partitions_duration: str = "A", # Annual grouping by default + n_jobs: int = 1, ) -> Tuple[Dict[str, pd.Series], Dict[str, pd.Series]]: """ Conducts a time-temporal backtest overfitting simulation to calculate the metrics in time-indexed chunks. @@ -403,36 +505,31 @@ def time_temporal_backtest_overfitting_simulation( Tuple[Dict[str, pd.Series], Dict[str, pd.Series]]: A tuple containing two dictionaries, one for the Probability of Backtest Overfitting (PBO) and the other for the Deflated Sharpe Ratio (DSR), for each cross-validation method tested, indexed by time. 
""" cross_validators = { - 'Walk-Forward' : CrossValidatorController( - 'walkforward', + "Walk-Forward": CrossValidatorController( + "walkforward", n_splits=4, ).cross_validator, - 'K-Fold' : CrossValidatorController( - 'kfold', + "K-Fold": CrossValidatorController( + "kfold", n_splits=4, ).cross_validator, - 'Purged K-Fold' : CrossValidatorController( - 'purgedkfold', - n_splits=4, - times=None, - embargo=0.02 + "Purged K-Fold": CrossValidatorController( + "purgedkfold", n_splits=4, times=None, embargo=0.02 ).cross_validator, - 'Combinatorial Purged' : CrossValidatorController( - 'combinatorialpurged', - n_splits=8, - n_test_groups=2, - times=None, - embargo=0.02 + "Combinatorial Purged": CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=None, embargo=0.02 ).cross_validator, } - results = backtest_overfitting_simulation_results(prices, strategy_parameters, models, cross_validators, n_jobs=n_jobs) + results = backtest_overfitting_simulation_results( + prices, strategy_parameters, models, cross_validators, n_jobs=n_jobs + ) cv_deflated_sr = {cv: pd.Series(dtype=float) for cv in results.keys()} cv_pbo = {cv: pd.Series(dtype=float) for cv in results.keys()} for cv, trials in results.items(): - performances = pd.concat([trial['Returns'] for trial in trials], axis=1) + performances = pd.concat([trial["Returns"] for trial in trials], axis=1) # Group by the specified duration and calculate metrics grouped = performances.groupby(pd.Grouper(freq=overfitting_partitions_duration)) @@ -443,25 +540,29 @@ def time_temporal_backtest_overfitting_simulation( deflated_sr = probabilistic_sharpe_ratio( sharpe_ratios.loc[best_strategy_index], - benchmark_sr, len(chunk), + benchmark_sr, + len(chunk), ss.skew(chunk[best_strategy_index]), ss.kurtosis(chunk[best_strategy_index]), - return_test_statistic=True + return_test_statistic=True, ) cv_deflated_sr[cv].at[timestamp] = deflated_sr - pbo, logit_values = 
probability_of_backtest_overfitting(chunk.values, risk_free_return=step_risk_free_rate, n_jobs=1) + pbo, logit_values = probability_of_backtest_overfitting( + chunk.values, risk_free_return=step_risk_free_rate, n_jobs=1 + ) cv_pbo[cv].at[timestamp] = pbo return cv_pbo, cv_deflated_sr + def varying_embargo_backtest_overfitting_simulation( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, embargo_values: List[float], - n_jobs: int = 1 + n_jobs: int = 1, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Conducts a backtest overfitting simulation with varying embargo values to calculate the metrics. @@ -480,62 +581,107 @@ def varying_embargo_backtest_overfitting_simulation( volatility = daily_volatility_with_log_returns(prices, 100) filter_threshold = 1.8 - moelcules = cusum_filter_events_dynamic_threshold(np.log(prices), filter_threshold * volatility) + moelcules = cusum_filter_events_dynamic_threshold( + np.log(prices), filter_threshold * volatility + ) vertical_barriers = vertical_barrier(prices, moelcules, 20) features = financial_features_backtest_overfitting_simulation(prices) - cv_pbo_embargo = pd.DataFrame(index=embargo_values, columns=['Purged K-Fold', 'Combinatorial Purged']) - cv_deflated_sr_embargo = pd.DataFrame(index=embargo_values, columns=['Purged K-Fold', 'Combinatorial Purged']) + cv_pbo_embargo = pd.DataFrame( + index=embargo_values, columns=["Purged K-Fold", "Combinatorial Purged"] + ) + cv_deflated_sr_embargo = pd.DataFrame( + index=embargo_values, columns=["Purged K-Fold", "Combinatorial Purged"] + ) for embargo in embargo_values: results = { - 'Purged K-Fold' : [], - 'Combinatorial Purged' : [], + "Purged K-Fold": [], + "Combinatorial Purged": [], } cross_validators = { - 'Purged K-Fold' : CrossValidatorController( - 'purgedkfold', - n_splits=4, - times=None, - embargo=embargo + "Purged K-Fold": CrossValidatorController( + 
"purgedkfold", n_splits=4, times=None, embargo=embargo ).cross_validator, - 'Combinatorial Purged' : CrossValidatorController( - 'combinatorialpurged', + "Combinatorial Purged": CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=None, - embargo=embargo + embargo=embargo, ).cross_validator, } # Iterate over each strategy parameter combination - strategy_parameters_keys, strategy_parameters_values = zip(*strategy_parameters.items()) + strategy_parameters_keys, strategy_parameters_values = zip( + *strategy_parameters.items() + ) for strategy_parameters_value in itertools.product(*strategy_parameters_values): - strategy_params = dict(zip(strategy_parameters_keys, strategy_parameters_value)) - if (strategy_params['fast_window'] == strategy_parameters['fast_window'][0] and strategy_params['slow_window'] == strategy_parameters['slow_window'][0]) or \ - (strategy_params['fast_window'] == strategy_parameters['fast_window'][1] and strategy_params['slow_window'] == strategy_parameters['slow_window'][1]) or \ - (strategy_params['fast_window'] == strategy_parameters['fast_window'][2] and strategy_params['slow_window'] == strategy_parameters['slow_window'][2]) or \ - (strategy_params['fast_window'] == strategy_parameters['fast_window'][3] and strategy_params['slow_window'] == strategy_parameters['slow_window'][3]): - + strategy_params = dict( + zip(strategy_parameters_keys, strategy_parameters_value) + ) + if ( + ( + strategy_params["fast_window"] + == strategy_parameters["fast_window"][0] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][0] + ) + or ( + strategy_params["fast_window"] + == strategy_parameters["fast_window"][1] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][1] + ) + or ( + strategy_params["fast_window"] + == strategy_parameters["fast_window"][2] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][2] + ) + or ( + strategy_params["fast_window"] + == 
strategy_parameters["fast_window"][3] + and strategy_params["slow_window"] + == strategy_parameters["slow_window"][3] + ) + ): + strategy_sides = determine_strategy_side(prices, **strategy_params) else: - continue + continue - triple_barrier_events = meta_events(prices, moelcules, [0.5, 1.5], volatility, 0, 1, vertical_barriers, strategy_sides) + triple_barrier_events = meta_events( + prices, + moelcules, + [0.5, 1.5], + volatility, + 0, + 1, + vertical_barriers, + strategy_sides, + ) labels = meta_labeling(triple_barrier_events, prices) - sample_weights = sample_weight_absolute_return_meta_labeling(triple_barrier_events['End Time'], prices, moelcules) + sample_weights = sample_weight_absolute_return_meta_labeling( + triple_barrier_events["End Time"], prices, moelcules + ) - index = features.loc[moelcules].dropna().index.intersection(labels.dropna().index).intersection(sample_weights.dropna().index) + index = ( + features.loc[moelcules] + .dropna() + .index.intersection(labels.dropna().index) + .intersection(sample_weights.dropna().index) + ) data = features.loc[index] - target = labels.loc[index]['Label'] + target = labels.loc[index]["Label"] weights = sample_weights.loc[index] - times = labels.loc[index]['End Time'] + times = labels.loc[index]["End Time"] # Iterate over each model and hyperparameter configuration for model_name, model_details in models.items(): - model = model_details['Model'] - param_grid = model_details['Parameters'] + model = model_details["Model"] + param_grid = model_details["Parameters"] # Generate all combinations of hyperparameters model_keys, model_values = zip(*param_grid.items()) @@ -543,40 +689,67 @@ def varying_embargo_backtest_overfitting_simulation( params = dict(zip(model_keys, model_value)) model.set_params(**params) - for cross_validator_type, cross_validator in cross_validators.items(): - if 'times' in cross_validator.__dict__: + for ( + cross_validator_type, + cross_validator, + ) in cross_validators.items(): + if "times" in 
cross_validator.__dict__: cross_validator.times = times - predictions = cross_validator.backtest_predictions(model, data, target, weights, predict_probability=True, n_jobs=n_jobs) - probabilities = pd.Series(np.vstack(list(map(lambda x: x[:, 1], predictions.values()))).mean(axis=0), times.index).dropna() - positions = strategy_bet_sizing(prices.index, times.loc[probabilities.index], strategy_sides[probabilities.index], probabilities) - strategy_log_returns = (np.log(prices).diff() * positions.shift()).dropna() - results[cross_validator_type].append({ - 'Trial Info.' : { - 'Strategy Parameters' : strategy_params, - 'Model Name': model_name, - 'Model Parameters': params, - }, - 'Returns': strategy_log_returns - }) + predictions = cross_validator.backtest_predictions( + model, + data, + target, + weights, + predict_probability=True, + n_jobs=n_jobs, + ) + probabilities = pd.Series( + np.vstack( + list(map(lambda x: x[:, 1], predictions.values())) + ).mean(axis=0), + times.index, + ).dropna() + positions = strategy_bet_sizing( + prices.index, + times.loc[probabilities.index], + strategy_sides[probabilities.index], + probabilities, + ) + strategy_log_returns = ( + np.log(prices).diff() * positions.shift() + ).dropna() + results[cross_validator_type].append( + { + "Trial Info.": { + "Strategy Parameters": strategy_params, + "Model Name": model_name, + "Model Parameters": params, + }, + "Returns": strategy_log_returns, + } + ) cv_deflated_sr = {} cv_pbo = {} for cv, trials in results.items(): - performances = pd.concat([trial['Returns'] for trial in trials], axis=1) + performances = pd.concat([trial["Returns"] for trial in trials], axis=1) - sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) + sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) benchmark_sr = benchmark_sharpe_ratio(sharpe_ratios) best_strategy_index = sharpe_ratios.idxmax() cv_deflated_sr[cv] = probabilistic_sharpe_ratio( - sharpe_ratios.loc[best_strategy_index], - 
benchmark_sr, len(performances), - ss.skew(performances[best_strategy_index]), + sharpe_ratios.loc[best_strategy_index], + benchmark_sr, + len(performances), + ss.skew(performances[best_strategy_index]), ss.kurtosis(performances[best_strategy_index]), - return_test_statistic=True + return_test_statistic=True, ) - pbo, logit_values = probability_of_backtest_overfitting(performances.values, risk_free_return=step_risk_free_rate, n_jobs=1) + pbo, logit_values = probability_of_backtest_overfitting( + performances.values, risk_free_return=step_risk_free_rate, n_jobs=1 + ) cv_pbo[cv] = pbo for cv in results.keys(): @@ -585,6 +758,7 @@ def varying_embargo_backtest_overfitting_simulation( return cv_pbo_embargo, cv_deflated_sr_embargo + def sharpe_ratio(returns, risk_free_rate=0): """Calculate the Sharpe ratio of the given returns.""" std = returns.std() @@ -592,6 +766,7 @@ def sharpe_ratio(returns, risk_free_rate=0): return 0.0 return (returns.mean() - risk_free_rate) / std + def sortino_ratio(returns, risk_free_rate=0): """Calculate the Sortino ratio of the given returns.""" downside_returns = returns[returns < risk_free_rate] @@ -607,18 +782,20 @@ def sortino_ratio(returns, risk_free_rate=0): return expected_return / downside_risk + def expected_shortfall(returns, step_risk_free_rate, confidence_level=0.05): """Calculate the expected shortfall (conditional VaR) of the given returns.""" var = np.percentile(returns, 100 * confidence_level) es = returns[returns <= var].mean() return es + def backtest_overfitting_simulation_financial_metrics_rank_correlation( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, - n_jobs: int = 1 + n_jobs: int = 1, ) -> pd.DataFrame: """ Conducts a backtest overfitting simulation and calculates the rank correlation of financial metrics. 
@@ -635,41 +812,36 @@ def backtest_overfitting_simulation_financial_metrics_rank_correlation( """ # Run the backtest overfitting simulation to get results cross_validators = { - 'Walk-Forward' : CrossValidatorController( - 'walkforward', + "Walk-Forward": CrossValidatorController( + "walkforward", n_splits=4, ).cross_validator, - 'K-Fold' : CrossValidatorController( - 'kfold', + "K-Fold": CrossValidatorController( + "kfold", n_splits=4, ).cross_validator, - 'Purged K-Fold' : CrossValidatorController( - 'purgedkfold', - n_splits=4, - times=None, - embargo=0.02 + "Purged K-Fold": CrossValidatorController( + "purgedkfold", n_splits=4, times=None, embargo=0.02 ).cross_validator, - 'Combinatorial Purged' : CrossValidatorController( - 'combinatorialpurged', - n_splits=8, - n_test_groups=2, - times=None, - embargo=0.02 + "Combinatorial Purged": CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=None, embargo=0.02 ).cross_validator, } - results = backtest_overfitting_simulation_results(prices, strategy_parameters, models, cross_validators, n_jobs=n_jobs) - + results = backtest_overfitting_simulation_results( + prices, strategy_parameters, models, cross_validators, n_jobs=n_jobs + ) + metrics = { - 'Sharpe Ratio': sharpe_ratio, - 'Sortino Ratio': sortino_ratio, - 'Expected Shortfall': expected_shortfall + "Sharpe Ratio": sharpe_ratio, + "Sortino Ratio": sortino_ratio, + "Expected Shortfall": expected_shortfall, } rank_correlations = {cv: {} for cv in results.keys()} for cv, trials in results.items(): - performances = pd.concat([trial['Returns'] for trial in trials], axis=1) + performances = pd.concat([trial["Returns"] for trial in trials], axis=1) # Split the data into two halves midpoint = len(performances) // 2 @@ -677,16 +849,34 @@ def backtest_overfitting_simulation_financial_metrics_rank_correlation( second_half = performances.iloc[midpoint:] # Calculate the metrics for each half - first_half_metrics = {metric_name: 
first_half.apply(lambda x: metric_func(x, step_risk_free_rate), axis=0) for metric_name, metric_func in metrics.items()} - second_half_metrics = {metric_name: second_half.apply(lambda x: metric_func(x, step_risk_free_rate), axis=0) for metric_name, metric_func in metrics.items()} + first_half_metrics = { + metric_name: first_half.apply( + lambda x: metric_func(x, step_risk_free_rate), axis=0 + ) + for metric_name, metric_func in metrics.items() + } + second_half_metrics = { + metric_name: second_half.apply( + lambda x: metric_func(x, step_risk_free_rate), axis=0 + ) + for metric_name, metric_func in metrics.items() + } # Rank the trials in each half - first_half_ranks = {metric_name: metrics_values.rank() for metric_name, metrics_values in first_half_metrics.items()} - second_half_ranks = {metric_name: metrics_values.rank() for metric_name, metrics_values in second_half_metrics.items()} + first_half_ranks = { + metric_name: metrics_values.rank() + for metric_name, metrics_values in first_half_metrics.items() + } + second_half_ranks = { + metric_name: metrics_values.rank() + for metric_name, metrics_values in second_half_metrics.items() + } # Calculate the rank correlation for each metric for metric_name in metrics.keys(): - rank_corr, _ = kendalltau(first_half_ranks[metric_name], second_half_ranks[metric_name]) + rank_corr, _ = kendalltau( + first_half_ranks[metric_name], second_half_ranks[metric_name] + ) rank_correlations[cv][metric_name] = rank_corr # Create the final DataFrame @@ -694,12 +884,13 @@ def backtest_overfitting_simulation_financial_metrics_rank_correlation( return rank_corr_df + def backtest_overfitting_simulation_model_complexity( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Any], step_risk_free_rate: float, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Conducts a backtest overfitting simulation to compare the PBO and DSR 
values of each CV method for simple and complex models. @@ -715,83 +906,129 @@ def backtest_overfitting_simulation_model_complexity( Tuple[pd.DataFrame, pd.DataFrame]: Two DataFrames containing PBO and DSR values for each model and each CV method. """ # Initialize DataFrames to store results - pbo_df = pd.DataFrame(columns=['Combinatorial Purged'], index=models.keys()) - dsr_df = pd.DataFrame(columns=['Combinatorial Purged'], index=models.keys()) + pbo_df = pd.DataFrame(columns=["Combinatorial Purged"], index=models.keys()) + dsr_df = pd.DataFrame(columns=["Combinatorial Purged"], index=models.keys()) # Create features volatility = daily_volatility_with_log_returns(prices, 100) filter_threshold = 1.8 - moelcules = cusum_filter_events_dynamic_threshold(np.log(prices), filter_threshold * volatility) + moelcules = cusum_filter_events_dynamic_threshold( + np.log(prices), filter_threshold * volatility + ) vertical_barriers = vertical_barrier(prices, moelcules, 20) features = financial_features_backtest_overfitting_simulation(prices) for model_name, model in models.items(): results = { - 'Combinatorial Purged' : [], + "Combinatorial Purged": [], } - + # Iterate over each strategy parameter combination - strategy_parameters_keys, strategy_parameters_values = zip(*strategy_parameters.items()) + strategy_parameters_keys, strategy_parameters_values = zip( + *strategy_parameters.items() + ) for strategy_parameters_value in itertools.product(*strategy_parameters_values): - strategy_params = dict(zip(strategy_parameters_keys, strategy_parameters_value)) + strategy_params = dict( + zip(strategy_parameters_keys, strategy_parameters_value) + ) try: strategy_sides = determine_strategy_side(prices, **strategy_params) - except ValueError: + except ValueError: continue - triple_barrier_events = meta_events(prices, moelcules, [0.5, 1.5], volatility, 0, 1, vertical_barriers, strategy_sides) + triple_barrier_events = meta_events( + prices, + moelcules, + [0.5, 1.5], + volatility, + 0, + 
1, + vertical_barriers, + strategy_sides, + ) labels = meta_labeling(triple_barrier_events, prices) - sample_weights = sample_weight_absolute_return_meta_labeling(triple_barrier_events['End Time'], prices, moelcules) + sample_weights = sample_weight_absolute_return_meta_labeling( + triple_barrier_events["End Time"], prices, moelcules + ) - index = features.loc[moelcules].dropna().index.intersection(labels.dropna().index).intersection(sample_weights.dropna().index) + index = ( + features.loc[moelcules] + .dropna() + .index.intersection(labels.dropna().index) + .intersection(sample_weights.dropna().index) + ) data = features.loc[index] - target = labels.loc[index]['Label'] + target = labels.loc[index]["Label"] weights = sample_weights.loc[index] - times = labels.loc[index]['End Time'] + times = labels.loc[index]["End Time"] cross_validators = { - 'Combinatorial Purged' : CrossValidatorController( - 'combinatorialpurged', + "Combinatorial Purged": CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=times, - embargo=0.02 + embargo=0.02, ).cross_validator, } for cross_validator_type, cross_validator in cross_validators.items(): - predictions = cross_validator.backtest_predictions(model, data, target, weights, predict_probability=True, n_jobs=n_jobs) - probabilities = pd.Series(np.vstack(list(map(lambda x: x[:, 1], predictions.values()))).mean(axis=0), times.index).dropna() - positions = strategy_bet_sizing(prices.index, times.loc[probabilities.index], strategy_sides[probabilities.index], probabilities) - strategy_log_returns = (np.log(prices).diff() * positions.shift()).dropna() - results[cross_validator_type].append({ - 'Trial Info.' 
: { - 'Strategy Parameters' : strategy_params, - 'Model Name': model_name, - 'Model Parameters': {}, - }, - 'Returns': strategy_log_returns - }) + predictions = cross_validator.backtest_predictions( + model, + data, + target, + weights, + predict_probability=True, + n_jobs=n_jobs, + ) + probabilities = pd.Series( + np.vstack(list(map(lambda x: x[:, 1], predictions.values()))).mean( + axis=0 + ), + times.index, + ).dropna() + positions = strategy_bet_sizing( + prices.index, + times.loc[probabilities.index], + strategy_sides[probabilities.index], + probabilities, + ) + strategy_log_returns = ( + np.log(prices).diff() * positions.shift() + ).dropna() + results[cross_validator_type].append( + { + "Trial Info.": { + "Strategy Parameters": strategy_params, + "Model Name": model_name, + "Model Parameters": {}, + }, + "Returns": strategy_log_returns, + } + ) cv_deflated_sr = {} cv_pbo = {} for cv, trials in results.items(): - performances = pd.concat([trial['Returns'] for trial in trials], axis=1) + performances = pd.concat([trial["Returns"] for trial in trials], axis=1) - sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) + sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) benchmark_sr = benchmark_sharpe_ratio(sharpe_ratios) best_strategy_index = sharpe_ratios.idxmax() cv_deflated_sr[cv] = probabilistic_sharpe_ratio( - sharpe_ratios.loc[best_strategy_index], - benchmark_sr, len(performances), - ss.skew(performances[best_strategy_index]), + sharpe_ratios.loc[best_strategy_index], + benchmark_sr, + len(performances), + ss.skew(performances[best_strategy_index]), ss.kurtosis(performances[best_strategy_index]), - return_test_statistic=True + return_test_statistic=True, ) - pbo, logit_values = probability_of_backtest_overfitting(performances.values, risk_free_return=step_risk_free_rate, n_jobs=1) + pbo, logit_values = probability_of_backtest_overfitting( + performances.values, risk_free_return=step_risk_free_rate, n_jobs=1 + ) cv_pbo[cv] = 
pbo for cv in results.keys(): @@ -800,14 +1037,15 @@ def backtest_overfitting_simulation_model_complexity( return pbo_df, dsr_df + def noised_backtest_overfitting_simulation( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], step_risk_free_rate: float, noise_scales: List[float], random_state: int = None, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Conducts a noised backtest overfitting simulation to compare the new PBO/DSR values for different noise scales. @@ -825,8 +1063,14 @@ def noised_backtest_overfitting_simulation( Tuple[pd.DataFrame, pd.DataFrame]: Two DataFrames containing PBO and DSR values for each noise scale and each CV method. """ # Initialize DataFrames to store results - pbo_df = pd.DataFrame(columns=['Walk-Forward', 'K-Fold', 'Purged K-Fold', 'Combinatorial Purged'], index=noise_scales) - dsr_df = pd.DataFrame(columns=['Walk-Forward', 'K-Fold', 'Purged K-Fold', 'Combinatorial Purged'], index=noise_scales) + pbo_df = pd.DataFrame( + columns=["Walk-Forward", "K-Fold", "Purged K-Fold", "Combinatorial Purged"], + index=noise_scales, + ) + dsr_df = pd.DataFrame( + columns=["Walk-Forward", "K-Fold", "Purged K-Fold", "Combinatorial Purged"], + index=noise_scales, + ) for noise_scale in noise_scales: # Perform the overall backtest overfitting simulation with the given noise scale @@ -837,7 +1081,7 @@ def noised_backtest_overfitting_simulation( step_risk_free_rate=step_risk_free_rate, noise_scale=noise_scale, random_state=random_state, - n_jobs=n_jobs + n_jobs=n_jobs, ) # Store the results in the DataFrames @@ -847,14 +1091,15 @@ def noised_backtest_overfitting_simulation( return pbo_df, dsr_df + def overall_novel_methods_backtest_overfitting_simulation( - prices: pd.Series, + prices: pd.Series, strategy_parameters: Dict[str, Union[List[int], List[float], List[bool]]], models: Dict[str, Dict[str, Any]], 
step_risk_free_rate: float, noise_scale: float = 0.0, random_state: int = None, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Tuple[Dict[str, float], Dict[str, float]]: """ Conducts an overall backtest overfitting simulation to calculate the metrics for the novel CPCV methods. @@ -873,23 +1118,19 @@ def overall_novel_methods_backtest_overfitting_simulation( """ cross_validators = { - 'Combinatorial Purged' : CrossValidatorController( - 'combinatorialpurged', - n_splits=8, - n_test_groups=2, - times=None, - embargo=0.02 + "Combinatorial Purged": CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=None, embargo=0.02 ).cross_validator, - 'Bagged Combinatorial Purged' : CrossValidatorController( - 'baggedcombinatorialpurged', + "Bagged Combinatorial Purged": CrossValidatorController( + "baggedcombinatorialpurged", n_splits=8, n_test_groups=2, times=None, embargo=0.02, - random_state=random_state + random_state=random_state, ).cross_validator, - 'Adaptive Combinatorial Purged' : CrossValidatorController( - 'adaptivecombinatorialpurged', + "Adaptive Combinatorial Purged": CrossValidatorController( + "adaptivecombinatorialpurged", n_splits=8, n_test_groups=2, times=None, @@ -897,42 +1138,53 @@ def overall_novel_methods_backtest_overfitting_simulation( ).cross_validator, } - results = backtest_overfitting_simulation_results(prices, strategy_parameters, models, cross_validators, noise_scale, random_state, n_jobs=n_jobs) - + results = backtest_overfitting_simulation_results( + prices, + strategy_parameters, + models, + cross_validators, + noise_scale, + random_state, + n_jobs=n_jobs, + ) cv_deflated_sr = {} cv_pbo = {} for cv, trials in results.items(): - performances = pd.concat([trial['Returns'] for trial in trials], axis=1) + performances = pd.concat([trial["Returns"] for trial in trials], axis=1) - sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) + sharpe_ratios = performances.apply(lambda y: sharpe_ratio(y.values)) 
benchmark_sr = benchmark_sharpe_ratio(sharpe_ratios) best_strategy_index = sharpe_ratios.idxmax() cv_deflated_sr[cv] = probabilistic_sharpe_ratio( - sharpe_ratios.loc[best_strategy_index], - benchmark_sr, len(performances), - ss.skew(performances[best_strategy_index]), + sharpe_ratios.loc[best_strategy_index], + benchmark_sr, + len(performances), + ss.skew(performances[best_strategy_index]), ss.kurtosis(performances[best_strategy_index]), - return_test_statistic=True + return_test_statistic=True, ) - pbo, logit_values = probability_of_backtest_overfitting(performances.values, risk_free_return=step_risk_free_rate) + pbo, logit_values = probability_of_backtest_overfitting( + performances.values, risk_free_return=step_risk_free_rate + ) cv_pbo[cv] = pbo - return cv_pbo, cv_deflated_sr + return cv_pbo, cv_deflated_sr + def get_cpu_info(): # Run the lscpu command - result = subprocess.run(['lscpu'], stdout=subprocess.PIPE) + result = subprocess.run(["lscpu"], stdout=subprocess.PIPE) # Decode the output from bytes to string - lscpu_output = result.stdout.decode('utf-8') - + lscpu_output = result.stdout.decode("utf-8") + # Parse the lscpu output cpu_info = {} - for line in lscpu_output.split('\n'): + for line in lscpu_output.split("\n"): if line.strip(): - parts = line.split(':', 1) + parts = line.split(":", 1) if len(parts) == 2: key, value = parts cpu_info[key.strip()] = value.strip() @@ -956,6 +1208,7 @@ def get_cpu_info(): return useful_info + def format_cpu_info(cpu_info): report = ( f"Architecture: {cpu_info['Architecture']}\n" @@ -974,63 +1227,83 @@ def format_cpu_info(cpu_info): ) return report + # Function to generate random data, target, weights, and times -def generate_random_data(n_samples: int, n_features: int) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, pd.Series]: - date_range = pd.date_range(start='1980-01-01', periods=n_samples, freq='1h') - data = pd.DataFrame(np.random.randn(n_samples, n_features), columns=[f'feature_{i}' for i in range(n_features)], 
index=date_range) +def generate_random_data( + n_samples: int, n_features: int +) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, pd.Series]: + date_range = pd.date_range(start="1980-01-01", periods=n_samples, freq="1h") + data = pd.DataFrame( + np.random.randn(n_samples, n_features), + columns=[f"feature_{i}" for i in range(n_features)], + index=date_range, + ) target = pd.Series(np.random.randint(0, 2, n_samples), index=date_range) weights = pd.Series(np.random.rand(n_samples), index=date_range) weights = weights / weights.sum() times = pd.Series(date_range + pd.DateOffset(hours=3), index=date_range) return data, target, weights, times + # Function to measure computational requirements -def measure_computational_requirements(cross_validator, model, data, target, weights, n_jobs: int = 1) -> Dict[str, Any]: +def measure_computational_requirements( + cross_validator, model, data, target, weights, n_jobs: int = 1 +) -> Dict[str, Any]: from memory_profiler import memory_usage # optional dependency: RiskLabAI[dev] start_time = time.time() - mem_usage = memory_usage((cross_validator.backtest_predictions, (model, data, target, weights), {'predict_probability': True, 'n_jobs': n_jobs}), interval=0.1) + mem_usage = memory_usage( + ( + cross_validator.backtest_predictions, + (model, data, target, weights), + {"predict_probability": True, "n_jobs": n_jobs}, + ), + interval=0.1, + ) end_time = time.time() - return { - 'execution_time': end_time - start_time, - 'memory_usage': max(mem_usage) - } + return {"execution_time": end_time - start_time, "memory_usage": max(mem_usage)} + # Main function to measure computational requirements for all CV methods def measure_all_cv_computational_requirements( cross_validators: Dict[str, Any], - n_samples: int = 40 * 252, - n_features: int = 22, + n_samples: int = 40 * 252, + n_features: int = 22, n_jobs: int = 1, - n_repeats: int = 30 + n_repeats: int = 30, ) -> pd.DataFrame: # Generate random data, target, weights, and times data, target, 
weights, times = generate_random_data(n_samples, n_features) # Define the logistic regression model without regularization - model = LogisticRegression(penalty=None, solver='lbfgs', max_iter=10000) - + model = LogisticRegression(penalty=None, solver="lbfgs", max_iter=10000) + # Measure computational requirements for each CV method - results = {cv_name: {'execution_time': [], 'memory_usage': []} for cv_name in cross_validators.keys()} - + results = { + cv_name: {"execution_time": [], "memory_usage": []} + for cv_name in cross_validators.keys() + } + for _ in tqdm(range(n_repeats)): for cv_name, cross_validator in cross_validators.items(): # print(f"Measuring computational requirements for {cv_name} (repeat {_ + 1}/{n_repeats})...") # Update cross-validator with times if required - if 'times' in cross_validator.__dict__: + if "times" in cross_validator.__dict__: cross_validator.times = times - result = measure_computational_requirements(cross_validator, model, data, target, weights, n_jobs=n_jobs) - results[cv_name]['execution_time'].append(result['execution_time']) - results[cv_name]['memory_usage'].append(result['memory_usage']) + result = measure_computational_requirements( + cross_validator, model, data, target, weights, n_jobs=n_jobs + ) + results[cv_name]["execution_time"].append(result["execution_time"]) + results[cv_name]["memory_usage"].append(result["memory_usage"]) # Calculate mean and standard deviation for each CV method results_summary = {} for cv_name, metrics in results.items(): results_summary[cv_name] = { - 'execution_time_mean': np.mean(metrics['execution_time']), - 'execution_time_std': np.std(metrics['execution_time']), - 'memory_usage_mean': np.mean(metrics['memory_usage']), - 'memory_usage_std': np.std(metrics['memory_usage']) + "execution_time_mean": np.mean(metrics["execution_time"]), + "execution_time_std": np.std(metrics["execution_time"]), + "memory_usage_mean": np.mean(metrics["memory_usage"]), + "memory_usage_std": 
np.std(metrics["memory_usage"]), } # Convert results to DataFrame @@ -1038,40 +1311,45 @@ def measure_all_cv_computational_requirements( return results_df + def measure_cpcv_parallelization( - n_samples: int = 40 * 252, - n_features: int = 22, + n_samples: int = 40 * 252, + n_features: int = 22, n_repeats: int = 30, - n_jobs_list: List[int] = range(1, 9) + n_jobs_list: List[int] = range(1, 9), ) -> pd.DataFrame: # Generate random data, target, weights, and times data, target, weights, times = generate_random_data(n_samples, n_features) # Define the logistic regression model without regularization - model = LogisticRegression(penalty=None, solver='lbfgs', max_iter=10000) + model = LogisticRegression(penalty=None, solver="lbfgs", max_iter=10000) # Define the CPCV cross-validator - cpcv_cross_validator = CrossValidatorController('combinatorialpurged', n_splits=8, n_test_groups=2, times=times, embargo=0.02).cross_validator + cpcv_cross_validator = CrossValidatorController( + "combinatorialpurged", n_splits=8, n_test_groups=2, times=times, embargo=0.02 + ).cross_validator # Measure computational requirements with and without parallelization results = {} - + for n_jobs in tqdm(n_jobs_list): - key = f'n_jobs_{n_jobs}' - results[key] = {'execution_time': [], 'memory_usage': []} + key = f"n_jobs_{n_jobs}" + results[key] = {"execution_time": [], "memory_usage": []} for _ in range(n_repeats): - result = measure_computational_requirements(cpcv_cross_validator, model, data, target, weights, n_jobs=n_jobs) - results[key]['execution_time'].append(result['execution_time']) - results[key]['memory_usage'].append(result['memory_usage']) + result = measure_computational_requirements( + cpcv_cross_validator, model, data, target, weights, n_jobs=n_jobs + ) + results[key]["execution_time"].append(result["execution_time"]) + results[key]["memory_usage"].append(result["memory_usage"]) # Calculate mean and standard deviation for each setting results_summary = {} for key, metrics in 
results.items(): results_summary[key] = { - 'execution_time_mean': np.mean(metrics['execution_time']), - 'execution_time_std': np.std(metrics['execution_time']), - 'memory_usage_mean': np.mean(metrics['memory_usage']), - 'memory_usage_std': np.std(metrics['memory_usage']) + "execution_time_mean": np.mean(metrics["execution_time"]), + "execution_time_std": np.std(metrics["execution_time"]), + "memory_usage_mean": np.mean(metrics["memory_usage"]), + "memory_usage_std": np.std(metrics["memory_usage"]), } # Convert results to DataFrame @@ -1079,14 +1357,15 @@ def measure_cpcv_parallelization( return results_df + def measure_cpcv_scalability( sample_sizes: List[int], feature_sizes: List[int], n_repeats: int = 1, - n_jobs: int = 1 + n_jobs: int = 1, ) -> pd.DataFrame: # Define the logistic regression model without regularization - model = LogisticRegression(penalty=None, solver='lbfgs', max_iter=10000) + model = LogisticRegression(penalty=None, solver="lbfgs", max_iter=10000) execution_times = pd.DataFrame(index=sample_sizes, columns=feature_sizes) memory_usages = pd.DataFrame(index=sample_sizes, columns=feature_sizes) @@ -1100,16 +1379,23 @@ def measure_cpcv_scalability( data, target, weights, times = generate_random_data(n_samples, n_features) # Define the CPCV cross-validator - cpcv_cross_validator = CrossValidatorController('combinatorialpurged', n_splits=8, n_test_groups=2, times=times, embargo=0.02).cross_validator + cpcv_cross_validator = CrossValidatorController( + "combinatorialpurged", + n_splits=8, + n_test_groups=2, + times=times, + embargo=0.02, + ).cross_validator # Measure computational requirements - result = measure_computational_requirements(cpcv_cross_validator, model, data, target, weights, n_jobs=n_jobs) - execution_time_list.append(result['execution_time']) - memory_usage_list.append(result['memory_usage']) + result = measure_computational_requirements( + cpcv_cross_validator, model, data, target, weights, n_jobs=n_jobs + ) + 
execution_time_list.append(result["execution_time"]) + memory_usage_list.append(result["memory_usage"]) # Calculate mean execution time and memory usage execution_times.loc[n_samples, n_features] = np.mean(execution_time_list) memory_usages.loc[n_samples, n_features] = np.mean(memory_usage_list) - return execution_times, memory_usages diff --git a/RiskLabAI/backtest/backtest_statistics.py b/RiskLabAI/backtest/backtest_statistics.py index 664550e..6276d92 100644 --- a/RiskLabAI/backtest/backtest_statistics.py +++ b/RiskLabAI/backtest/backtest_statistics.py @@ -14,10 +14,9 @@ import pandas as pd from numba import jit + @jit(nopython=True) -def sharpe_ratio( - returns: np.ndarray, risk_free_rate: float = 0.0 -) -> float: +def sharpe_ratio(returns: np.ndarray, risk_free_rate: float = 0.0) -> float: """ Calculate the Sharpe Ratio (Numba-optimized). @@ -35,12 +34,13 @@ def sharpe_ratio( """ excess_returns = returns - risk_free_rate std_dev = np.std(excess_returns) - + if std_dev == 0.0: return 0.0 - + return np.mean(excess_returns) / std_dev + def bet_timing(target_positions: pd.Series) -> pd.Index: """ Determine the timestamps of bets, defined as when positions @@ -84,6 +84,7 @@ def bet_timing(target_positions: pd.Series) -> pd.Index: return bets + def calculate_holding_period( target_positions: pd.Series, ) -> Tuple[pd.DataFrame, float]: @@ -108,9 +109,9 @@ def calculate_holding_period( time_entry = 0.0 # Average entry time position_diff = target_positions.diff() # Time difference in fractional days - time_diff = ( - target_positions.index - target_positions.index[0] - ) / np.timedelta64(1, "D") + time_diff = (target_positions.index - target_positions.index[0]) / np.timedelta64( + 1, "D" + ) for i in range(1, target_positions.shape[0]): current_pos = target_positions.iloc[i] @@ -120,9 +121,7 @@ def calculate_holding_period( if diff * prev_pos >= 0: # Position increase or flat if current_pos != 0: # Update average entry time - time_entry = ( - time_entry * prev_pos + 
time_diff[i] * diff - ) / current_pos + time_entry = (time_entry * prev_pos + time_diff[i] * diff) / current_pos else: # Position decrease or flip if current_pos * prev_pos < 0: # Position flip # Close old position @@ -142,22 +141,22 @@ def calculate_holding_period( "w": abs(diff), } ) - + if not hold_period_data: - return pd.DataFrame(columns=['dT', 'w']), np.nan - - hold_period_df = pd.DataFrame(hold_period_data).set_index('index') - + return pd.DataFrame(columns=["dT", "w"]), np.nan + + hold_period_df = pd.DataFrame(hold_period_data).set_index("index") + if hold_period_df["w"].sum() > 0: mean_holding_period = ( - (hold_period_df["dT"] * hold_period_df["w"]).sum() - / hold_period_df["w"].sum() - ) + hold_period_df["dT"] * hold_period_df["w"] + ).sum() / hold_period_df["w"].sum() else: mean_holding_period = np.nan return hold_period_df, mean_holding_period + def calculate_hhi(bet_returns: pd.Series) -> float: """ Calculate the Herfindahl-Hirschman Index (HHI) for concentration. @@ -185,13 +184,14 @@ def calculate_hhi(bet_returns: pd.Series) -> float: weights = bet_returns / total_return hhi = (weights**2).sum() - + # Normalize HHI n = bet_returns.shape[0] hhi_normalized = (hhi - 1.0 / n) / (1.0 - 1.0 / n) return hhi_normalized + def calculate_hhi_concentration(returns: pd.Series) -> Tuple[float, float, float]: """ Calculate HHI concentration for positive, negative, and monthly returns. 
@@ -210,13 +210,14 @@ def calculate_hhi_concentration(returns: pd.Series) -> Tuple[float, float, float """ hhi_positive = calculate_hhi(returns[returns >= 0]) hhi_negative = calculate_hhi(returns[returns < 0]) - + # Calculate time concentration (by month) time_concentration = returns.groupby(pd.Grouper(freq="M")).count() hhi_time = calculate_hhi(time_concentration) return hhi_positive, hhi_negative, hhi_time + def compute_drawdowns_time_under_water( pnl_series: pd.Series, dollars: bool = False ) -> Tuple[pd.Series, pd.Series]: @@ -264,7 +265,7 @@ def compute_drawdowns_time_under_water( drawdown_analysis_df = pd.DataFrame(drawdown_analysis_data) drawdown_analysis_df = drawdown_analysis_df.set_index("Start") - + if dollars: drawdown = drawdown_analysis_df["HWM"] - drawdown_analysis_df["Min"] else: @@ -273,11 +274,12 @@ def compute_drawdowns_time_under_water( # Time under water in fractional years time_under_water = ( - drawdown_analysis_df["Stop"] - drawdown_analysis_df.index - ) / np.timedelta64(1, "D") / 365.25 - + (drawdown_analysis_df["Stop"] - drawdown_analysis_df.index) + / np.timedelta64(1, "D") + / 365.25 + ) - drawdown.index.name = 'Datetime' - time_under_water.index.name = 'Datetime' + drawdown.index.name = "Datetime" + time_under_water.index.name = "Datetime" - return drawdown, time_under_water \ No newline at end of file + return drawdown, time_under_water diff --git a/RiskLabAI/backtest/backtest_synthetic_data.py b/RiskLabAI/backtest/backtest_synthetic_data.py index ade3102..455e51c 100644 --- a/RiskLabAI/backtest/backtest_synthetic_data.py +++ b/RiskLabAI/backtest/backtest_synthetic_data.py @@ -9,6 +9,7 @@ import numpy as np + def synthetic_back_testing( forecast: float, half_life: float, @@ -84,13 +85,13 @@ def synthetic_back_testing( ): stop_returns.append(gain) break - + mean_return = np.mean(stop_returns) std_return = np.std(stop_returns) sharpe_ratio = mean_return / std_return if std_return > 0 else 0.0 - + back_test_results.append( (profit_taking, 
stop_loss, mean_return, std_return, sharpe_ratio) ) - return back_test_results \ No newline at end of file + return back_test_results diff --git a/RiskLabAI/backtest/bet_sizing.py b/RiskLabAI/backtest/bet_sizing.py index cfaf69c..9672882 100644 --- a/RiskLabAI/backtest/bet_sizing.py +++ b/RiskLabAI/backtest/bet_sizing.py @@ -14,9 +14,7 @@ from RiskLabAI.hpc import mp_pandas_obj -def probability_bet_size( - probabilities: np.ndarray, sides: np.ndarray -) -> np.ndarray: +def probability_bet_size(probabilities: np.ndarray, sides: np.ndarray) -> np.ndarray: r""" Calculate the bet size based on probabilities and side. @@ -133,9 +131,7 @@ def strategy_bet_sizing( _probabilities = probabilities.loc[common_index] # 1. Calculate individual bet sizes - bet_sizes_arr = probability_bet_size( - _probabilities.to_numpy(), _sides.to_numpy() - ) + bet_sizes_arr = probability_bet_size(_probabilities.to_numpy(), _sides.to_numpy()) # 2. Calculate concurrent average avg_bet_sizes_arr = average_bet_sizes( @@ -151,12 +147,11 @@ def strategy_bet_sizing( # --- The following functions appear to be from de Prado (2018) --- # --- Naming convention (camelCase) is preserved for reference. --- -def avgActiveSignals( - signals: pd.DataFrame, nThreads: int -) -> pd.DataFrame: + +def avgActiveSignals(signals: pd.DataFrame, nThreads: int) -> pd.DataFrame: """ Calculate the average signal among active signals using parallel processing. - + Reference: De Prado, M. (2018) Advances in financial machine learning. Methodology: SNIPPET 10.2 @@ -178,7 +173,7 @@ def avgActiveSignals( timePoints = timePoints.union(signals.index.values) timePoints = list(timePoints) timePoints.sort() - + # 2) call parallel function out = mp_pandas_obj( mpAvgActiveSignals, @@ -189,12 +184,10 @@ def avgActiveSignals( return out -def mpAvgActiveSignals( - signals: pd.DataFrame, molecule: list -) -> pd.Series: +def mpAvgActiveSignals(signals: pd.DataFrame, molecule: list) -> pd.Series: """ Worker function for `avgActiveSignals`. 
- + At time `loc`, average signal among those still active. Signal is active if: a) issued before or at `loc` AND @@ -238,7 +231,9 @@ def mpAvgActiveSignals( end_times = signals["t1"].to_numpy()[finite] end_order = np.argsort(end_times, kind="mergesort") sorted_ends = end_times[end_order] - cum_signal_end = np.concatenate(([0.0], np.cumsum(signal_values[finite][end_order]))) + cum_signal_end = np.concatenate( + ([0.0], np.cumsum(signal_values[finite][end_order])) + ) cum_count_end = np.arange(len(sorted_ends) + 1) started = np.searchsorted(sorted_starts, molecule_array, side="right") @@ -326,11 +321,9 @@ def Signal( signal *= events.loc[signal.index, "side"] # meta-labeling # 2) compute average signal among those concurrently open - signal_df = signal.to_frame("signal").join( - events[["t1"]], how="left" - ) + signal_df = signal.to_frame("signal").join(events[["t1"]], how="left") avg_signal = avgActiveSignals(signal_df, nThreads) - + # 3) discretize signal discretized_signal = discreteSignal(signal=avg_signal, stepSize=stepSize) return discretized_signal @@ -358,9 +351,7 @@ def betSize(w: float, x: float) -> float: return x / np.sqrt(w + x**2) -def TPos( - w: float, f: float, acctualPrice: float, maximumPositionSize: int -) -> int: +def TPos(w: float, f: float, acctualPrice: float, maximumPositionSize: int) -> int: """ Calculate the target position size. @@ -383,9 +374,7 @@ def TPos( int The target position size (integer). """ - return int( - betSize(w, f - acctualPrice) * maximumPositionSize - ) + return int(betSize(w, f - acctualPrice) * maximumPositionSize) def inversePrice(f: float, w: float, m: float) -> float: @@ -410,7 +399,7 @@ def inversePrice(f: float, w: float, m: float) -> float: The implied price for bet size `m`. """ if m == 1.0 or m == -1.0: - return f # Avoid division by zero + return f # Avoid division by zero return f - m * np.sqrt(w / (1 - m**2)) @@ -446,17 +435,15 @@ def limitPrice( The average limit price. 
""" if targetPositionSize == cPosition: - return f # No change - + return f # No change + sgn = np.sign(targetPositionSize - cPosition) lP = 0.0 - + # Average price from current to target position - for i in range( - abs(cPosition + sgn), abs(targetPositionSize + sgn) - ): + for i in range(abs(cPosition + sgn), abs(targetPositionSize + sgn)): lP += inversePrice(f, w, i / float(maximumPositionSize)) - + lP /= abs(targetPositionSize - cPosition) return lP @@ -481,5 +468,5 @@ def getW(x: float, m: float) -> float: The implied 'w' coefficient. """ if m == 0.0 or m == 1.0 or m == -1.0: - return np.inf # w is undefined - return x**2 * ( (1 / m**2) - 1 ) \ No newline at end of file + return np.inf # w is undefined + return x**2 * ((1 / m**2) - 1) diff --git a/RiskLabAI/backtest/probabilistic_sharpe_ratio.py b/RiskLabAI/backtest/probabilistic_sharpe_ratio.py index 08ae0c3..b7a0bed 100644 --- a/RiskLabAI/backtest/probabilistic_sharpe_ratio.py +++ b/RiskLabAI/backtest/probabilistic_sharpe_ratio.py @@ -13,6 +13,7 @@ import numpy as np from scipy import stats as ss + def probabilistic_sharpe_ratio( observed_sharpe_ratio: float, benchmark_sharpe_ratio: float, @@ -141,4 +142,4 @@ def benchmark_sharpe_ratio(sharpe_ratio_estimates: List[float]) -> float: benchmark_value = standard_deviation * (term1 + term2) - return benchmark_value \ No newline at end of file + return benchmark_value diff --git a/RiskLabAI/backtest/probability_of_backtest_overfitting.py b/RiskLabAI/backtest/probability_of_backtest_overfitting.py index b16bbcb..520e8e3 100644 --- a/RiskLabAI/backtest/probability_of_backtest_overfitting.py +++ b/RiskLabAI/backtest/probability_of_backtest_overfitting.py @@ -19,6 +19,7 @@ from .backtest_statistics import sharpe_ratio + def performance_evaluation( train_partition: np.ndarray, test_partition: np.ndarray, @@ -54,15 +55,13 @@ def performance_evaluation( """ # 1. 
Find best strategy on training data evaluate_train = [ - metric(train_partition[:, i], risk_free_return) - for i in range(n_strategies) + metric(train_partition[:, i], risk_free_return) for i in range(n_strategies) ] best_strategy_idx = np.argmax(evaluate_train) # 2. Evaluate all strategies on test data evaluate_test = [ - metric(test_partition[:, i], risk_free_return) - for i in range(n_strategies) + metric(test_partition[:, i], risk_free_return) for i in range(n_strategies) ] # 3. Find rank of the best_strategy in the test set @@ -129,7 +128,7 @@ def probability_of_backtest_overfitting( _, n_strategies = performances.shape partitions = np.array_split(performances, n_partitions) partition_indices = list(range(n_partitions)) - + # Get all combinations of training partition indices partition_combinations_indices = list( combinations(partition_indices, n_partitions // 2) @@ -139,11 +138,7 @@ def probability_of_backtest_overfitting( delayed(performance_evaluation)( np.concatenate([partitions[i] for i in train_indices], axis=0), np.concatenate( - [ - partitions[i] - for i in partition_indices - if i not in train_indices - ], + [partitions[i] for i in partition_indices if i not in train_indices], axis=0, ), n_strategies, @@ -158,4 +153,4 @@ def probability_of_backtest_overfitting( pbo = results_arr[:, 0].mean() logit_values = results_arr[:, 1] - return pbo, logit_values \ No newline at end of file + return pbo, logit_values diff --git a/RiskLabAI/backtest/strategy_risk.py b/RiskLabAI/backtest/strategy_risk.py index d9754f2..5527c83 100644 --- a/RiskLabAI/backtest/strategy_risk.py +++ b/RiskLabAI/backtest/strategy_risk.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + def sharpe_ratio_trials(p: float, n_run: int) -> Tuple[float, float, float]: r""" Simulate binomial trials to estimate mean, std dev, and Sharpe ratio. 
@@ -109,14 +110,13 @@ def implied_precision( """ a = (frequency + target_sharpe_ratio**2) * (profit_taking - stop_loss) ** 2 b = ( - (2 * frequency * stop_loss - target_sharpe_ratio**2 * (profit_taking - stop_loss)) - * (profit_taking - stop_loss) - ) + 2 * frequency * stop_loss - target_sharpe_ratio**2 * (profit_taking - stop_loss) + ) * (profit_taking - stop_loss) c = frequency * stop_loss**2 - + discriminant = b**2 - 4 * a * c if discriminant < 0: - return np.nan # No real solution + return np.nan # No real solution precision = (-b + np.sqrt(discriminant)) / (2.0 * a) return precision @@ -133,7 +133,7 @@ def bin_frequency( .. math:: f = \frac{S^2 (pt - sl)^2 p (1 - p)}{((pt - sl) p - sl)^2} - + Note: The original formula had `((pt - sl) * p + sl)` in the denominator, which corresponds to `sl` being negative (a loss). This implementation assumes `stop_loss` is a positive value, so the @@ -174,7 +174,7 @@ def bin_frequency( * (1 - precision) ) denominator = ((profit_taking - stop_loss) * precision + stop_loss) ** 2 - + if denominator == 0: return np.inf @@ -213,7 +213,7 @@ def binomial_sharpe_ratio( The annualized Sharpe Ratio. 
""" expected_return = (profit_taking * probability) + (stop_loss * (1 - probability)) - + p = probability stdev_return = (profit_taking - stop_loss) * np.sqrt(p * (1 - p)) @@ -222,7 +222,7 @@ def binomial_sharpe_ratio( sr_trade = expected_return / stdev_return sr_annual = sr_trade * np.sqrt(frequency) - + return sr_annual @@ -259,7 +259,7 @@ def mix_gaussians( """ n_obs1 = int(n_obs * probability) n_obs2 = n_obs - n_obs1 - + returns1 = np.random.normal(mu1, sigma1, size=n_obs1) returns2 = np.random.normal(mu2, sigma2, size=n_obs2) @@ -296,11 +296,11 @@ def failure_probability( negative_returns = returns[returns <= 0] if len(positive_returns) == 0 or len(negative_returns) == 0: - return 0.0 # Cannot calculate + return 0.0 # Cannot calculate profit_taking = positive_returns.mean() - stop_loss = negative_returns.mean() # This will be negative - + stop_loss = negative_returns.mean() # This will be negative + # Observed precision observed_precision = positive_returns.shape[0] / float(returns.shape[0]) @@ -308,23 +308,24 @@ def failure_probability( required_precision = implied_precision( abs(stop_loss), profit_taking, frequency, target_sharpe_ratio ) - + if np.isnan(required_precision): - return 1.0 # Cannot achieve target SR + return 1.0 # Cannot achieve target SR # Probability that observed_precision < required_precision # This is a test on a proportion p_var = observed_precision * (1 - observed_precision) if p_var == 0: return 0.0 if observed_precision >= required_precision else 1.0 - - p_std = np.sqrt(p_var / returns.shape[0]) # Std dev of the proportion - + + p_std = np.sqrt(p_var / returns.shape[0]) # Std dev of the proportion + z_score = (observed_precision - required_precision) / p_std - risk = ss.norm.cdf(z_score) # Prob of being <= required_precision + risk = ss.norm.cdf(z_score) # Prob of being <= required_precision return risk + def calculate_strategy_risk( mu1: float, mu2: float, @@ -362,12 +363,8 @@ def calculate_strategy_risk( float Calculated probability 
of strategy failure. """ - returns = mix_gaussians( - mu1, mu2, sigma1, sigma2, probability, n_obs - ) - probability_fail = failure_probability( - returns, frequency, target_sharpe_ratio - ) - + returns = mix_gaussians(mu1, mu2, sigma1, sigma2, probability, n_obs) + probability_fail = failure_probability(returns, frequency, target_sharpe_ratio) + logger.info("Probability that strategy will fail: %.2f%%", probability_fail * 100) - return probability_fail \ No newline at end of file + return probability_fail diff --git a/RiskLabAI/backtest/test_set_overfitting.py b/RiskLabAI/backtest/test_set_overfitting.py index 7a7a063..9bfcb3e 100644 --- a/RiskLabAI/backtest/test_set_overfitting.py +++ b/RiskLabAI/backtest/test_set_overfitting.py @@ -9,6 +9,7 @@ import scipy.stats as ss from scipy.stats import norm + def expected_max_sharpe_ratio( n_trials: int, mean_sharpe_ratio: float, std_sharpe_ratio: float ) -> float: @@ -45,7 +46,7 @@ def expected_max_sharpe_ratio( return 0.0 if n_trials == 1: return mean_sharpe_ratio - + euler_gamma = 0.5772156649 term1 = (1 - euler_gamma) * norm.ppf(1.0 - 1.0 / n_trials) @@ -55,6 +56,7 @@ def expected_max_sharpe_ratio( return expected_max_sr + def generate_max_sharpe_ratios( n_sims: int, n_trials_list: List[int], @@ -85,21 +87,20 @@ def generate_max_sharpe_ratios( for n_trials in n_trials_list: # Generate all simulations for this n_trials - sr_sims = rng.normal( - loc=0.0, scale=1.0, size=(n_sims, n_trials) - ) - + sr_sims = rng.normal(loc=0.0, scale=1.0, size=(n_sims, n_trials)) + # Normalize (z-score) each simulation row - sr_sims = (sr_sims - sr_sims.mean(axis=1, keepdims=True)) / \ - sr_sims.std(axis=1, keepdims=True) - + sr_sims = (sr_sims - sr_sims.mean(axis=1, keepdims=True)) / sr_sims.std( + axis=1, keepdims=True + ) + # Scale by target mean and std sr_sims = mean_sharpe_ratio + sr_sims * std_sharpe_ratio # Get max SR for each simulation max_sr = sr_sims.max(axis=1) - - output_temp = pd.DataFrame({'max_SR': max_sr, 'n_trials': 
n_trials}) + + output_temp = pd.DataFrame({"max_SR": max_sr, "n_trials": n_trials}) output_list.append(output_temp) return pd.concat(output_list, ignore_index=True) @@ -147,7 +148,7 @@ def mean_std_error( expected_sr.index.name = "nTrials" error_list = [] - + # 2. Run n_sims1 experiments for _ in range(int(n_sims1)): # 3. Generate simulated max SRs @@ -162,15 +163,17 @@ def mean_std_error( # 5. Calculate error error = avg_simulated_sr / expected_sr - 1.0 - error_list.append(error.rename('error')) + error_list.append(error.rename("error")) all_errors = pd.concat(error_list, axis=1).T # 6. Compute mean and std of errors - output = pd.DataFrame({ - "meanErr": all_errors.mean(), - "stdErr": all_errors.std(), - }) + output = pd.DataFrame( + { + "meanErr": all_errors.mean(), + "stdErr": all_errors.std(), + } + ) return output @@ -209,12 +212,10 @@ def estimated_sharpe_ratio_z_statistics( float The calculated Z-statistic. """ - denominator = ( - 1 - skew * sharpe_ratio + (kurt - 1) / 4.0 * sharpe_ratio**2 - ) + denominator = 1 - skew * sharpe_ratio + (kurt - 1) / 4.0 * sharpe_ratio**2 if denominator <= 0: return np.nan - + z = (sharpe_ratio - true_sharpe_ratio) * np.sqrt(t - 1) z /= np.sqrt(denominator) @@ -282,20 +283,16 @@ def theta_for_type2_error( float The \(\theta\) parameter. """ - denominator = ( - 1 - skew * sharpe_ratio + (kurt - 1) / 4.0 * sharpe_ratio**2 - ) + denominator = 1 - skew * sharpe_ratio + (kurt - 1) / 4.0 * sharpe_ratio**2 if denominator <= 0: return np.nan - + theta = true_sharpe_ratio * np.sqrt(t - 1) theta /= np.sqrt(denominator) return theta -def strategy_type2_error_probability( - alpha_k: float, k: int, theta: float -) -> float: +def strategy_type2_error_probability(alpha_k: float, k: int, theta: float) -> float: r""" Calculate the Type II error probability (beta) for multiple tests. 
@@ -321,4 +318,4 @@ def strategy_type2_error_probability( """ z_alpha = ss.norm.ppf((1 - alpha_k) ** (1.0 / k)) beta = ss.norm.cdf(z_alpha - theta) - return beta \ No newline at end of file + return beta diff --git a/RiskLabAI/backtest/validation/__init__.py b/RiskLabAI/backtest/validation/__init__.py index 29b7628..c4d28c1 100644 --- a/RiskLabAI/backtest/validation/__init__.py +++ b/RiskLabAI/backtest/validation/__init__.py @@ -24,7 +24,6 @@ __all__ = [ # Interface "CrossValidator", - # Validators "KFold", "PurgedKFold", @@ -32,8 +31,7 @@ "CombinatorialPurged", # <-- Fix "BaggedCombinatorialPurged", # <-- Fix "AdaptiveCombinatorialPurged", # <-- Fix - # Utilities "CrossValidatorFactory", "CrossValidatorController", -] \ No newline at end of file +] diff --git a/RiskLabAI/backtest/validation/adaptive_combinatorial_purged.py b/RiskLabAI/backtest/validation/adaptive_combinatorial_purged.py index 09a93dd..fa6c287 100644 --- a/RiskLabAI/backtest/validation/adaptive_combinatorial_purged.py +++ b/RiskLabAI/backtest/validation/adaptive_combinatorial_purged.py @@ -5,9 +5,7 @@ import warnings from collections import ChainMap -from typing import ( - Any, Dict, Generator, List, Optional, Tuple, Union -) +from typing import Any, Dict, Generator, List, Optional, Tuple, Union from itertools import combinations @@ -22,6 +20,7 @@ # For type hinting sklearn-like estimators Estimator = Any + class AdaptiveCombinatorialPurged(CombinatorialPurged): """ Adaptive Combinatorial Purged Cross-Validation (A-CPCV). @@ -69,29 +68,31 @@ def __init__( external_feature: Union[pd.Series, Dict[str, pd.Series]] = None, lower_quantile: float = 0.25, upper_quantile: float = 0.75, - subtract_border_adjustments: bool = True + subtract_border_adjustments: bool = True, ): """ Initialize the AdaptiveCombinatorialPurged class. 
""" super().__init__(n_splits, n_test_groups, times, embargo) - + if external_feature is None: raise ValueError("external_feature must be provided for A-CPCV") - + self.n_subsplits = n_subsplits self.external_feature = external_feature self.lower_quantile = lower_quantile self.upper_quantile = upper_quantile self.subtract_border_adjustments = subtract_border_adjustments - + # Check for multiple datasets consistency if self.is_multiple_datasets and not isinstance(external_feature, dict): raise ValueError( "If 'times' is a dict, 'external_feature' must also be a dict." ) - if not self.is_multiple_datasets and not isinstance(external_feature, pd.Series): - raise ValueError( + if not self.is_multiple_datasets and not isinstance( + external_feature, pd.Series + ): + raise ValueError( "If 'times' is a Series, 'external_feature' must be a Series." ) @@ -99,23 +100,19 @@ def _validate_input( self, single_times: pd.Series, single_data: pd.DataFrame, - single_external_feature: pd.Series + single_external_feature: pd.Series, ) -> None: """ Validate that data, times, and external feature share the same index. """ # Call parent validation super()._validate_input(single_times, single_data) - + if not single_data.index.equals(single_external_feature.index): - raise ValueError( - "Data and external_feature must have the same index" - ) + raise ValueError("Data and external_feature must have the same index") def _single_adaptive_split_segments( - self, - indices: np.ndarray, - single_external_feature: pd.Series + self, indices: np.ndarray, single_external_feature: pd.Series ) -> List[np.ndarray]: """ Adaptively split data indices based on the external feature. 
@@ -144,8 +141,10 @@ def _single_adaptive_split_segments( subsplits = np.array_split(indices, n_total_subsplits) # Get the start index of each subsplit - subsplit_starts_loc = np.array([split[0] for split in subsplits if len(split) > 0]) - + subsplit_starts_loc = np.array( + [split[0] for split in subsplits if len(split) > 0] + ) + # Get the feature values at the start of each subsplit # We need to map iloc back to the feature series's index subsplit_start_indices = single_external_feature.index[subsplit_starts_loc] @@ -170,7 +169,7 @@ def _single_adaptive_split_segments( # Ensure borders are within valid range adjusted_borders = np.clip(adjusted_borders, 1, len(subsplit_starts_loc) - 1) - + # Get the integer-location split points split_points = subsplit_starts_loc[adjusted_borders] @@ -183,14 +182,14 @@ def _single_adaptive_split_segments( return split_segments def _get_split_segments( - self, + self, single_data: pd.DataFrame, - single_external_feature: Optional[pd.Series] = None + single_external_feature: Optional[pd.Series] = None, ) -> List[np.ndarray]: """Override to use adaptive splitting.""" if single_external_feature is None: raise ValueError("_get_split_segments requires external_feature") - + indices = np.arange(single_data.shape[0]) return self._single_adaptive_split_segments(indices, single_external_feature) @@ -198,24 +197,18 @@ def _single_split( self, single_times: pd.Series, single_data: pd.DataFrame, - single_external_feature: Optional[pd.Series] = None + single_external_feature: Optional[pd.Series] = None, ) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]: """ Split a single dataset into C(n, k) adaptively purged indices. 
""" if single_external_feature is None: - raise ValueError("_single_split requires single_external_feature") - - self._validate_input( - single_times, single_data, single_external_feature - ) + raise ValueError("_single_split requires single_external_feature") - split_segments = self._get_split_segments( - single_data, single_external_feature - ) - combinations_list = list( - combinations(range(self.n_splits), self.n_test_groups) - ) + self._validate_input(single_times, single_data, single_external_feature) + + split_segments = self._get_split_segments(single_data, single_external_feature) + combinations_list = list(combinations(range(self.n_splits), self.n_test_groups)) all_combinatorial_splits = self._combinatorial_splits( combinations_list, split_segments @@ -231,10 +224,10 @@ def split( self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> Union[ Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None] + Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None], ]: """ Split data (or dictionary of data) into adaptively purged indices. 
@@ -256,64 +249,61 @@ def split( yield train_indices, test_indices def _combinations_and_path_locations_and_split_segments( - self, - data: pd.DataFrame, - single_external_feature: Optional[pd.Series] = None - ) -> Tuple[List[Tuple[int, ...]], Dict[int, List[Tuple[int, int]]], List[np.ndarray]]: + self, data: pd.DataFrame, single_external_feature: Optional[pd.Series] = None + ) -> Tuple[ + List[Tuple[int, ...]], Dict[int, List[Tuple[int, int]]], List[np.ndarray] + ]: """Helper to compute all components, now including adaptive splits.""" if single_external_feature is None: - raise ValueError("Method requires single_external_feature") + raise ValueError("Method requires single_external_feature") - combinations_list = list( - combinations(range(self.n_splits), self.n_test_groups) - ) + combinations_list = list(combinations(range(self.n_splits), self.n_test_groups)) locations = self._path_locations(self.n_splits, combinations_list) split_segments = self._get_split_segments(data, single_external_feature) return combinations_list, locations, split_segments def _single_backtest_paths( - self, - single_times: pd.Series, - single_data: pd.DataFrame, - single_external_feature: Optional[pd.Series] = None + self, + single_times: pd.Series, + single_data: pd.DataFrame, + single_external_feature: Optional[pd.Series] = None, ) -> Dict[str, List[Dict[str, np.ndarray]]]: """ Generate all adaptive combinatorial backtest paths. 
""" if single_external_feature is None: - raise ValueError("Method requires single_external_feature") - - self._validate_input( - single_times, single_data, single_external_feature - ) + raise ValueError("Method requires single_external_feature") + + self._validate_input(single_times, single_data, single_external_feature) paths = {} - combinations_list, locations, split_segments = \ + combinations_list, locations, split_segments = ( self._combinations_and_path_locations_and_split_segments( single_data, single_external_feature ) - + ) + all_combinatorial_splits = list( self._combinatorial_splits(combinations_list, split_segments) ) for path_num, locs in locations.items(): path_data = [] - for (group_idx, split_idx) in locs: + for group_idx, split_idx in locs: combinatorial_test_indices = all_combinatorial_splits[split_idx] train_indices = self._get_train_indices( - combinatorial_test_indices, - single_times, - continous_test_times=False + combinatorial_test_indices, single_times, continous_test_times=False ) test_indices_segment = split_segments[group_idx] - path_data.append({ - "Train": train_indices, - "Test": test_indices_segment, - }) - + path_data.append( + { + "Train": train_indices, + "Test": test_indices_segment, + } + ) + paths[f"Path {path_num}"] = path_data return paths @@ -322,7 +312,7 @@ def backtest_paths( data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], ) -> Union[ Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, np.ndarray]]]] + Dict[str, Dict[str, List[Dict[str, np.ndarray]]]], ]: """ Generate adaptive backtest paths for data or a dictionary of data. 
@@ -336,12 +326,10 @@ def backtest_paths( self.times[key], data[key], self.external_feature[key] ) return multiple_paths - + if not isinstance(data, pd.DataFrame): raise ValueError("If 'times' is a Series, 'data' must be a DataFrame.") - return self._single_backtest_paths( - self.times, data, self.external_feature - ) + return self._single_backtest_paths(self.times, data, self.external_feature) def _single_backtest_predictions( self, @@ -350,44 +338,40 @@ def _single_backtest_predictions( single_data: pd.DataFrame, single_labels: pd.Series, single_weights: Optional[np.ndarray] = None, - single_external_feature: Optional[pd.Series] = None, # New arg + single_external_feature: Optional[pd.Series] = None, # New arg predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, np.ndarray]: """ Obtain backtest predictions for all A-CPCV paths. """ if single_external_feature is None: - raise ValueError("Method requires single_external_feature") + raise ValueError("Method requires single_external_feature") - self._validate_input( - single_times, single_data, single_external_feature - ) + self._validate_input(single_times, single_data, single_external_feature) if single_weights is None: single_weights = np.ones(len(single_data)) - combinations_list, locations, split_segments = \ + combinations_list, locations, split_segments = ( self._combinations_and_path_locations_and_split_segments( single_data, single_external_feature ) + ) def train_single_estimator( - estimator_: Estimator, - combinatorial_test_indices: np.ndarray + estimator_: Estimator, combinatorial_test_indices: np.ndarray ) -> Estimator: """Train one estimator for one C(n,k) split.""" train_indices = self._get_train_indices( - combinatorial_test_indices, - single_times, - continous_test_times=False + combinatorial_test_indices, single_times, continous_test_times=False ) X_train = single_data.iloc[train_indices] y_train = single_labels.iloc[train_indices] weights_train = 
single_weights[train_indices] with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) try: estimator_.fit(X_train, y_train, sample_weight=weights_train) except (TypeError, ValueError): @@ -397,21 +381,18 @@ def train_single_estimator( # 1. Train all C(n, k) estimators in parallel combinatorial_trained_estimators = Parallel(n_jobs=n_jobs)( - delayed(train_single_estimator)( - clone(single_estimator), test_indices - ) + delayed(train_single_estimator)(clone(single_estimator), test_indices) for test_indices in self._combinatorial_splits( combinations_list, split_segments ) ) def get_path_data( - path_num: int, - locs: List[Tuple[int, int]] + path_num: int, locs: List[Tuple[int, int]] ) -> Dict[str, np.ndarray]: """Assemble predictions for one path.""" path_predictions = [] - for (group_idx, split_idx) in locs: + for group_idx, split_idx in locs: test_indices_segment = split_segments[group_idx] X_test = single_data.iloc[test_indices_segment] estimator = combinatorial_trained_estimators[split_idx] @@ -422,7 +403,7 @@ def get_path_data( preds = estimator.predict(X_test) path_predictions.append(preds) - + return {f"Path {path_num}": np.concatenate(path_predictions)} # 2. Assemble predictions for all paths @@ -430,7 +411,7 @@ def get_path_data( delayed(get_path_data)(path_num, locs) for path_num, locs in locations.items() ) - + paths_predictions = dict(ChainMap(*reversed(path_results))) return paths_predictions @@ -441,34 +422,40 @@ def backtest_predictions( labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]]: """ Generate adaptive backtest predictions. 
""" if self.is_multiple_datasets: - if not (isinstance(data, dict) and - isinstance(estimator, dict) and - isinstance(labels, dict)): + if not ( + isinstance(data, dict) + and isinstance(estimator, dict) + and isinstance(labels, dict) + ): raise ValueError( "If 'times' is a dict, 'data', 'estimator', and 'labels' " "must also be dicts." ) - + multiple_paths_predictions = {} for key in self.times: s_weights = sample_weights[key] if sample_weights else None multiple_paths_predictions[key] = self._single_backtest_predictions( - estimator[key], self.times[key], data[key], labels[key], - s_weights, self.external_feature[key], - predict_probability, n_jobs + estimator[key], + self.times[key], + data[key], + labels[key], + s_weights, + self.external_feature[key], + predict_probability, + n_jobs, ) return multiple_paths_predictions # Handle single dataset case - if not (isinstance(data, pd.DataFrame) and - isinstance(labels, pd.Series)): - raise ValueError( + if not (isinstance(data, pd.DataFrame) and isinstance(labels, pd.Series)): + raise ValueError( "If 'times' is a Series, 'data' must be a DataFrame " "and 'labels' must be a Series." 
) @@ -481,5 +468,5 @@ def backtest_predictions( sample_weights, self.external_feature, predict_probability, - n_jobs - ) \ No newline at end of file + n_jobs, + ) diff --git a/RiskLabAI/backtest/validation/bagged_combinatorial_purged.py b/RiskLabAI/backtest/validation/bagged_combinatorial_purged.py index 1b8393c..c6b60ce 100644 --- a/RiskLabAI/backtest/validation/bagged_combinatorial_purged.py +++ b/RiskLabAI/backtest/validation/bagged_combinatorial_purged.py @@ -4,9 +4,7 @@ import warnings from collections import ChainMap -from typing import ( - Any, Dict, Optional, Union -) +from typing import Any, Dict, Optional, Union import numpy as np import pandas as pd @@ -22,6 +20,7 @@ # For type hinting sklearn-like estimators Estimator = Any + class BaggedCombinatorialPurged(CombinatorialPurged): """ Bagged Combinatorial Purged Cross-Validation (B-CPCV). @@ -69,7 +68,7 @@ def __init__( max_features: float = 1.0, bootstrap: bool = True, bootstrap_features: bool = False, - random_state: int = None + random_state: int = None, ): """ Initialize the BaggedCombinatorialPurged class. @@ -91,7 +90,7 @@ def _single_backtest_predictions( single_labels: pd.Series, single_weights: Optional[np.ndarray] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, np.ndarray]: """ Obtain backtest predictions for all B-CPCV paths. 
@@ -127,24 +126,22 @@ def _single_backtest_predictions( if single_weights is None: single_weights = np.ones(len(single_data)) - + if predict_probability and not self.classifier: raise ValueError( "Cannot use predict_probability=True when classifier=False" ) - combinations_list, locations, split_segments = \ + combinations_list, locations, split_segments = ( self._combinations_and_path_locations_and_split_segments(single_data) + ) def train_single_bagging_estimator( - base_estimator_: Estimator, - combinatorial_test_indices: np.ndarray + base_estimator_: Estimator, combinatorial_test_indices: np.ndarray ) -> Estimator: """Train one Bagging estimator for one C(n,k) split.""" train_indices = self._get_train_indices( - combinatorial_test_indices, - single_times, - continous_test_times=False + combinatorial_test_indices, single_times, continous_test_times=False ) X_train = single_data.iloc[train_indices] @@ -161,7 +158,7 @@ def train_single_bagging_estimator( bootstrap=self.bootstrap, bootstrap_features=self.bootstrap_features, random_state=self.random_state, - n_jobs=n_jobs # Parallelize bagging itself + n_jobs=n_jobs, # Parallelize bagging itself ) else: bagging_estimator = BaggingRegressor( @@ -172,12 +169,12 @@ def train_single_bagging_estimator( bootstrap=self.bootstrap, bootstrap_features=self.bootstrap_features, random_state=self.random_state, - n_jobs=n_jobs # Parallelize bagging itself + n_jobs=n_jobs, # Parallelize bagging itself ) # --- End Bagging Wrapper --- with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) try: bagging_estimator.fit(X_train, y_train, sample_weight=weights_train) except (TypeError, ValueError): @@ -190,28 +187,26 @@ def train_single_bagging_estimator( # Note: We set n_jobs=1 for the *outer* parallel loop # and let the *inner* bagging estimator use `n_jobs`. # This avoids nested parallelization issues. 
- + # Determine parallelization strategy if n_jobs > 1 or n_jobs == -1: outer_n_jobs = n_jobs - inner_n_jobs = 1 - + inner_n_jobs = 1 + # Update bagging params to use inner_n_jobs if self.classifier: - BaggingClassifier.__init__ = ( - lambda self, **kwargs: - super(BaggingClassifier, self).__init__(**kwargs, n_jobs=inner_n_jobs) - ) + BaggingClassifier.__init__ = lambda self, **kwargs: super( + BaggingClassifier, self + ).__init__(**kwargs, n_jobs=inner_n_jobs) else: - BaggingRegressor.__init__ = ( - lambda self, **kwargs: - super(BaggingRegressor, self).__init__(**kwargs, n_jobs=inner_n_jobs) - ) + BaggingRegressor.__init__ = lambda self, **kwargs: super( + BaggingRegressor, self + ).__init__(**kwargs, n_jobs=inner_n_jobs) else: # Let bagging use all cores if outer loop is serial outer_n_jobs = 1 - inner_n_jobs = -1 # Use all - + inner_n_jobs = -1 # Use all + combinatorial_trained_estimators = Parallel(n_jobs=outer_n_jobs)( delayed(train_single_bagging_estimator)( clone(single_estimator), test_indices @@ -220,20 +215,18 @@ def train_single_bagging_estimator( combinations_list, split_segments ) ) - + # Restore default constructors BaggingClassifier.__init__ = BaggingClassifier.__init__ BaggingRegressor.__init__ = BaggingRegressor.__init__ - # 2. 
Assemble predictions (this is fast, can be serial or parallel) def get_path_data( - path_num: int, - locs: List[Tuple[int, int]] + path_num: int, locs: List[Tuple[int, int]] ) -> Dict[str, np.ndarray]: """Assemble predictions for one path.""" path_predictions = [] - for (group_idx, split_idx) in locs: + for group_idx, split_idx in locs: test_indices_segment = split_segments[group_idx] X_test = single_data.iloc[test_indices_segment] estimator = combinatorial_trained_estimators[split_idx] @@ -244,13 +237,13 @@ def get_path_data( preds = estimator.predict(X_test) path_predictions.append(preds) - + return {f"Path {path_num}": np.concatenate(path_predictions)} path_results = Parallel(n_jobs=outer_n_jobs)( delayed(get_path_data)(path_num, locs) for path_num, locs in locations.items() ) - + paths_predictions = dict(ChainMap(*reversed(path_results))) - return paths_predictions \ No newline at end of file + return paths_predictions diff --git a/RiskLabAI/backtest/validation/combinatorial_purged.py b/RiskLabAI/backtest/validation/combinatorial_purged.py index ee42d42..3efceff 100644 --- a/RiskLabAI/backtest/validation/combinatorial_purged.py +++ b/RiskLabAI/backtest/validation/combinatorial_purged.py @@ -8,9 +8,7 @@ from copy import deepcopy from itertools import combinations from math import comb -from typing import ( - Any, Dict, Generator, List, Optional, Tuple, Union -) +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -22,6 +20,7 @@ # For type hinting sklearn-like estimators Estimator = Any + class CombinatorialPurged(PurgedKFold): """ Combinatorial Purged Cross-Validation (CPCV). @@ -50,8 +49,7 @@ class CombinatorialPurged(PurgedKFold): @staticmethod def _path_locations( - n_splits: int, - combinations_list: List[Tuple[int, ...]] + n_splits: int, combinations_list: List[Tuple[int, ...]] ) -> Dict[int, List[Tuple[int, int]]]: """ Generate a labeled path matrix to map splits to backtest paths. 
@@ -70,7 +68,7 @@ def _path_locations( coordinates (group, split_num) in the path matrix. """ n_combinations = len(combinations_list) - + # Initialize a zero matrix matrix = np.zeros((n_splits, n_combinations), dtype=int) @@ -103,8 +101,7 @@ def label_path_row(row: np.ndarray) -> np.ndarray: @staticmethod def _combinatorial_splits( - combinations_list: List[Tuple[int, ...]], - split_segments: List[np.ndarray] + combinations_list: List[Tuple[int, ...]], split_segments: List[np.ndarray] ) -> Generator[np.ndarray, None, None]: """ Generate combinatorial test sets. @@ -124,8 +121,7 @@ def _combinatorial_splits( """ for test_groups in combinations_list: test_sets = [ - split for i, split in enumerate(split_segments) - if i in test_groups + split for i, split in enumerate(split_segments) if i in test_groups ] yield np.concatenate(test_sets) @@ -134,7 +130,7 @@ def __init__( n_splits: int, n_test_groups: int, times: Union[pd.Series, Dict[str, pd.Series]], - embargo: float = 0 + embargo: float = 0, ) -> None: """ Initialize the CombinatorialPurged class. @@ -152,16 +148,14 @@ def __init__( """ super().__init__(n_splits, times, embargo) if n_test_groups >= n_splits: - raise ValueError( - "n_test_groups must be strictly less than n_splits" - ) + raise ValueError("n_test_groups must be strictly less than n_splits") self.n_test_groups = n_test_groups def get_n_splits( self, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]] = None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> int: """ Return the total number of combinatorial splits. 
@@ -213,9 +207,7 @@ def _single_split( self._validate_input(single_times, single_data) split_segments = self._get_split_segments(single_data) - combinations_list = list( - combinations(range(self.n_splits), self.n_test_groups) - ) + combinations_list = list(combinations(range(self.n_splits), self.n_test_groups)) all_combinatorial_splits = self._combinatorial_splits( combinations_list, split_segments @@ -229,9 +221,10 @@ def _single_split( yield train_indices, test_indices def _combinations_and_path_locations_and_split_segments( - self, - data: pd.DataFrame - ) -> Tuple[List[Tuple[int, ...]], Dict[int, List[Tuple[int, int]]], List[np.ndarray]]: + self, data: pd.DataFrame + ) -> Tuple[ + List[Tuple[int, ...]], Dict[int, List[Tuple[int, int]]], List[np.ndarray] + ]: """ Helper to compute all necessary components for CPCV. @@ -247,18 +240,16 @@ def _combinations_and_path_locations_and_split_segments( - locations: The path location dictionary from `_path_locations`. - split_segments: List of index arrays for each of the `n_splits` groups. """ - combinations_list = list( - combinations(range(self.n_splits), self.n_test_groups) - ) + combinations_list = list(combinations(range(self.n_splits), self.n_test_groups)) locations = self._path_locations(self.n_splits, combinations_list) split_segments = self._get_split_segments(data) return combinations_list, locations, split_segments def _single_backtest_paths( - self, - single_times: pd.Series, - single_data: pd.DataFrame, + self, + single_times: pd.Series, + single_data: pd.DataFrame, ) -> Dict[str, List[Dict[str, np.ndarray]]]: """ Generate all combinatorial backtest paths for a single dataset. 
@@ -279,34 +270,35 @@ def _single_backtest_paths( self._validate_input(single_times, single_data) paths = {} - combinations_list, locations, split_segments = \ + combinations_list, locations, split_segments = ( self._combinations_and_path_locations_and_split_segments(single_data) - + ) + all_combinatorial_splits = list( self._combinatorial_splits(combinations_list, split_segments) ) for path_num, locs in locations.items(): path_data = [] - for (group_idx, split_idx) in locs: + for group_idx, split_idx in locs: # Get the full test set for this *combination* combinatorial_test_indices = all_combinatorial_splits[split_idx] - + # Get the train set purged against this *combination* train_indices = self._get_train_indices( - combinatorial_test_indices, - single_times, - continous_test_times=False + combinatorial_test_indices, single_times, continous_test_times=False ) - + # The test set for this *path segment* is just one group test_indices_segment = split_segments[group_idx] - path_data.append({ - "Train": train_indices, - "Test": test_indices_segment, - }) - + path_data.append( + { + "Train": train_indices, + "Test": test_indices_segment, + } + ) + # This path is complete paths[f"Path {path_num}"] = path_data @@ -320,7 +312,7 @@ def _single_backtest_predictions( single_labels: pd.Series, single_weights: Optional[np.ndarray] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, np.ndarray]: """ Obtain backtest predictions for all CPCV paths. 
@@ -353,18 +345,16 @@ def _single_backtest_predictions( if single_weights is None: single_weights = np.ones(len(single_data)) - combinations_list, locations, split_segments = \ + combinations_list, locations, split_segments = ( self._combinations_and_path_locations_and_split_segments(single_data) + ) def train_single_estimator( - estimator_: Estimator, - combinatorial_test_indices: np.ndarray + estimator_: Estimator, combinatorial_test_indices: np.ndarray ) -> Estimator: """Train one estimator for one C(n,k) split.""" train_indices = self._get_train_indices( - combinatorial_test_indices, - single_times, - continous_test_times=False + combinatorial_test_indices, single_times, continous_test_times=False ) X_train = single_data.iloc[train_indices] @@ -372,7 +362,7 @@ def train_single_estimator( weights_train = single_weights[train_indices] with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) try: estimator_.fit(X_train, y_train, sample_weight=weights_train) except TypeError: @@ -382,26 +372,23 @@ def train_single_estimator( # 1. 
Train all C(n, k) estimators in parallel combinatorial_trained_estimators = Parallel(n_jobs=n_jobs)( - delayed(train_single_estimator)( - deepcopy(single_estimator), test_indices - ) + delayed(train_single_estimator)(deepcopy(single_estimator), test_indices) for test_indices in self._combinatorial_splits( combinations_list, split_segments ) ) def get_path_data( - path_num: int, - locs: List[Tuple[int, int]] + path_num: int, locs: List[Tuple[int, int]] ) -> Dict[str, np.ndarray]: """Assemble predictions for one path.""" path_predictions = [] - for (group_idx, split_idx) in locs: + for group_idx, split_idx in locs: # Get the test segment for this path test_indices_segment = split_segments[group_idx] X_test = single_data.iloc[test_indices_segment] - + # Get the estimator trained for this split estimator = combinatorial_trained_estimators[split_idx] @@ -411,7 +398,7 @@ def get_path_data( preds = estimator.predict(X_test) path_predictions.append(preds) - + return {f"Path {path_num}": np.concatenate(path_predictions)} # 2. Assemble predictions for all paths in parallel @@ -419,7 +406,7 @@ def get_path_data( delayed(get_path_data)(path_num, locs) for path_num, locs in locations.items() ) - + # Combine list of dicts into one dict paths_predictions = dict(ChainMap(*reversed(path_results))) - return paths_predictions \ No newline at end of file + return paths_predictions diff --git a/RiskLabAI/backtest/validation/cross_validator_controller.py b/RiskLabAI/backtest/validation/cross_validator_controller.py index d889d1d..08d5669 100644 --- a/RiskLabAI/backtest/validation/cross_validator_controller.py +++ b/RiskLabAI/backtest/validation/cross_validator_controller.py @@ -7,6 +7,7 @@ from .cross_validator_factory import CrossValidatorFactory from .cross_validator_interface import CrossValidator + class CrossValidatorController: """ Controller class to handle the cross-validation process. 
@@ -15,11 +16,7 @@ class CrossValidatorController: creation and access to a specific cross-validator using the factory. """ - def __init__( - self, - validator_type: str, - **kwargs: Any - ): + def __init__(self, validator_type: str, **kwargs: Any): """ Initializes the CrossValidatorController. @@ -32,11 +29,9 @@ def __init__( Additional keyword arguments to be passed to the cross-validator's constructor. """ - self.cross_validator: CrossValidator = \ - CrossValidatorFactory.create_cross_validator( - validator_type, - **kwargs - ) + self.cross_validator: CrossValidator = ( + CrossValidatorFactory.create_cross_validator(validator_type, **kwargs) + ) def get_validator(self) -> CrossValidator: """ @@ -47,4 +42,4 @@ def get_validator(self) -> CrossValidator: CrossValidator The underlying cross-validator instance. """ - return self.cross_validator \ No newline at end of file + return self.cross_validator diff --git a/RiskLabAI/backtest/validation/cross_validator_factory.py b/RiskLabAI/backtest/validation/cross_validator_factory.py index 6e2adb1..c2ab4f8 100644 --- a/RiskLabAI/backtest/validation/cross_validator_factory.py +++ b/RiskLabAI/backtest/validation/cross_validator_factory.py @@ -12,28 +12,26 @@ from .purged_kfold import PurgedKFold from .walk_forward import WalkForward + class CrossValidatorFactory: """ Factory class for creating cross-validator objects. - + This class uses a static method to encapsulate the logic for instantiating different cross-validator strategies. 
""" VALIDATORS = { - 'kfold': KFold, - 'walkforward': WalkForward, - 'purgedkfold': PurgedKFold, - 'combinatorialpurged': CombinatorialPurged, - 'baggedcombinatorialpurged': BaggedCombinatorialPurged, - 'adaptivecombinatorialpurged': AdaptiveCombinatorialPurged, + "kfold": KFold, + "walkforward": WalkForward, + "purgedkfold": PurgedKFold, + "combinatorialpurged": CombinatorialPurged, + "baggedcombinatorialpurged": BaggedCombinatorialPurged, + "adaptivecombinatorialpurged": AdaptiveCombinatorialPurged, } @staticmethod - def create_cross_validator( - validator_type: str, - **kwargs: Any - ) -> CrossValidator: + def create_cross_validator(validator_type: str, **kwargs: Any) -> CrossValidator: """ Factory method to create and return an instance of a cross-validator. @@ -62,12 +60,10 @@ def create_cross_validator( if validator_class: sig = inspect.signature(validator_class.__init__) - valid_kwargs = { - k: v for k, v in kwargs.items() if k in sig.parameters - } + valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters} return validator_class(**valid_kwargs) raise ValueError( f"Invalid validator_type: {validator_type}. " f"Valid types are: {list(CrossValidatorFactory.VALIDATORS.keys())}" - ) \ No newline at end of file + ) diff --git a/RiskLabAI/backtest/validation/cross_validator_interface.py b/RiskLabAI/backtest/validation/cross_validator_interface.py index d289907..65c10b3 100644 --- a/RiskLabAI/backtest/validation/cross_validator_interface.py +++ b/RiskLabAI/backtest/validation/cross_validator_interface.py @@ -4,9 +4,7 @@ """ from abc import ABC, abstractmethod -from typing import ( - Any, Dict, Generator, List, Optional, Tuple, Union -) +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -14,6 +12,7 @@ # For type hinting sklearn-like estimators Estimator = Any + class CrossValidator(ABC): """ Abstract Base Class (ABC) for cross-validation strategies. 
@@ -29,7 +28,7 @@ def get_n_splits( self, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]] = None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> int: """ Return the total number of splits. @@ -78,10 +77,10 @@ def split( self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> Union[ Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None] + Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None], ]: """ Split data (or dictionary of data) into train-test indices. @@ -109,8 +108,7 @@ def split( @abstractmethod def _single_backtest_paths( - self, - single_data: pd.DataFrame + self, single_data: pd.DataFrame ) -> Dict[str, List[Dict[str, np.ndarray]]]: """ Generate backtest paths for a single dataset. @@ -139,7 +137,7 @@ def backtest_paths( data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], ) -> Union[ Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, np.ndarray]]]] + Dict[str, Dict[str, List[Dict[str, np.ndarray]]]], ]: """ Generate backtest paths for data or a dictionary of data. @@ -167,7 +165,7 @@ def _single_backtest_predictions( single_labels: pd.Series, single_weights: Optional[np.ndarray] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, np.ndarray]: """ Obtain backtest predictions for a single dataset. 
@@ -203,7 +201,7 @@ def backtest_predictions( labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]]: """ Generate backtest predictions for single or multiple datasets. @@ -233,4 +231,4 @@ def backtest_predictions( key is the dataset key, and the value is its dictionary of predictions. """ - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/RiskLabAI/backtest/validation/kfold.py b/RiskLabAI/backtest/validation/kfold.py index 0f665df..22f20f8 100644 --- a/RiskLabAI/backtest/validation/kfold.py +++ b/RiskLabAI/backtest/validation/kfold.py @@ -4,9 +4,7 @@ import warnings from copy import deepcopy -from typing import ( - Any, Dict, Generator, List, Optional, Tuple, Union -) +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -18,6 +16,7 @@ # For type hinting sklearn-like estimators Estimator = Any + class KFold(CrossValidator): """ K-Fold cross-validator. @@ -31,10 +30,7 @@ class KFold(CrossValidator): """ def __init__( - self, - n_splits: int, - shuffle: bool = False, - random_seed: int = None + self, n_splits: int, shuffle: bool = False, random_seed: int = None ) -> None: """ Initialize the K-Fold cross-validator. @@ -58,7 +54,7 @@ def get_n_splits( self, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]] = None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> int: """ Return the number of splits. 
@@ -113,10 +109,10 @@ def split( self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> Union[ Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None] + Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None], ]: """ Split data (or dictionary of data) into train-test indices. @@ -145,8 +141,7 @@ def split( yield train_indices, test_indices def _single_backtest_paths( - self, - single_data: pd.DataFrame + self, single_data: pd.DataFrame ) -> Dict[str, List[Dict[str, np.ndarray]]]: """ Generate backtest paths for a single dataset. @@ -167,12 +162,14 @@ def _single_backtest_paths( paths = {} for train_indices, test_indices in self._single_split(single_data): - path_data.append({ - "Train": train_indices, - "Test": test_indices, - }) + path_data.append( + { + "Train": train_indices, + "Test": test_indices, + } + ) - paths['Path 1'] = path_data + paths["Path 1"] = path_data return paths def backtest_paths( @@ -180,7 +177,7 @@ def backtest_paths( data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], ) -> Union[ Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, np.ndarray]]]] + Dict[str, Dict[str, List[Dict[str, np.ndarray]]]], ]: """ Generate backtest paths for data or a dictionary of data. @@ -203,7 +200,7 @@ def backtest_paths( for key in data: multiple_paths[key] = self._single_backtest_paths(data[key]) return multiple_paths - + return self._single_backtest_paths(data) def _single_backtest_predictions( @@ -213,7 +210,7 @@ def _single_backtest_predictions( single_labels: pd.Series, single_weights: Optional[np.ndarray] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, np.ndarray]: """ Obtain backtest predictions for a single dataset. 
@@ -244,9 +241,7 @@ def _single_backtest_predictions( single_weights = np.ones(len(single_data)) def train_test_single_estimator( - estimator_: Estimator, - train_indices: np.ndarray, - test_indices: np.ndarray + estimator_: Estimator, train_indices: np.ndarray, test_indices: np.ndarray ) -> Tuple[np.ndarray, np.ndarray]: """Train model and return (predictions, test_indices).""" X_train = single_data.iloc[train_indices] @@ -254,7 +249,7 @@ def train_test_single_estimator( weights_train = single_weights[train_indices] with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) try: estimator_.fit(X_train, y_train, sample_weight=weights_train) except TypeError: @@ -274,7 +269,8 @@ def train_test_single_estimator( results = Parallel(n_jobs=n_jobs)( delayed(train_test_single_estimator)( deepcopy(single_estimator), train_indices, test_indices - ) for train_indices, test_indices in self._single_split(single_data) + ) + for train_indices, test_indices in self._single_split(single_data) ) # Unpack predictions and their corresponding original indices @@ -291,7 +287,7 @@ def train_test_single_estimator( reorder_indices = np.argsort(original_indices) path_data = path_data[reorder_indices] - paths_predictions = {'Path 1': path_data} + paths_predictions = {"Path 1": path_data} return paths_predictions def backtest_predictions( @@ -301,7 +297,7 @@ def backtest_predictions( labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]]: """ Generate backtest predictions for single or multiple datasets. @@ -328,14 +324,16 @@ def backtest_predictions( - If data is a dict: A nested dictionary of predictions. 
""" if isinstance(data, dict): - if not isinstance(estimator, dict) or \ - not isinstance(labels, dict) or \ - (sample_weights and not isinstance(sample_weights, dict)): + if ( + not isinstance(estimator, dict) + or not isinstance(labels, dict) + or (sample_weights and not isinstance(sample_weights, dict)) + ): raise ValueError( "If data is a dict, estimator, labels, " "and sample_weights (if provided) must also be dicts." ) - + multiple_paths_predictions = {} for key in data: multiple_paths_predictions[key] = self._single_backtest_predictions( @@ -344,23 +342,17 @@ def backtest_predictions( labels[key], sample_weights[key] if sample_weights else None, predict_probability, - n_jobs + n_jobs, ) return multiple_paths_predictions # Handle single dataset case - if not (isinstance(data, pd.DataFrame) and - isinstance(labels, pd.Series)): + if not (isinstance(data, pd.DataFrame) and isinstance(labels, pd.Series)): raise ValueError( "If data is a DataFrame, estimator must be a single estimator " "and labels must be a single Series." ) return self._single_backtest_predictions( - estimator, - data, - labels, - sample_weights, - predict_probability, - n_jobs - ) \ No newline at end of file + estimator, data, labels, sample_weights, predict_probability, n_jobs + ) diff --git a/RiskLabAI/backtest/validation/purged_kfold.py b/RiskLabAI/backtest/validation/purged_kfold.py index c0a8f9e..c43da57 100644 --- a/RiskLabAI/backtest/validation/purged_kfold.py +++ b/RiskLabAI/backtest/validation/purged_kfold.py @@ -5,9 +5,7 @@ import warnings from copy import deepcopy -from typing import ( - Any, Dict, Generator, List, Optional, Set, Tuple, Union -) +from typing import Any, Dict, Generator, List, Optional, Set, Tuple, Union import numpy as np import pandas as pd @@ -19,6 +17,7 @@ # For type hinting sklearn-like estimators Estimator = Any + class PurgedKFold(CrossValidator): """ Purged K-Fold cross-validator. 
@@ -87,7 +86,7 @@ def filtered_training_indices_with_embargo( """ indices_to_drop: Set[int] = set() embargo_length = int(len(data_info_range) * embargo_fraction) - + if test_time_range.empty: return data_info_range @@ -95,23 +94,25 @@ def filtered_training_indices_with_embargo( if not continous_test_times: # Create blocks of contiguous test ranges - test_ranges = pd.DataFrame({ - 'start': sorted_test_time_range.index, - 'end': sorted_test_time_range.values - }) - gaps = test_ranges['start'] > test_ranges['end'].shift(1) + test_ranges = pd.DataFrame( + { + "start": sorted_test_time_range.index, + "end": sorted_test_time_range.values, + } + ) + gaps = test_ranges["start"] > test_ranges["end"].shift(1) blocks = gaps.cumsum() effective_test_time_range = test_ranges.groupby(blocks).agg( - {'start': 'min', 'end': 'max'} + {"start": "min", "end": "max"} ) effective_test_time_range = pd.Series( - effective_test_time_range['end'].values, - index=effective_test_time_range['start'] + effective_test_time_range["end"].values, + index=effective_test_time_range["start"], ) else: effective_test_time_range = pd.Series( sorted_test_time_range.values[-1], - index=[sorted_test_time_range.index[0]] + index=[sorted_test_time_range.index[0]], ) if embargo_length == 0: @@ -119,41 +120,45 @@ def filtered_training_indices_with_embargo( else: embargoed_values = [] for end_val in effective_test_time_range.values: - end_iloc = data_info_range.index.searchsorted(end_val, side='left') + end_iloc = data_info_range.index.searchsorted(end_val, side="left") if end_iloc >= len(data_info_range): embargoed_values.append(end_val) else: - embargoed_iloc = min(end_iloc + embargo_length, len(data_info_range) - 1) + embargoed_iloc = min( + end_iloc + embargo_length, len(data_info_range) - 1 + ) embargoed_values.append(data_info_range.index[embargoed_iloc]) embargoed_ranges = pd.Series( - embargoed_values, - index=effective_test_time_range.index + embargoed_values, index=effective_test_time_range.index ) # 
=== END OF FIX === # Purge for test_start, test_end_embargoed in embargoed_ranges.items(): # 1. Overlap: train starts during test/embargo - cond1 = (data_info_range.index >= test_start) & \ - (data_info_range.index <= test_end_embargoed) + cond1 = (data_info_range.index >= test_start) & ( + data_info_range.index <= test_end_embargoed + ) # 2. Overlap: train ends during test/embargo - cond2 = (data_info_range.values >= test_start) & \ - (data_info_range.values <= test_end_embargoed) + cond2 = (data_info_range.values >= test_start) & ( + data_info_range.values <= test_end_embargoed + ) # 3. Overlap: train envelops test/embargo - cond3 = (data_info_range.index <= test_start) & \ - (data_info_range.values >= test_end_embargoed) + cond3 = (data_info_range.index <= test_start) & ( + data_info_range.values >= test_end_embargoed + ) indices_to_drop.update(data_info_range[cond1 | cond2 | cond3].index) return data_info_range.drop(indices_to_drop) def __init__( - self, - n_splits: int, - times: Union[pd.Series, Dict[str, pd.Series]], - embargo: float = 0, + self, + n_splits: int, + times: Union[pd.Series, Dict[str, pd.Series]], + embargo: float = 0, ) -> None: """ Initialize the PurgedKFold cross-validator. @@ -177,7 +182,7 @@ def get_n_splits( self, data: Optional[Union[pd.DataFrame, Dict[str, pd.DataFrame]]] = None, labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> int: """ Return the number of splits. 
@@ -199,15 +204,11 @@ def get_n_splits( return self.n_splits def _validate_input( - self, - single_times: pd.Series, - single_data: pd.DataFrame + self, single_times: pd.Series, single_data: pd.DataFrame ) -> None: """Validate that data and times indices match.""" if not single_data.index.equals(single_times.index): - raise ValueError( - "Data and 'times' (info range) must have the same index" - ) + raise ValueError("Data and 'times' (info range) must have the same index") def _get_train_indices( self, @@ -234,13 +235,13 @@ def _get_train_indices( """ if len(test_indices) == 0: return np.arange(len(single_times)) - + test_time_range = single_times.iloc[test_indices] - + train_times = self.filtered_training_indices_with_embargo( single_times, test_time_range, self.embargo, continous_test_times ) - + # Convert index labels back to integer locations train_indices = single_times.index.get_indexer(train_times.index) return train_indices @@ -268,21 +269,19 @@ def _single_split( self._validate_input(single_times, single_data) indices = np.arange(len(single_data)) - + for test_indices in np.array_split(indices, self.n_splits): - train_indices = self._get_train_indices( - test_indices, single_times, True - ) + train_indices = self._get_train_indices(test_indices, single_times, True) yield train_indices, test_indices def split( self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], labels: Optional[Union[pd.Series, Dict[str, pd.Series]]] = None, - groups: Optional[np.ndarray] = None + groups: Optional[np.ndarray] = None, ) -> Union[ Generator[Tuple[np.ndarray, np.ndarray], None, None], - Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None] + Generator[Tuple[str, Tuple[np.ndarray, np.ndarray]], None, None], ]: """ Split data (or dictionary of data) into purged train-test indices. 
@@ -313,9 +312,7 @@ def split( else: if not isinstance(data, pd.DataFrame): raise ValueError("If 'times' is a Series, 'data' must be a DataFrame.") - for train_indices, test_indices in self._single_split( - self.times, data - ): + for train_indices, test_indices in self._single_split(self.times, data): yield train_indices, test_indices def _single_backtest_paths( @@ -348,12 +345,14 @@ def _single_backtest_paths( for train_indices, test_indices in self._single_split( single_times, single_data ): - path_data.append({ - "Train": train_indices, - "Test": test_indices, - }) + path_data.append( + { + "Train": train_indices, + "Test": test_indices, + } + ) - paths['Path 1'] = path_data + paths["Path 1"] = path_data return paths def backtest_paths( @@ -361,7 +360,7 @@ def backtest_paths( data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], ) -> Union[ Dict[str, List[Dict[str, np.ndarray]]], - Dict[str, Dict[str, List[Dict[str, np.ndarray]]]] + Dict[str, Dict[str, List[Dict[str, np.ndarray]]]], ]: """ Generate backtest paths for data or a dictionary of data. @@ -386,7 +385,7 @@ def backtest_paths( self.times[key], data[key] ) return multiple_paths - + if not isinstance(data, pd.DataFrame): raise ValueError("If 'times' is a Series, 'data' must be a DataFrame.") return self._single_backtest_paths(self.times, data) @@ -399,7 +398,7 @@ def _single_backtest_predictions( single_labels: pd.Series, single_weights: Optional[np.ndarray] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, np.ndarray]: """ Obtain backtest predictions for a single dataset. 
@@ -433,9 +432,7 @@ def _single_backtest_predictions( single_weights = np.ones(len(single_data)) def train_test_single_estimator( - estimator_: Estimator, - train_indices: np.ndarray, - test_indices: np.ndarray + estimator_: Estimator, train_indices: np.ndarray, test_indices: np.ndarray ) -> np.ndarray: """Train model and return predictions.""" X_train = single_data.iloc[train_indices] @@ -443,7 +440,7 @@ def train_test_single_estimator( weights_train = single_weights[train_indices] with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) try: estimator_.fit(X_train, y_train, sample_weight=weights_train) except TypeError: @@ -453,19 +450,20 @@ def train_test_single_estimator( if predict_probability: return estimator_.predict_proba(X_test) - + return estimator_.predict(X_test) path_data = Parallel(n_jobs=n_jobs)( delayed(train_test_single_estimator)( deepcopy(single_estimator), train_indices, test_indices - ) for train_indices, test_indices in self._single_split( + ) + for train_indices, test_indices in self._single_split( single_times, single_data ) ) # Since PurgedKFold is not shuffled, we can just concatenate - paths_predictions = {'Path 1': np.concatenate(path_data)} + paths_predictions = {"Path 1": np.concatenate(path_data)} return paths_predictions def backtest_predictions( @@ -475,7 +473,7 @@ def backtest_predictions( labels: Union[pd.Series, Dict[str, pd.Series]], sample_weights: Optional[Union[np.ndarray, Dict[str, np.ndarray]]] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Union[Dict[str, np.ndarray], Dict[str, Dict[str, np.ndarray]]]: """ Generate backtest predictions for single or multiple datasets. @@ -502,27 +500,33 @@ def backtest_predictions( - If data is a dict: A nested dictionary of predictions. 
""" if self.is_multiple_datasets: - if not (isinstance(data, dict) and - isinstance(estimator, dict) and - isinstance(labels, dict)): + if not ( + isinstance(data, dict) + and isinstance(estimator, dict) + and isinstance(labels, dict) + ): raise ValueError( "If 'times' is a dict, 'data', 'estimator', and 'labels' " "must also be dicts." ) - + multiple_paths_predictions = {} for key in self.times: s_weights = sample_weights[key] if sample_weights else None multiple_paths_predictions[key] = self._single_backtest_predictions( - estimator[key], self.times[key], data[key], labels[key], - s_weights, predict_probability, n_jobs + estimator[key], + self.times[key], + data[key], + labels[key], + s_weights, + predict_probability, + n_jobs, ) return multiple_paths_predictions # Handle single dataset case - if not (isinstance(data, pd.DataFrame) and - isinstance(labels, pd.Series)): - raise ValueError( + if not (isinstance(data, pd.DataFrame) and isinstance(labels, pd.Series)): + raise ValueError( "If 'times' is a Series, 'data' must be a DataFrame " "and 'labels' must be a Series." ) @@ -534,5 +538,5 @@ def backtest_predictions( labels, sample_weights, predict_probability, - n_jobs - ) \ No newline at end of file + n_jobs, + ) diff --git a/RiskLabAI/backtest/validation/walk_forward.py b/RiskLabAI/backtest/validation/walk_forward.py index ea678c5..48de8f0 100644 --- a/RiskLabAI/backtest/validation/walk_forward.py +++ b/RiskLabAI/backtest/validation/walk_forward.py @@ -4,9 +4,7 @@ import warnings from copy import deepcopy -from typing import ( - Any, Dict, Generator, Optional, Tuple -) +from typing import Any, Dict, Generator, Optional, Tuple import numpy as np import pandas as pd @@ -18,6 +16,7 @@ # For type hinting sklearn-like estimators Estimator = Any + class WalkForward(KFold): """ Walk-Forward Cross-Validator for Time Series Data. 
@@ -43,10 +42,7 @@ class WalkForward(KFold): """ def __init__( - self, - n_splits: int = 5, - max_train_size: Optional[int] = None, - gap: int = 0 + self, n_splits: int = 5, max_train_size: Optional[int] = None, gap: int = 0 ) -> None: """ Initialize the WalkForward cross-validator. @@ -84,15 +80,15 @@ def _single_split( A generator where each item is a tuple of (train_indices, test_indices). """ indices = np.arange(single_data.shape[0]) - + # np.array_split handles non-divisible splits for test_indices in np.array_split(indices, self.n_splits): # The first test index first_test_idx_loc = test_indices[0] - + # Train indices end `gap` samples before the test set train_end_loc = first_test_idx_loc - self.gap - + if train_end_loc < 0: # No training data possible train_indices = np.array([], dtype=int) @@ -102,7 +98,7 @@ def _single_split( train_indices = indices[train_start_loc:train_end_loc] else: train_indices = indices[:train_end_loc] - + yield train_indices, test_indices def _single_backtest_predictions( @@ -112,7 +108,7 @@ def _single_backtest_predictions( single_labels: pd.Series, single_weights: Optional[np.ndarray] = None, predict_probability: bool = False, - n_jobs: int = 1 + n_jobs: int = 1, ) -> Dict[str, np.ndarray]: """ Obtain backtest predictions for a single dataset. 
@@ -142,16 +138,18 @@ def _single_backtest_predictions( single_weights = np.ones(len(single_data)) def train_test_single_estimator( - estimator_: Estimator, - train_indices: np.ndarray, - test_indices: np.ndarray + estimator_: Estimator, train_indices: np.ndarray, test_indices: np.ndarray ) -> np.ndarray: """Train model and return predictions.""" - + if len(train_indices) == 0: # No training data, return NaNs n_classes = len(np.unique(single_labels)) if predict_probability else 1 - shape = (len(test_indices), n_classes) if predict_probability else (len(test_indices),) + shape = ( + (len(test_indices), n_classes) + if predict_probability + else (len(test_indices),) + ) return np.full(shape, np.nan) X_train = single_data.iloc[train_indices] @@ -159,7 +157,7 @@ def train_test_single_estimator( weights_train = single_weights[train_indices] with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) try: estimator_.fit(X_train, y_train, sample_weight=weights_train) except TypeError: @@ -169,15 +167,16 @@ def train_test_single_estimator( if predict_probability: return estimator_.predict_proba(X_test) - + return estimator_.predict(X_test) path_data = Parallel(n_jobs=n_jobs)( delayed(train_test_single_estimator)( deepcopy(single_estimator), train_indices, test_indices - ) for train_indices, test_indices in self._single_split(single_data) + ) + for train_indices, test_indices in self._single_split(single_data) ) # Since shuffle=False, we can just concatenate - paths_predictions = {'Path 1': np.concatenate(path_data)} - return paths_predictions \ No newline at end of file + paths_predictions = {"Path 1": np.concatenate(path_data)} + return paths_predictions diff --git a/RiskLabAI/cluster/__init__.py b/RiskLabAI/cluster/__init__.py index 5b6ff7b..61dd826 100644 --- a/RiskLabAI/cluster/__init__.py +++ b/RiskLabAI/cluster/__init__.py @@ -24,4 +24,4 @@ "random_covariance_sub", 
"random_block_covariance", "random_block_correlation", -] \ No newline at end of file +] diff --git a/RiskLabAI/cluster/clustering.py b/RiskLabAI/cluster/clustering.py index 5d6bc93..ef693cc 100644 --- a/RiskLabAI/cluster/clustering.py +++ b/RiskLabAI/cluster/clustering.py @@ -14,6 +14,7 @@ from sklearn.utils import check_random_state from typing import Tuple, Dict, List, Optional + def covariance_to_correlation(covariance: np.ndarray) -> np.ndarray: r""" Derive the correlation matrix from a covariance matrix. @@ -43,6 +44,7 @@ def covariance_to_correlation(covariance: np.ndarray) -> np.ndarray: return cov_to_corr(covariance) + def cluster_k_means_base( correlation: pd.DataFrame, max_clusters: int = 10, @@ -80,31 +82,31 @@ def cluster_k_means_base( """ # Calculate distance matrix distance = ((1 - correlation.fillna(0)) / 2.0) ** 0.5 - + best_kmeans = None best_silhouette_scores = None best_score = -np.inf rng = check_random_state(random_state) - + for _ in range(iterations): for n_clusters in range(2, max_clusters + 1): # Use a different random_state for each K-Means fit iter_seed = rng.randint(0, np.iinfo(np.int32).max) - + kmeans = KMeans( n_clusters=n_clusters, n_init=1, # We handle iterations externally random_state=iter_seed, ) kmeans.fit(distance) - + silhouette_scores_ = silhouette_samples(distance, kmeans.labels_) - + # Use silhouette score t-statistic (mean/std) as the quality metric stat_mean = silhouette_scores_.mean() stat_std = silhouette_scores_.std() - + if stat_std == 0: # Avoid division by zero if all silhouette scores are identical score = np.sign(stat_mean) * np.inf @@ -128,7 +130,7 @@ def cluster_k_means_base( i: correlation.columns[np.where(best_kmeans.labels_ == i)[0]].tolist() for i in np.unique(best_kmeans.labels_) } - + silhouette_series = pd.Series(best_silhouette_scores, index=distance.index) return correlation_sorted, clusters, silhouette_series @@ -187,6 +189,7 @@ def make_new_outputs( ) return correlation_new, clusters_new, 
silhouette_new + def cluster_k_means_top( correlation: pd.DataFrame, max_clusters: Optional[int] = None, @@ -223,13 +226,13 @@ def cluster_k_means_top( n_cols = correlation.shape[1] if max_clusters is None: max_clusters = n_cols - 1 - + max_clusters = min(max_clusters, n_cols - 1) if max_clusters < 2: return ( correlation, {0: correlation.columns.tolist()}, - pd.Series(dtype='float64'), + pd.Series(dtype="float64"), ) # 1. Run base clustering @@ -246,34 +249,30 @@ def cluster_k_means_top( for i in clusters if silhouette[clusters[i]].std() > 0 } - + if not cluster_t_stats: - return corr_sorted, clusters, silhouette # No valid clusters found + return corr_sorted, clusters, silhouette # No valid clusters found t_stat_mean = np.mean(list(cluster_t_stats.values())) # 3. Identify clusters to re-cluster - redo_clusters = [ - i for i, t_stat in cluster_t_stats.items() if t_stat < t_stat_mean - ] + redo_clusters = [i for i, t_stat in cluster_t_stats.items() if t_stat < t_stat_mean] if len(redo_clusters) <= 1: # Base case: All clusters are stable, or only one is unstable return corr_sorted, clusters, silhouette else: # 4. Recurse on unstable clusters - keys_redo = [ - item for i in redo_clusters for item in clusters[i] - ] + keys_redo = [item for i in redo_clusters for item in clusters[i]] corr_temp = correlation.loc[keys_redo, keys_redo] - + # Keep track of mean t-stat for comparison t_stat_mean_redo = np.mean([cluster_t_stats[i] for i in redo_clusters]) - + # Calculate remaining clusters for recursive call n_clusters_good = len(clusters) - len(redo_clusters) remained_n_clusters = max_clusters - n_clusters_good - + # Recursive call corr_sorted_2, clusters_2, silh_2 = cluster_k_means_top( corr_temp, @@ -283,9 +282,7 @@ def cluster_k_means_top( ) # 5. 
Merge results - clusters_1 = { - i: clusters[i] for i in clusters if i not in redo_clusters - } + clusters_1 = {i: clusters[i] for i in clusters if i not in redo_clusters} corr_new, clusters_new, silh_new = make_new_outputs( correlation, clusters_1, clusters_2 ) @@ -296,10 +293,10 @@ def cluster_k_means_top( for i in clusters_new if silh_new[clusters_new[i]].std() > 0 ] - + if not new_t_stats: - return corr_sorted, clusters, silhouette # Re-clustering failed - + return corr_sorted, clusters, silhouette # Re-clustering failed + new_t_stat_mean = np.mean(new_t_stats) if new_t_stat_mean <= t_stat_mean_redo: @@ -346,10 +343,10 @@ def random_covariance_sub( # Common factor data = rng.normal(size=(n_observations, 1)) data = np.repeat(data, n_columns, axis=1) - + # Idiosyncratic noise data += rng.normal(scale=sigma, size=data.shape) - + covariance = np.cov(data, rowvar=False) return covariance @@ -394,16 +391,14 @@ def random_block_covariance( replace=False, ) parts.sort() - parts = np.append( - parts, n_columns - (block_size_min - 1) * n_blocks - ) + parts = np.append(parts, n_columns - (block_size_min - 1) * n_blocks) parts = np.append(parts[0], np.diff(parts)) - 1 + block_size_min cov_list = [] for col_size in parts: # Number of observations must be > number of columns n_obs = int(max(col_size * (col_size + 1) / 2.0, 100)) - + this_covariance = random_covariance_sub( n_obs, col_size, sigma, random_state=rng ) @@ -453,7 +448,7 @@ def random_block_correlation( sigma=0.5, random_state=rng, ) - + # Market component (noise) covariance2 = random_block_covariance( n_columns, 1, block_size_min=n_columns, sigma=1.0, random_state=rng @@ -463,4 +458,4 @@ def random_block_correlation( correlation = covariance_to_correlation(covariance) correlation = pd.DataFrame(correlation) - return correlation \ No newline at end of file + return correlation diff --git a/RiskLabAI/controller/bars_initializer.py b/RiskLabAI/controller/bars_initializer.py index 4fa9420..66c0cb2 100644 --- 
a/RiskLabAI/controller/bars_initializer.py +++ b/RiskLabAI/controller/bars_initializer.py @@ -8,7 +8,8 @@ # Import bar types from RiskLabAI.data.structures.imbalance_bars import ( - ExpectedImbalanceBars, FixedImbalanceBars + ExpectedImbalanceBars, + FixedImbalanceBars, ) from RiskLabAI.data.structures.run_bars import ExpectedRunBars, FixedRunBars from RiskLabAI.data.structures.standard_bars import StandardBars @@ -17,9 +18,12 @@ # Import constants from RiskLabAI.utils.constants import ( - CUMULATIVE_DOLLAR, CUMULATIVE_VOLUME, CUMULATIVE_TICKS + CUMULATIVE_DOLLAR, + CUMULATIVE_VOLUME, + CUMULATIVE_TICKS, ) + class BarsInitializerController: """ Controller for initializing various types of bars. @@ -58,7 +62,7 @@ def initialize_expected_dollar_imbalance_bars( initial_estimate_of_expected_n_ticks_in_bar: int = 20000, expected_ticks_number_bounds: Optional[Tuple[float, float]] = None, analyse_thresholds: bool = False, - **kwargs: Any, # Accept extra kwargs but don't use them + **kwargs: Any, # Accept extra kwargs but don't use them ) -> ExpectedImbalanceBars: """ Initialize expected dollar imbalance bars. @@ -290,4 +294,4 @@ def initialize_time_bars( """Initialize time bars.""" return TimeBars( resolution_type=resolution_type, resolution_units=resolution_units - ) \ No newline at end of file + ) diff --git a/RiskLabAI/controller/data_structure_controller.py b/RiskLabAI/controller/data_structure_controller.py index 9113690..d828843 100644 --- a/RiskLabAI/controller/data_structure_controller.py +++ b/RiskLabAI/controller/data_structure_controller.py @@ -5,6 +5,7 @@ uses the BarsInitializerController to construct bars based on a specified method. 
""" + import logging import pandas as pd @@ -14,21 +15,39 @@ from RiskLabAI.controller.bars_initializer import BarsInitializerController from RiskLabAI.data.structures.abstract_bars import AbstractBars from RiskLabAI.utils.constants import ( - DATE_TIME, TICK_NUMBER, OPEN_PRICE, HIGH_PRICE, - LOW_PRICE, CLOSE_PRICE, CUMULATIVE_VOLUME, - CUMULATIVE_BUY_VOLUME, CUMULATIVE_SELL_VOLUME, - CUMULATIVE_TICKS, CUMULATIVE_DOLLAR, THRESHOLD + DATE_TIME, + TICK_NUMBER, + OPEN_PRICE, + HIGH_PRICE, + LOW_PRICE, + CLOSE_PRICE, + CUMULATIVE_VOLUME, + CUMULATIVE_BUY_VOLUME, + CUMULATIVE_SELL_VOLUME, + CUMULATIVE_TICKS, + CUMULATIVE_DOLLAR, + THRESHOLD, ) logger = logging.getLogger(__name__) # Define the bar column schema BAR_COLUMNS = [ - DATE_TIME, TICK_NUMBER, OPEN_PRICE, HIGH_PRICE, LOW_PRICE, CLOSE_PRICE, - CUMULATIVE_VOLUME, CUMULATIVE_BUY_VOLUME, CUMULATIVE_SELL_VOLUME, - CUMULATIVE_TICKS, CUMULATIVE_DOLLAR, THRESHOLD + DATE_TIME, + TICK_NUMBER, + OPEN_PRICE, + HIGH_PRICE, + LOW_PRICE, + CLOSE_PRICE, + CUMULATIVE_VOLUME, + CUMULATIVE_BUY_VOLUME, + CUMULATIVE_SELL_VOLUME, + CUMULATIVE_TICKS, + CUMULATIVE_DOLLAR, + THRESHOLD, ] + class Controller: """ Controller for initializing and processing bars from data sources. @@ -46,7 +65,7 @@ def handle_input_command( method_arguments: Dict[str, Any], input_data: Union[str, pd.DataFrame], output_path: Optional[str] = None, - batch_size: int = 1_000_000 + batch_size: int = 1_000_000, ) -> pd.DataFrame: """ Handles the command to initialize a bar generator and process data. @@ -54,21 +73,26 @@ def handle_input_command( """ # 1. Initialize the bar generator try: - initializer_method = self.bars_initializer.method_name_to_method[method_name] + initializer_method = self.bars_initializer.method_name_to_method[ + method_name + ] except KeyError: valid_methods = list(self.bars_initializer.method_name_to_method.keys()) logger.error( "Bar method '%s' not found. 
Valid methods are: %s", - method_name, valid_methods, + method_name, + valid_methods, ) return pd.DataFrame(columns=BAR_COLUMNS) - + try: bar_generator: AbstractBars = initializer_method(**method_arguments) except TypeError as e: logger.error( "Error initializing bar method '%s' with arguments %s: %s", - method_name, method_arguments, e, + method_name, + method_arguments, + e, ) return pd.DataFrame(columns=BAR_COLUMNS) @@ -81,7 +105,7 @@ def handle_input_command( raise TypeError("input_data must be a string (path) or pd.DataFrame") all_bars: List[List[Any]] = [] - + # 3. Process data in batches logger.info("Processing data in batches...") try: @@ -93,7 +117,8 @@ def handle_input_command( except Exception as e: logger.warning( "Error during bar construction: %s. Returning DataFrame with " - "bars constructed so far.", e, + "bars constructed so far.", + e, ) # Continue to return whatever was processed @@ -114,8 +139,7 @@ def handle_input_command( @staticmethod def read_batches_from_string( - input_path: str, - batch_size: int + input_path: str, batch_size: int ) -> Generator[pd.DataFrame, None, None]: """ Reads data in batches from a CSV file efficiently. @@ -135,9 +159,7 @@ def read_batches_from_string( try: # Use a generator to read the file in chunks # This is more memory-efficient and avoids reading the file twice. - for batch in pd.read_csv( - input_path, chunksize=batch_size, parse_dates=[0] - ): + for batch in pd.read_csv(input_path, chunksize=batch_size, parse_dates=[0]): yield batch except FileNotFoundError: logger.error("File not found at %s", input_path) @@ -151,11 +173,9 @@ def read_batches_from_string( ) return - @staticmethod def read_batches_from_dataframe( - input_data: pd.DataFrame, - batch_size: int + input_data: pd.DataFrame, batch_size: int ) -> Generator[pd.DataFrame, None, None]: """ Reads data in batches from a DataFrame. 
@@ -163,7 +183,7 @@ def read_batches_from_dataframe( n_rows = input_data.shape[0] if n_rows == 0: return - + for start_row in range(0, n_rows, batch_size): end_row = min(start_row + batch_size, n_rows) - yield input_data.iloc[start_row:end_row] \ No newline at end of file + yield input_data.iloc[start_row:end_row] diff --git a/RiskLabAI/core/registry.py b/RiskLabAI/core/registry.py index 131b541..bd67b69 100644 --- a/RiskLabAI/core/registry.py +++ b/RiskLabAI/core/registry.py @@ -182,15 +182,22 @@ class MyModel: ... if obj is None and callable(key) and not isinstance(key, str): target = key self._register( - target.__name__, target, aliases=aliases, - metadata=metadata, override=override, + target.__name__, + target, + aliases=aliases, + metadata=metadata, + override=override, ) return target # Direct call: reg.register("name", obj) if obj is not None: self._register( - key, obj, aliases=aliases, metadata=metadata, override=override, + key, + obj, + aliases=aliases, + metadata=metadata, + override=override, ) return obj @@ -198,8 +205,11 @@ class MyModel: ... def decorator(target: Factory) -> Factory: resolved_key = key if isinstance(key, str) else target.__name__ self._register( - resolved_key, target, aliases=aliases, - metadata=metadata, override=override, + resolved_key, + target, + aliases=aliases, + metadata=metadata, + override=override, ) return target @@ -227,8 +237,12 @@ def register_lazy( See :meth:`register`. """ self._register( - key, None, lazy_target=target, aliases=aliases, - metadata=metadata, override=override, + key, + None, + lazy_target=target, + aliases=aliases, + metadata=metadata, + override=override, ) def _register( @@ -242,9 +256,7 @@ def _register( override: bool = False, ) -> None: if not isinstance(key, str) or not key: - raise TypeError( - f"Registry key must be a non-empty string, got {key!r}." 
- ) + raise TypeError(f"Registry key must be a non-empty string, got {key!r}.") lower = key.lower() if lower in self._index and not override: existing = self._index[lower] @@ -257,12 +269,13 @@ def _register( if override and lower in self._index: old_canonical = self._index[lower] self._entries.pop(old_canonical, None) - self._index = { - k: v for k, v in self._index.items() if v != old_canonical - } + self._index = {k: v for k, v in self._index.items() if v != old_canonical} self._entries[key] = _Entry( - key, obj=obj, lazy_target=lazy_target, metadata=metadata, + key, + obj=obj, + lazy_target=lazy_target, + metadata=metadata, ) self._index[lower] = key for alias in aliases: diff --git a/RiskLabAI/data/__init__.py b/RiskLabAI/data/__init__.py index 785f77d..0e53b63 100644 --- a/RiskLabAI/data/__init__.py +++ b/RiskLabAI/data/__init__.py @@ -29,4 +29,4 @@ "structures", "synthetic_data", "weights", -] \ No newline at end of file +] diff --git a/RiskLabAI/data/denoise/__init__.py b/RiskLabAI/data/denoise/__init__.py index 7cc5603..2600912 100644 --- a/RiskLabAI/data/denoise/__init__.py +++ b/RiskLabAI/data/denoise/__init__.py @@ -19,7 +19,7 @@ cov_to_corr, corr_to_cov, denoise_cov, - optimal_portfolio, + optimal_portfolio, ) __all__ = [ @@ -27,10 +27,10 @@ "fit_kde", "find_max_eval", "pca", - "denoised_corr", - "denoised_corr2", + "denoised_corr", + "denoised_corr2", "cov_to_corr", "corr_to_cov", "denoise_cov", - "optimal_portfolio", -] \ No newline at end of file + "optimal_portfolio", +] diff --git a/RiskLabAI/data/denoise/denoising.py b/RiskLabAI/data/denoise/denoising.py index 8d3f9f0..9613871 100644 --- a/RiskLabAI/data/denoise/denoising.py +++ b/RiskLabAI/data/denoise/denoising.py @@ -21,9 +21,7 @@ # --- FIX 5: Removed unused imports for LedoitWolf and block_diag --- -def marcenko_pastur_pdf( - variance: float, q: float, num_points: int = 1000 -) -> pd.Series: +def marcenko_pastur_pdf(variance: float, q: float, num_points: int = 1000) -> pd.Series: r""" Compute 
the Marcenko-Pastur (MP) probability density function. @@ -50,18 +48,18 @@ def marcenko_pastur_pdf( """ lambda_min = variance * (1 - (1.0 / q) ** 0.5) ** 2 lambda_max = variance * (1 + (1.0 / q) ** 0.5) ** 2 - + # --- FIX 1: Add epsilon to prevent division by zero if lambda_min=0 (when q=1) --- - e_min = max(lambda_min, 1e-10) + e_min = max(lambda_min, 1e-10) eigenvalues = np.linspace(e_min, lambda_max, num_points) - + pdf = (q / (2 * np.pi * variance * eigenvalues)) * ( (lambda_max - eigenvalues) * (eigenvalues - lambda_min) ) ** 0.5 - + # Set PDF to 0 where eigenvalues are outside the valid range (e.g., due to numerical precision) pdf[np.isnan(pdf)] = 0 - + return pd.Series(pdf.flatten(), index=eigenvalues.flatten()) @@ -94,10 +92,10 @@ def fit_kde( def _mp_pdf_fit_error( variance: float, q: float, eigenvalues: np.ndarray, bandwidth: float -) -> float: # <-- FIX 2: Added bandwidth +) -> float: # <-- FIX 2: Added bandwidth r""" Error function for fitting the MP PDF to observed eigenvalues. - + Calculates the sum of squared errors between the theoretical MP PDF and the empirical PDF (from KDE). 
@@ -120,14 +118,16 @@ def _mp_pdf_fit_error( # Ensure eigenvalues is 1D for PDF generation if eigenvalues.ndim == 2: eigenvalues = np.diag(eigenvalues) - + theoretical_pdf = marcenko_pastur_pdf(variance, q, num_points=eigenvalues.shape[0]) - + # Fit empirical PDF # --- FIX 2: Pass bandwidth to fit_kde --- - kde = fit_kde(eigenvalues, bandwidth=bandwidth) - empirical_pdf = np.exp(kde.score_samples(theoretical_pdf.index.values.reshape(-1, 1))) - + kde = fit_kde(eigenvalues, bandwidth=bandwidth) + empirical_pdf = np.exp( + kde.score_samples(theoretical_pdf.index.values.reshape(-1, 1)) + ) + # Calculate SSE sse = np.sum((empirical_pdf - theoretical_pdf.values) ** 2) return sse @@ -160,21 +160,23 @@ def find_max_eval( eigenvalues_1d = np.diag(eigenvalues) else: eigenvalues_1d = eigenvalues - + # Minimize the SSE to find the best-fit variance # --- FIX 2: Pass bandwidth to the objective function --- - objective_func = lambda *args: _mp_pdf_fit_error(args[0], q, eigenvalues_1d, bandwidth) - + objective_func = lambda *args: _mp_pdf_fit_error( + args[0], q, eigenvalues_1d, bandwidth + ) + optimizer_result = minimize( objective_func, - x0=np.array([0.5]), # Initial variance guess + x0=np.array([0.5]), # Initial variance guess bounds=((1e-5, 1 - 1e-5),), ) if optimizer_result.success: variance = optimizer_result.x[0] else: - variance = 1.0 # Fallback + variance = 1.0 # Fallback # Calculate lambda_max based on the fitted variance lambda_max = variance * (1 + (1.0 / q) ** 0.5) ** 2 @@ -187,7 +189,7 @@ def denoised_corr( """ Reconstruct the correlation matrix using only the eigenvalues associated with signal (i.e., > lambda_max). - + Note: Assumes eigenvalues are sorted in descending order. Parameters @@ -206,39 +208,41 @@ def denoised_corr( """ # 1. 
Get the eigenvalues and eigenvectors for signal # --- FIX 3: This logic is now correct as eigenvalues are descending --- - eigenvalues_1d = np.diag(eigenvalues) - eigenvalues_signal = np.diag(eigenvalues_1d[:num_facts]) + eigenvalues_1d = np.diag(eigenvalues) + eigenvalues_signal = np.diag(eigenvalues_1d[:num_facts]) eigenvectors_signal = eigenvectors[:, :num_facts] - + # 2. Reconstruct the signal-only correlation matrix corr1 = eigenvectors_signal @ eigenvalues_signal @ eigenvectors_signal.T - + # 3. Get the eigenvalues for noise and average them if num_facts < eigenvalues.shape[0]: # --- FIX 3: Correctly averages the smaller (noise) eigenvalues --- avg_noise_eigenvalue = eigenvalues_1d[num_facts:].mean() eigenvectors_noise = eigenvectors[:, num_facts:] - + # 4. Reconstruct the noise-only correlation matrix - corr2 = eigenvectors_noise @ ( - np.diag([avg_noise_eigenvalue] * (eigenvalues.shape[0] - num_facts)) - ) @ eigenvectors_noise.T - + corr2 = ( + eigenvectors_noise + @ (np.diag([avg_noise_eigenvalue] * (eigenvalues.shape[0] - num_facts))) + @ eigenvectors_noise.T + ) + # 5. Add them back together corr1 = corr1 + corr2 - + # 6. Rescale to be a valid correlation matrix - diag_inv_sqrt = 1. / np.sqrt(np.diag(corr1)) + diag_inv_sqrt = 1.0 / np.sqrt(np.diag(corr1)) corr1 = np.diag(diag_inv_sqrt) @ corr1 @ np.diag(diag_inv_sqrt) - np.fill_diagonal(corr1, 1.0) # Clean up numerical errors + np.fill_diagonal(corr1, 1.0) # Clean up numerical errors return corr1 + # --- Utility Functions --- -def pca( - matrix: np.ndarray -) -> Tuple[np.ndarray, np.ndarray]: + +def pca(matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """ Computes the principal component analysis of a Hermitian matrix. Ensures eigenvalues are sorted descending. 
@@ -249,20 +253,21 @@ def pca( :rtype: Tuple[np.ndarray, np.ndarray] """ eigenvalues, eigenvectors = np.linalg.eigh(matrix) - indices = eigenvalues.argsort()[::-1] # Sort descending + indices = eigenvalues.argsort()[::-1] # Sort descending eigenvalues = eigenvalues[indices] eigenvectors = eigenvectors[:, indices] return eigenvalues, eigenvectors + def cov_to_corr(cov: np.ndarray) -> np.ndarray: """Convert covariance matrix to correlation matrix.""" std = np.sqrt(np.diag(cov)) # Handle division by zero if any std is 0 - std[std == 0] = 1.0 + std[std == 0] = 1.0 corr = cov / np.outer(std, std) - corr[corr < -1] = -1.0 # Handle numerical errors + corr[corr < -1] = -1.0 # Handle numerical errors corr[corr > 1] = 1.0 - np.fill_diagonal(corr, 1.0) # Ensure diagonal is 1 + np.fill_diagonal(corr, 1.0) # Ensure diagonal is 1 return corr @@ -270,9 +275,8 @@ def corr_to_cov(corr: np.ndarray, std: np.ndarray) -> np.ndarray: """Convert correlation matrix to covariance matrix.""" return corr * np.outer(std, std) -def denoise_cov( - cov0: np.ndarray, q: float, bandwidth: float = 0.01 -) -> np.ndarray: + +def denoise_cov(cov0: np.ndarray, q: float, bandwidth: float = 0.01) -> np.ndarray: """ De-noises a covariance matrix. 
@@ -292,32 +296,30 @@ def denoise_cov( """ corr0 = cov_to_corr(cov0) - + # --- FIX 3: Use pca helper to get DESCENDING eigenvalues/vectors --- eigenvalues, eigenvectors = pca(corr0) - eigenvalues_diag = np.diag(eigenvalues) # 2D diag matrix (desc) + eigenvalues_diag = np.diag(eigenvalues) # 2D diag matrix (desc) # Find the noise cutoff # --- FIX 2: Pass bandwidth down to find_max_eval --- emax0, var0 = find_max_eval(eigenvalues_diag, q, bandwidth) - + # --- FIX 3: Correctly find num factors as count of evals > emax0 --- n_facts0 = np.sum(eigenvalues > emax0) - + # Denoise the correlation matrix corr1 = denoised_corr(eigenvalues_diag, eigenvectors, n_facts0) - + # Convert back to covariance cov1 = corr_to_cov(corr1, np.diag(cov0) ** 0.5) return cov1 -def optimal_portfolio( - cov: np.ndarray, mu: Optional[np.ndarray] = None -) -> np.ndarray: +def optimal_portfolio(cov: np.ndarray, mu: Optional[np.ndarray] = None) -> np.ndarray: """ Compute the optimal (e.g., minimum variance) portfolio weights. 
- + (Note: This is duplicated in `optimization/nco.py`) Parameters @@ -334,12 +336,12 @@ def optimal_portfolio( """ inv_cov = np.linalg.inv(cov) ones = np.ones(shape=(inv_cov.shape[0], 1)) - + if mu is None: mu = ones - + w = inv_cov @ mu - w /= (ones.T @ w) + w /= ones.T @ w return w.flatten() @@ -370,6 +372,6 @@ def optimal_portfolio_denoised( """ # --- FIX 4: Pass bandwidth to denoise_cov --- cov_denoised = denoise_cov(cov, q, bandwidth) - + # Compute optimal portfolio on the denoised matrix - return optimal_portfolio(cov_denoised, mu) \ No newline at end of file + return optimal_portfolio(cov_denoised, mu) diff --git a/RiskLabAI/data/differentiation/__init__.py b/RiskLabAI/data/differentiation/__init__.py index 104681b..22874c6 100644 --- a/RiskLabAI/data/differentiation/__init__.py +++ b/RiskLabAI/data/differentiation/__init__.py @@ -1 +1 @@ -from .differentiation import * \ No newline at end of file +from .differentiation import * diff --git a/RiskLabAI/data/differentiation/differentiation.py b/RiskLabAI/data/differentiation/differentiation.py index 41824aa..d7cc5d2 100644 --- a/RiskLabAI/data/differentiation/differentiation.py +++ b/RiskLabAI/data/differentiation/differentiation.py @@ -15,6 +15,7 @@ from statsmodels.tsa.stattools import adfuller from typing import Tuple, Optional + def calculate_weights_std(degree: float, size: int) -> np.ndarray: """ Compute weights for standard (expanding window) fractional differentiation. @@ -38,10 +39,11 @@ def calculate_weights_std(degree: float, size: int) -> np.ndarray: for k in range(1, size): weight = -weights[-1] / k * (degree - k + 1) weights.append(weight) - + # Reverse for dot product: [w_k, ..., w_1, w_0] return np.array(weights[::-1]).reshape(-1, 1) + def calculate_weights_ffd(degree: float, threshold: float = 1e-5) -> np.ndarray: """ Compute weights for Fixed-Width Window Fractional Differentiation (FFD). 
@@ -75,10 +77,9 @@ def calculate_weights_ffd(degree: float, threshold: float = 1e-5) -> np.ndarray: # Reverse for dot product: [w_k, ..., w_1, w_0] return np.array(weights[::-1]).reshape(-1, 1) + def fractional_difference_std( - series: pd.DataFrame, - degree: float, - threshold: float = 0.01 + series: pd.DataFrame, degree: float, threshold: float = 0.01 ) -> pd.DataFrame: """ Compute the standard (expanding window) fractionally differentiated series. @@ -131,13 +132,11 @@ def fractional_difference_std( convolved = np.convolve(series_np, weights_natural[:n_obs])[:n_obs] result_df.loc[series_ffill.index[skip:], name] = convolved[skip:] - return result_df.dropna(how='all') + return result_df.dropna(how="all") def fractional_difference_fixed( - series: pd.DataFrame, - degree: float, - threshold: float = 1e-5 + series: pd.DataFrame, degree: float, threshold: float = 1e-5 ) -> pd.DataFrame: """ Compute the Fixed-Width Window (FFD) fractionally differentiated series. @@ -163,19 +162,17 @@ def fractional_difference_fixed( DataFrame of fractionally differentiated series. """ result_df = pd.DataFrame(index=series.index) - + for name in series.columns: result_df[name] = fractional_difference_fixed_single( series[name], degree, threshold ) - - return result_df.dropna(how='all') + + return result_df.dropna(how="all") def fractional_difference_fixed_single( - series: pd.Series, - degree: float, - threshold: float = 1e-5 + series: pd.Series, degree: float, threshold: float = 1e-5 ) -> pd.Series: """ Compute the FFD series for a single `pd.Series` using np.convolve. @@ -194,31 +191,31 @@ def fractional_difference_fixed_single( pd.Series The fractionally differentiated series. """ - + # 1. Compute weights # Reverse weights: calculate_weights_ffd returns [w_k, ..., w_0] weights = calculate_weights_ffd(degree, threshold).flatten()[::-1] width = len(weights) - + # 2. 
Prepare data (drop leading NaNs) series_ffill = series.ffill().dropna() - + if series_ffill.empty or series_ffill.shape[0] < width: # Not enough data to convolve return pd.Series(index=series.index, dtype="float64") - + series_np = series_ffill.to_numpy() - + # 3. Apply convolution # 'valid' mode computes the dot product only where the # window fully overlaps the series. - convolved_vals = np.convolve(series_np, weights, mode='valid') - + convolved_vals = np.convolve(series_np, weights, mode="valid") + # 4. Create result series on the valid index # The result aligns with the *end* of the window - result_index = series_ffill.index[width - 1:] + result_index = series_ffill.index[width - 1 :] valid_results = pd.Series(convolved_vals, index=result_index) - + # 5. Reindex to original full index # This correctly places NaNs at the start (warm-up) # and in any gaps that were in the original series. @@ -229,7 +226,7 @@ def plot_weights( degree_range: Tuple[float, float], number_degrees: int, size: int, - ax: Optional["plt.Axes"] = None + ax: Optional["plt.Axes"] = None, ) -> "plt.Axes": """ Plot the weights of fractionally differentiated series for various degrees. @@ -254,13 +251,13 @@ def plot_weights( if ax is None: fig, ax = plt.subplots(figsize=(10, 6)) - + weights_df = pd.DataFrame() for degree in np.linspace(degree_range[0], degree_range[1], number_degrees): degree = round(degree, 2) weights = calculate_weights_std(degree, size) weights_df[degree] = pd.Series(weights.flatten(), index=range(size - 1, -1, -1)) - + weights_df.plot(ax=ax) ax.set_xlabel("Lag") ax.set_ylabel("Weight") @@ -270,8 +267,7 @@ def plot_weights( def find_optimal_ffd_simple( - input_series: pd.DataFrame, - p_value_threshold: float = 0.05 + input_series: pd.DataFrame, p_value_threshold: float = 0.05 ) -> pd.DataFrame: """ Find the minimum 'd' that passes the ADF test, for a range of d. @@ -294,54 +290,53 @@ def find_optimal_ffd_simple( DataFrame of ADF test results for each 'd'. 
""" results_list = [] - + # Resample to daily to ensure consistent lags - series_daily = np.log(input_series[['close']]).resample('1D').last().dropna() - + series_daily = np.log(input_series[["close"]]).resample("1D").last().dropna() + for d in np.linspace(0, 1, 11): differentiated = fractional_difference_fixed( series_daily, d, threshold=0.01 ).dropna() - + if differentiated.empty: continue - + corr = np.corrcoef( - series_daily.loc[differentiated.index, 'close'], - differentiated['close'] + series_daily.loc[differentiated.index, "close"], differentiated["close"] )[0, 1] - + try: adf_result = adfuller( - differentiated['close'], maxlag=1, regression='c', autolag=None + differentiated["close"], maxlag=1, regression="c", autolag=None ) - + results_list.append( { - 'd': d, - 'adfStat': adf_result[0], - 'pVal': adf_result[1], - 'lags': adf_result[2], - 'nObs': adf_result[3], - '95% conf': adf_result[4]['5%'], - 'corr': corr + "d": d, + "adfStat": adf_result[0], + "pVal": adf_result[1], + "lags": adf_result[2], + "nObs": adf_result[3], + "95% conf": adf_result[4]["5%"], + "corr": corr, } ) except Exception: # Handle cases where ADF test fails (e.g., insufficient data) continue - + if not results_list: return pd.DataFrame() - - return pd.DataFrame(results_list).set_index('d') + + return pd.DataFrame(results_list).set_index("d") def fractionally_differentiated_log_price( input_series: pd.Series, threshold: float = 1e-5, step: float = 0.01, - p_value_threshold: float = 0.05 + p_value_threshold: float = 0.05, ) -> pd.Series: """ Find the minimum 'd' that makes a log-price series stationary. 
@@ -369,34 +364,32 @@ def fractionally_differentiated_log_price( log_price = np.log(input_series) degree = 0.0 p_value = 1.0 - + differentiated_series = None while p_value > p_value_threshold: degree += step - if degree > 2.0: # Safety break - raise ValueError("Failed to find stationary 'd' < 2.0") - + if degree > 2.0: # Safety break + raise ValueError("Failed to find stationary 'd' < 2.0") + differentiated = fractional_difference_fixed_single( log_price, degree, threshold=threshold ).dropna() - + if differentiated.empty: - continue # Not enough data for this 'd' - + continue # Not enough data for this 'd' + try: - adf_test = adfuller( - differentiated, maxlag=1, regression='c', autolag=None - ) + adf_test = adfuller(differentiated, maxlag=1, regression="c", autolag=None) p_value = adf_test[1] except Exception: - p_value = 1.0 # Failed test, keep going - + p_value = 1.0 # Failed test, keep going + if differentiated_series is None: - differentiated_series = differentiated # Store first valid series - + differentiated_series = differentiated # Store first valid series + # Return the last computed series that passed if differentiated_series is None: raise ValueError("Could not generate any differentiated series.") - - return differentiated \ No newline at end of file + + return differentiated diff --git a/RiskLabAI/data/distance/__init__.py b/RiskLabAI/data/distance/__init__.py index 3a647cc..ccca77c 100644 --- a/RiskLabAI/data/distance/__init__.py +++ b/RiskLabAI/data/distance/__init__.py @@ -1 +1 @@ -from .distance_metric import * \ No newline at end of file +from .distance_metric import * diff --git a/RiskLabAI/data/distance/distance_metric.py b/RiskLabAI/data/distance/distance_metric.py index 7dfce6f..6274790 100644 --- a/RiskLabAI/data/distance/distance_metric.py +++ b/RiskLabAI/data/distance/distance_metric.py @@ -18,6 +18,7 @@ from sklearn.metrics import mutual_info_score from typing import Optional + def calculate_variation_of_information( x: np.ndarray, y: 
np.ndarray, bins: int, norm: bool = False ) -> float: @@ -47,10 +48,8 @@ def calculate_variation_of_information( The Variation of Information. """ histogram_xy = np.histogram2d(x, y, bins)[0] - mutual_information = mutual_info_score( - None, None, contingency=histogram_xy - ) - + mutual_information = mutual_info_score(None, None, contingency=histogram_xy) + marginal_x = ss.entropy(histogram_xy.sum(axis=1)) marginal_y = ss.entropy(histogram_xy.sum(axis=0)) @@ -59,7 +58,7 @@ def calculate_variation_of_information( if norm: joint_xy = marginal_x + marginal_y - mutual_information if joint_xy == 0: - return 0.0 # Avoid division by zero if entropies are 0 + return 0.0 # Avoid division by zero if entropies are 0 variation_xy /= joint_xy return variation_xy @@ -102,7 +101,7 @@ def calculate_number_of_bins( if np.isclose(correlation, 1.0) or np.isclose(correlation, -1.0): # Handle perfect correlation case by setting to almost 1 correlation = np.sign(correlation) * (1.0 - 1e-10) - + if (1.0 - correlation**2) == 0: # Handle numerical instability if correlation is still 1 return calculate_number_of_bins(num_observations, correlation=None) @@ -110,15 +109,7 @@ def calculate_number_of_bins( # Bivariate formula bins = round( (2**-0.5) - * ( - 1 - + ( - 1 - + 24 * num_observations / (1.0 - correlation**2) - ) - ** 0.5 - ) - ** 0.5 + * (1 + (1 + 24 * num_observations / (1.0 - correlation**2)) ** 0.5) ** 0.5 ) return int(bins) @@ -151,7 +142,7 @@ def calculate_variation_of_information_extended( """ correlation = np.corrcoef(x, y)[0, 1] num_bins = calculate_number_of_bins(x.shape[0], correlation=correlation) - + return calculate_variation_of_information(x, y, num_bins, norm) @@ -183,11 +174,9 @@ def calculate_mutual_information( """ correlation = np.corrcoef(x, y)[0, 1] num_bins = calculate_number_of_bins(x.shape[0], correlation=correlation) - + histogram_xy = np.histogram2d(x, y, num_bins)[0] - mutual_information = mutual_info_score( - None, None, contingency=histogram_xy - ) + 
mutual_information = mutual_info_score(None, None, contingency=histogram_xy) if norm: marginal_x = ss.entropy(histogram_xy.sum(axis=1)) @@ -195,16 +184,14 @@ def calculate_mutual_information( min_entropy = min(marginal_x, marginal_y) if min_entropy == 0: - return 0.0 # Avoid division by zero - + return 0.0 # Avoid division by zero + mutual_information /= min_entropy return mutual_information -def calculate_distance( - dependence: np.ndarray, metric: str = "angular" -) -> np.ndarray: +def calculate_distance(dependence: np.ndarray, metric: str = "angular") -> np.ndarray: r""" Calculate a distance matrix from a dependence matrix (e.g., correlation). @@ -226,7 +213,7 @@ def calculate_distance( """ # Clip to handle potential floating point errors dependence = np.clip(dependence, -1.0, 1.0) - + if metric == "angular": distance = ((1 - dependence).round(6) / 2.0) ** 0.5 elif metric == "absolute_angular": @@ -236,9 +223,7 @@ def calculate_distance( return distance -def calculate_kullback_leibler_divergence( - p: np.ndarray, q: np.ndarray -) -> float: +def calculate_kullback_leibler_divergence(p: np.ndarray, q: np.ndarray) -> float: """ Calculate Kullback-Leibler (KL) divergence. @@ -260,7 +245,7 @@ def calculate_kullback_leibler_divergence( # Ensure probabilities sum to 1 p = p / np.sum(p) q = q / np.sum(q) - + # Filter for terms where p > 0 and q > 0 # Where p_i = 0, the term is 0. # Where q_i = 0 (and p_i > 0), the term is +inf. @@ -269,8 +254,8 @@ def calculate_kullback_leibler_divergence( q_filtered = q[mask] if len(p_filtered) == 0: - return 0.0 # No overlapping support - + return 0.0 # No overlapping support + # Check if any p_i > 0 corresponds to q_i = 0 if np.any(p[q == 0] > 0): return np.inf @@ -299,14 +284,14 @@ def calculate_cross_entropy(p: np.ndarray, q: np.ndarray) -> float: """ p = p / np.sum(p) q = q / np.sum(q) - + # Filter for terms where p > 0 and q > 0 # Where p_i = 0, the term is 0. # Where q_i = 0 (and p_i > 0), the term is +inf. 
mask = (p > 0) & (q > 0) p_filtered = p[mask] q_filtered = q[mask] - + if len(p_filtered) == 0: return 0.0 @@ -315,4 +300,4 @@ def calculate_cross_entropy(p: np.ndarray, q: np.ndarray) -> float: return np.inf entropy = -np.sum(p_filtered * np.log(q_filtered)) - return entropy \ No newline at end of file + return entropy diff --git a/RiskLabAI/data/labeling/__init__.py b/RiskLabAI/data/labeling/__init__.py index 6ea75b2..d3eae51 100644 --- a/RiskLabAI/data/labeling/__init__.py +++ b/RiskLabAI/data/labeling/__init__.py @@ -43,8 +43,7 @@ "process_jobs", "expand_call", "report_progress", - # from financial_labels.py "calculate_t_value_linear_regression", "find_trend_using_trend_scanning", -] \ No newline at end of file +] diff --git a/RiskLabAI/data/labeling/financial_labels.py b/RiskLabAI/data/labeling/financial_labels.py index dcea92a..8e3c58b 100644 --- a/RiskLabAI/data/labeling/financial_labels.py +++ b/RiskLabAI/data/labeling/financial_labels.py @@ -15,6 +15,7 @@ from scipy import stats from typing import List, Tuple, Optional + def calculate_t_value_linear_regression(prices: pd.Series) -> float: """ Calculate the t-value of the slope of a linear regression. @@ -33,13 +34,13 @@ def calculate_t_value_linear_regression(prices: pd.Series) -> float: # --- SUGGESTION: Explicitly handle insufficient data --- if prices.shape[0] < 2: return np.nan - + x = np.arange(prices.shape[0]) try: ols = stats.linregress(x, prices.values) except ValueError: return np.nan - + if ols.stderr == 0: # Perfect trend: t -> +/- inf. Constant series: 0/0 -> undefined (NaN), # consistent with the documented contract. @@ -82,9 +83,7 @@ def find_trend_using_trend_scanning( - 't-Value': The t-value of the most significant trend found. - 'Trend': The sign of the trend (-1, 0, or 1). 
""" - outputs = pd.DataFrame( - index=molecule, columns=["End Time", "t-Value", "Trend"] - ) + outputs = pd.DataFrame(index=molecule, columns=["End Time", "t-Value", "Trend"]) # --- SUGGESTION: Add robustness check for span --- # Ensure min_span < max_span and min_span is at least 2 for OLS. @@ -95,15 +94,15 @@ def find_trend_using_trend_scanning( spans = range(*span) # Use span[1] - 1 directly. It's safer than max(spans) # which fails on an empty range. - max_span_val = span[1] - 1 + max_span_val = span[1] - 1 for index in molecule: t_values = pd.Series(dtype="float64") - + try: location = close.index.get_loc(index) except KeyError: - continue # Event timestamp not in close index + continue # Event timestamp not in close index # Ensure we don't scan past the end of the series if location + max_span_val >= close.shape[0]: @@ -116,11 +115,9 @@ def find_trend_using_trend_scanning( # End of this specific window tail_time = close.index[location + span_val - 1] window_prices = close.loc[index:tail_time] - - t_values.loc[tail_time] = calculate_t_value_linear_regression( - window_prices - ) - + + t_values.loc[tail_time] = calculate_t_value_linear_regression(window_prices) + if t_values.empty: continue @@ -128,7 +125,7 @@ def find_trend_using_trend_scanning( # Use idxmax on the absolute values, but get the original t-value best_t_value_idx = t_values.replace([-np.inf, np.inf, np.nan], 0).abs().idxmax() best_t_value = t_values[best_t_value_idx] - + outputs.loc[index] = [ vertical_barrier_time, best_t_value, @@ -138,4 +135,4 @@ def find_trend_using_trend_scanning( outputs["End Time"] = pd.to_datetime(outputs["End Time"]) outputs["Trend"] = pd.to_numeric(outputs["Trend"], downcast="signed") - return outputs.dropna(subset=["Trend"]) \ No newline at end of file + return outputs.dropna(subset=["Trend"]) diff --git a/RiskLabAI/data/labeling/labeling.py b/RiskLabAI/data/labeling/labeling.py index 86fc6cd..9c22e7c 100644 --- a/RiskLabAI/data/labeling/labeling.py +++ 
b/RiskLabAI/data/labeling/labeling.py @@ -59,13 +59,9 @@ def cusum_filter_events_dynamic_threshold( price_delta = prices.diff().dropna() # Align price changes with thresholds - price_delta, thresholds = price_delta.align( - threshold, join="inner", copy=False - ) + price_delta, thresholds = price_delta.align(threshold, join="inner", copy=False) - for (index, value), thresh_val in zip( - price_delta.items(), thresholds.values - ): + for (index, value), thresh_val in zip(price_delta.items(), thresholds.values): shift_positive = max(0.0, shift_positive + value) shift_negative = min(0.0, shift_negative + value) @@ -79,9 +75,7 @@ def cusum_filter_events_dynamic_threshold( return pd.DatetimeIndex(time_events) -def symmetric_cusum_filter( - prices: pd.Series, threshold: float -) -> pd.DatetimeIndex: +def symmetric_cusum_filter(prices: pd.Series, threshold: float) -> pd.DatetimeIndex: """ Detect events using the Symmetric CUSUM filter with a fixed threshold. @@ -118,9 +112,7 @@ def symmetric_cusum_filter( return pd.DatetimeIndex(time_events) -def daily_volatility_with_log_returns( - close: pd.Series, span: int = 100 -) -> pd.Series: +def daily_volatility_with_log_returns(close: pd.Series, span: int = 100) -> pd.Series: """ Calculate daily volatility using log returns. @@ -235,7 +227,7 @@ def triple_barrier( # Filter for events this worker owns events_filtered = events.loc[molecule] output = pd.DataFrame(index=events_filtered.index) - output["End Time"] = events_filtered["End Time"] # Use original end time + output["End Time"] = events_filtered["End Time"] # Use original end time # 1. 
Set horizontal barriers if ptsl[0] > 0: @@ -275,11 +267,13 @@ def triple_barrier( n_events = len(events_filtered) stop_loss_touch = np.full(n_events, np.datetime64("NaT"), dtype="datetime64[ns]") - profit_taking_touch = np.full(n_events, np.datetime64("NaT"), dtype="datetime64[ns]") + profit_taking_touch = np.full( + n_events, np.datetime64("NaT"), dtype="datetime64[ns]" + ) for i in range(n_events): start, end = start_positions[i], end_positions[i] - segment = close_values[start:end + 1] + segment = close_values[start : end + 1] path_returns = np.log(segment / segment[0]) * side_values[i] below = path_returns < stop_loss_values[i] @@ -293,11 +287,13 @@ def triple_barrier( # First barrier touched = earliest of {vertical, stop-loss, profit-taking}, # ignoring NaT (the same semantics as the previous output.min(axis=1)). candidates = pd.DataFrame( - np.vstack([ - events_filtered["End Time"].to_numpy().astype("datetime64[ns]"), - stop_loss_touch, - profit_taking_touch, - ]).T, + np.vstack( + [ + events_filtered["End Time"].to_numpy().astype("datetime64[ns]"), + stop_loss_touch, + profit_taking_touch, + ] + ).T, index=events_filtered.index, ) output["End Time"] = candidates.min(axis=1) @@ -364,10 +360,10 @@ def meta_events( # 3. Set up sides if side is None: side_series = pd.Series(1.0, index=target.index) - ptsl_final = [ptsl[0], ptsl[0]] # Symmetric barriers + ptsl_final = [ptsl[0], ptsl[0]] # Symmetric barriers else: side_series = side.reindex(target.index) - ptsl_final = ptsl[:2] # Asymmetric barriers + ptsl_final = ptsl[:2] # Asymmetric barriers # 4. 
Create base events DataFrame events = pd.concat( @@ -383,13 +379,15 @@ def meta_events( molecule_subsets = np.array_split(events.index, num_threads) with ProcessPoolExecutor(max_workers=num_threads) as executor: - results = list(executor.map( - triple_barrier, - [close] * num_threads, - [events] * num_threads, - [ptsl_final] * num_threads, - molecule_subsets - )) + results = list( + executor.map( + triple_barrier, + [close] * num_threads, + [events] * num_threads, + [ptsl_final] * num_threads, + molecule_subsets, + ) + ) # Combine results and update End Time first_touch_times = pd.concat(results, axis=0)["End Time"] @@ -401,9 +399,7 @@ def meta_events( return events -def meta_labeling( - events: pd.DataFrame, close: pd.Series -) -> pd.DataFrame: +def meta_labeling(events: pd.DataFrame, close: pd.Series) -> pd.DataFrame: """ Calculate returns and assign binary labels for meta-labeling. @@ -439,10 +435,9 @@ def meta_labeling( out = pd.DataFrame(index=events_filtered.index) out["End Time"] = events_filtered["End Time"] - out["Return"] = ( - np.log(close_filtered.loc[events_filtered["End Time"].values].values) - - np.log(close_filtered.loc[events_filtered.index].values) - ) + out["Return"] = np.log( + close_filtered.loc[events_filtered["End Time"].values].values + ) - np.log(close_filtered.loc[events_filtered.index].values) if "Side" in events_filtered: out["Return"] *= events_filtered["Side"] diff --git a/RiskLabAI/data/structures/__init__.py b/RiskLabAI/data/structures/__init__.py index fc6176c..28ccdbf 100644 --- a/RiskLabAI/data/structures/__init__.py +++ b/RiskLabAI/data/structures/__init__.py @@ -25,4 +25,4 @@ "FixedImbalanceBars", "ExpectedRunBars", "FixedRunBars", -] \ No newline at end of file +] diff --git a/RiskLabAI/data/structures/abstract_bars.py b/RiskLabAI/data/structures/abstract_bars.py index 0a999a4..b4db081 100644 --- a/RiskLabAI/data/structures/abstract_bars.py +++ b/RiskLabAI/data/structures/abstract_bars.py @@ -11,6 +11,7 @@ # Type hint for a 
single tick: (datetime, price, volume) TickData = Union[List[Any], Tuple[Any, ...], np.ndarray] + class AbstractBars(ABC): """ Abstract base class for all bar types. @@ -39,7 +40,7 @@ def __init__(self, bar_type: str): self.close_price: Optional[float] = None self.high_price: float = -np.inf self.low_price: float = np.inf - + self.base_statistics: Dict[str, Union[int, float]] = { PREVIOUS_TICK_RULE: 0, CUMULATIVE_TICKS: 0, @@ -238,4 +239,4 @@ def _construct_next_bar( threshold, ] - return next_bar \ No newline at end of file + return next_bar diff --git a/RiskLabAI/data/structures/abstract_imbalance_bars.py b/RiskLabAI/data/structures/abstract_imbalance_bars.py index 9f98dae..457bf59 100644 --- a/RiskLabAI/data/structures/abstract_imbalance_bars.py +++ b/RiskLabAI/data/structures/abstract_imbalance_bars.py @@ -7,7 +7,7 @@ import numpy as np from RiskLabAI.data.structures.abstract_information_driven_bars import ( - AbstractInformationDrivenBars + AbstractInformationDrivenBars, ) from RiskLabAI.data.structures.abstract_bars import TickData from RiskLabAI.utils.constants import * @@ -58,16 +58,16 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: (Parameters same as original) """ bars_list = [] - + # Keep track of last timestamp and threshold date_time = None threshold = np.inf - + for tick_data in data: self.tick_counter += 1 date_time, price, volume = tick_data[0], tick_data[1], tick_data[2] - + # Update common fields tick_rule = self._tick_rule(price) self.update_base_fields(price, tick_rule, volume) @@ -82,14 +82,14 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: # Warm-up E[b] if it's the first time if np.isnan(self.imbalance_bars_statistics[EXPECTED_IMBALANCE]): - self.imbalance_bars_statistics[ - EXPECTED_IMBALANCE - ] = self._ewma_expected_imbalance( - self.imbalance_bars_statistics[PREVIOUS_TICK_IMBALANCES_LIST], - self.information_driven_bars_statistics[ - EXPECTED_IMBALANCE_WINDOW - ], - 
warm_up=True, + self.imbalance_bars_statistics[EXPECTED_IMBALANCE] = ( + self._ewma_expected_imbalance( + self.imbalance_bars_statistics[PREVIOUS_TICK_IMBALANCES_LIST], + self.information_driven_bars_statistics[ + EXPECTED_IMBALANCE_WINDOW + ], + warm_up=True, + ) ) if self.analyse_thresholds is not None: @@ -97,7 +97,7 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: **self.base_statistics, **self.information_driven_bars_statistics, **self.imbalance_bars_statistics, - 'timestamp': date_time + "timestamp": date_time, } self.analyse_thresholds.append(stats) @@ -105,10 +105,8 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: expected_ticks = self.information_driven_bars_statistics[ EXPECTED_TICKS_NUMBER ] - expected_imbalance = self.imbalance_bars_statistics[ - EXPECTED_IMBALANCE - ] - + expected_imbalance = self.imbalance_bars_statistics[EXPECTED_IMBALANCE] + if np.isnan(expected_ticks) or np.isnan(expected_imbalance): threshold = np.inf else: @@ -126,24 +124,24 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: bars_list.append(next_bar) # Store T for E[T] update - self.imbalance_bars_statistics[ - PREVIOUS_BARS_N_TICKS_LIST - ].append(self.base_statistics[CUMULATIVE_TICKS]) + self.imbalance_bars_statistics[PREVIOUS_BARS_N_TICKS_LIST].append( + self.base_statistics[CUMULATIVE_TICKS] + ) # Update E[T] - self.information_driven_bars_statistics[ - EXPECTED_TICKS_NUMBER - ] = self._expected_number_of_ticks() + self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] = ( + self._expected_number_of_ticks() + ) # Update E[b] - self.imbalance_bars_statistics[ - EXPECTED_IMBALANCE - ] = self._ewma_expected_imbalance( - self.imbalance_bars_statistics[PREVIOUS_TICK_IMBALANCES_LIST], - self.information_driven_bars_statistics[ - EXPECTED_IMBALANCE_WINDOW - ], - warm_up=False, + self.imbalance_bars_statistics[EXPECTED_IMBALANCE] = ( + self._ewma_expected_imbalance( + 
self.imbalance_bars_statistics[PREVIOUS_TICK_IMBALANCES_LIST], + self.information_driven_bars_statistics[ + EXPECTED_IMBALANCE_WINDOW + ], + warm_up=False, + ) ) # Reset cached fields @@ -155,7 +153,7 @@ def _bar_construction_condition(self, threshold: float) -> bool: """Check if cumulative imbalance |theta| exceeds the threshold.""" if np.isnan(threshold) or np.isinf(threshold): return False - + cumulative_theta = self.imbalance_bars_statistics[CUMULATIVE_ΞΈ] return np.abs(cumulative_theta) >= threshold @@ -167,4 +165,4 @@ def _reset_cached_fields(self): @abstractmethod def _expected_number_of_ticks(self) -> float: """Calculate E[T] when a new bar is sampled.""" - pass \ No newline at end of file + pass diff --git a/RiskLabAI/data/structures/abstract_information_driven_bars.py b/RiskLabAI/data/structures/abstract_information_driven_bars.py index 9dc81d5..6dcb1f0 100644 --- a/RiskLabAI/data/structures/abstract_information_driven_bars.py +++ b/RiskLabAI/data/structures/abstract_information_driven_bars.py @@ -10,6 +10,7 @@ from RiskLabAI.data.structures.abstract_bars import AbstractBars from RiskLabAI.utils.constants import * + class AbstractInformationDrivenBars(AbstractBars): """ Abstract class for Information-Driven Bars (Imbalance and Run). 
@@ -44,9 +45,7 @@ def __init__( """ super().__init__(bar_type) self.information_driven_bars_statistics = { - EXPECTED_TICKS_NUMBER: float( - initial_estimate_of_expected_n_ticks_in_bar - ), + EXPECTED_TICKS_NUMBER: float(initial_estimate_of_expected_n_ticks_in_bar), EXPECTED_IMBALANCE_WINDOW: window_size_for_expected_imbalance_estimation, } self.window_size_for_expected_n_ticks_estimation = ( @@ -84,9 +83,7 @@ def _ewma_expected_imbalance( if ewma_window == 0: return np.nan - return ewma( - np.array(array[-ewma_window:], dtype=float), window=ewma_window - )[-1] + return ewma(np.array(array[-ewma_window:], dtype=float), window=ewma_window)[-1] def _imbalance_at_tick( self, price: float, signed_tick: int, volume: float @@ -125,4 +122,4 @@ def _expected_number_of_ticks(self) -> float: Abstract method to update the expected number of ticks (E[T]). This is implemented differently for "fixed" vs. "expected" bars. """ - pass \ No newline at end of file + pass diff --git a/RiskLabAI/data/structures/abstract_run_bars.py b/RiskLabAI/data/structures/abstract_run_bars.py index 7c30e03..3ecf1cf 100644 --- a/RiskLabAI/data/structures/abstract_run_bars.py +++ b/RiskLabAI/data/structures/abstract_run_bars.py @@ -7,7 +7,7 @@ import numpy as np from RiskLabAI.data.structures.abstract_information_driven_bars import ( - AbstractInformationDrivenBars + AbstractInformationDrivenBars, ) from RiskLabAI.data.structures.abstract_bars import TickData from RiskLabAI.utils.ewma import ewma @@ -60,16 +60,16 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: (Parameters same as original) """ bars_list = [] - + # Keep track of last timestamp and threshold date_time = None threshold = np.inf - + for tick_data in data: self.tick_counter += 1 date_time, price, volume = tick_data[0], tick_data[1], tick_data[2] - + # Update common fields tick_rule = self._tick_rule(price) self.update_base_fields(price, tick_rule, volume) @@ -94,25 +94,29 @@ def 
construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: warm_up_stats = [ self.run_bars_statistics[EXPECTED_BUY_IMBALANCE], self.run_bars_statistics[EXPECTED_SELL_IMBALANCE], - self.run_bars_statistics[EXPECTED_BUY_TICKS_PROPORTION] + self.run_bars_statistics[EXPECTED_BUY_TICKS_PROPORTION], ] - + if np.isnan(warm_up_stats).any(): - self.run_bars_statistics[ - EXPECTED_BUY_IMBALANCE - ] = self._ewma_expected_imbalance( - self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_BUY_LIST], - self.information_driven_bars_statistics[EXPECTED_IMBALANCE_WINDOW], - warm_up=True + self.run_bars_statistics[EXPECTED_BUY_IMBALANCE] = ( + self._ewma_expected_imbalance( + self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_BUY_LIST], + self.information_driven_bars_statistics[ + EXPECTED_IMBALANCE_WINDOW + ], + warm_up=True, + ) ) - self.run_bars_statistics[ - EXPECTED_SELL_IMBALANCE - ] = self._ewma_expected_imbalance( - self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_SELL_LIST], - self.information_driven_bars_statistics[EXPECTED_IMBALANCE_WINDOW], - warm_up=True + self.run_bars_statistics[EXPECTED_SELL_IMBALANCE] = ( + self._ewma_expected_imbalance( + self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_SELL_LIST], + self.information_driven_bars_statistics[ + EXPECTED_IMBALANCE_WINDOW + ], + warm_up=True, + ) ) - + # Update P[buy] if self.base_statistics[CUMULATIVE_TICKS] > 0: buy_ticks_num = self.run_bars_statistics[BUY_TICKS_NUMBER] @@ -126,10 +130,10 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: **self.base_statistics, **self.information_driven_bars_statistics, **self.run_bars_statistics, - 'timestamp': date_time + "timestamp": date_time, } self.analyse_thresholds.append(stats) - + # Calculate threshold and check condition threshold = self._calculate_run_threshold() @@ -147,11 +151,9 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: # Store T and P[buy] for E[T] and E[P[buy]] updates cum_ticks 
= self.base_statistics[CUMULATIVE_TICKS] buy_ticks_num = self.run_bars_statistics[BUY_TICKS_NUMBER] - - self.run_bars_statistics[PREVIOUS_BARS_N_TICKS_LIST].append( - cum_ticks - ) - + + self.run_bars_statistics[PREVIOUS_BARS_N_TICKS_LIST].append(cum_ticks) + # Avoid division by zero if bar has 0 ticks (should be rare) buy_proportion = (buy_ticks_num / cum_ticks) if cum_ticks > 0 else 0 self.run_bars_statistics[ @@ -159,37 +161,43 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: ].append(buy_proportion) # Update E[T] - self.information_driven_bars_statistics[ - EXPECTED_TICKS_NUMBER - ] = self._expected_number_of_ticks() + self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] = ( + self._expected_number_of_ticks() + ) # Update E[P[buy]] - window = self.window_size_for_expected_n_ticks_estimation or \ - self.information_driven_bars_statistics[EXPECTED_IMBALANCE_WINDOW] + window = ( + self.window_size_for_expected_n_ticks_estimation + or self.information_driven_bars_statistics[ + EXPECTED_IMBALANCE_WINDOW + ] + ) prob_buy_list = self.run_bars_statistics[ PREVIOUS_BARS_BUY_TICKS_PROPORTIONS_LIST ] - - self.run_bars_statistics[ - EXPECTED_BUY_TICKS_PROPORTION - ] = ewma( + + self.run_bars_statistics[EXPECTED_BUY_TICKS_PROPORTION] = ewma( np.array(prob_buy_list[-window:], dtype=float), window, )[-1] # Update E[theta_buy] and E[theta_sell] - self.run_bars_statistics[ - EXPECTED_BUY_IMBALANCE - ] = self._ewma_expected_imbalance( - self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_BUY_LIST], - self.information_driven_bars_statistics[EXPECTED_IMBALANCE_WINDOW] + self.run_bars_statistics[EXPECTED_BUY_IMBALANCE] = ( + self._ewma_expected_imbalance( + self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_BUY_LIST], + self.information_driven_bars_statistics[ + EXPECTED_IMBALANCE_WINDOW + ], + ) ) - self.run_bars_statistics[ - EXPECTED_SELL_IMBALANCE - ] = self._ewma_expected_imbalance( - 
self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_SELL_LIST], - self.information_driven_bars_statistics[EXPECTED_IMBALANCE_WINDOW] + self.run_bars_statistics[EXPECTED_SELL_IMBALANCE] = ( + self._ewma_expected_imbalance( + self.run_bars_statistics[PREVIOUS_TICK_IMBALANCES_SELL_LIST], + self.information_driven_bars_statistics[ + EXPECTED_IMBALANCE_WINDOW + ], + ) ) # Reset cached fields @@ -210,15 +218,14 @@ def _calculate_run_threshold(self) -> float: # Threshold = E[T] * max(P[buy] * E[theta_buy], (1-P[buy]) * E[theta_sell]) buy_threshold = e_p_buy * e_theta_buy sell_threshold = (1 - e_p_buy) * e_theta_sell - - return e_t * max(buy_threshold, sell_threshold) + return e_t * max(buy_threshold, sell_threshold) def _bar_construction_condition(self, threshold: float) -> bool: """Check if cumulative buy or sell run exceeds the threshold.""" if np.isinf(threshold) or np.isnan(threshold): return False - + max_theta = max( self.run_bars_statistics[CUMULATIVE_BUY_ΞΈ], self.run_bars_statistics[CUMULATIVE_SELL_ΞΈ], @@ -235,4 +242,4 @@ def _reset_cached_fields(self): @abstractmethod def _expected_number_of_ticks(self) -> float: """Calculate E[T] when a new bar is sampled.""" - pass \ No newline at end of file + pass diff --git a/RiskLabAI/data/structures/imbalance_bars.py b/RiskLabAI/data/structures/imbalance_bars.py index 5c05ca9..6fa0189 100644 --- a/RiskLabAI/data/structures/imbalance_bars.py +++ b/RiskLabAI/data/structures/imbalance_bars.py @@ -12,6 +12,7 @@ from RiskLabAI.utils.ewma import ewma + class ExpectedImbalanceBars(AbstractImbalanceBars): """ Concrete class for Imbalance Bars with a dynamic, EWMA-based @@ -57,29 +58,23 @@ def __init__( self.expected_ticks_number_lower_bound = 0.0 self.expected_ticks_number_upper_bound = np.inf else: - self.expected_ticks_number_lower_bound = ( - expected_ticks_number_bounds[0] - ) - self.expected_ticks_number_upper_bound = ( - expected_ticks_number_bounds[1] - ) + self.expected_ticks_number_lower_bound = 
expected_ticks_number_bounds[0] + self.expected_ticks_number_upper_bound = expected_ticks_number_bounds[1] def _expected_number_of_ticks(self) -> float: """ Calculate E[T] using an EWMA of previous bar tick counts. """ - prev_ticks_list = self.imbalance_bars_statistics[ - PREVIOUS_BARS_N_TICKS_LIST - ] - + prev_ticks_list = self.imbalance_bars_statistics[PREVIOUS_BARS_N_TICKS_LIST] + window = self.window_size_for_expected_n_ticks_estimation if window is None or window <= 0: - # Fallback to simple mean if window is invalid - return np.mean(prev_ticks_list) - + # Fallback to simple mean if window is invalid + return np.mean(prev_ticks_list) + if not prev_ticks_list: - # No bars yet, return initial estimate - return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] + # No bars yet, return initial estimate + return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] ewma_ticks = ewma( np.array(prev_ticks_list[-window:], dtype=float), window=window @@ -96,13 +91,14 @@ class FixedImbalanceBars(AbstractImbalanceBars): Concrete class for Imbalance Bars with a fixed (constant) Expected Number of Ticks (E[T]). """ + def __init__( self, bar_type: str, initial_estimate_of_expected_n_ticks_in_bar: int = 20000, window_size_for_expected_imbalance_estimation: int = 10000, analyse_thresholds: bool = False, - window_size_for_expected_n_ticks_estimation: Optional[int] = None, + window_size_for_expected_n_ticks_estimation: Optional[int] = None, ): """ Constructor. @@ -132,4 +128,4 @@ def _expected_number_of_ticks(self) -> float: """ Return the fixed E[T] value. 
""" - return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] \ No newline at end of file + return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] diff --git a/RiskLabAI/data/structures/run_bars.py b/RiskLabAI/data/structures/run_bars.py index 7564165..8537529 100644 --- a/RiskLabAI/data/structures/run_bars.py +++ b/RiskLabAI/data/structures/run_bars.py @@ -58,25 +58,21 @@ def __init__( self.expected_ticks_number_lower_bound = 0.0 self.expected_ticks_number_upper_bound = np.inf else: - self.expected_ticks_number_lower_bound = ( - expected_ticks_number_bounds[0] - ) - self.expected_ticks_number_upper_bound = ( - expected_ticks_number_bounds[1] - ) + self.expected_ticks_number_lower_bound = expected_ticks_number_bounds[0] + self.expected_ticks_number_upper_bound = expected_ticks_number_bounds[1] def _expected_number_of_ticks(self) -> float: """ Calculate E[T] using an EWMA of previous bar tick counts. """ prev_ticks_list = self.run_bars_statistics[PREVIOUS_BARS_N_TICKS_LIST] - + window = self.window_size_for_expected_n_ticks_estimation if window is None or window <= 0: - return np.mean(prev_ticks_list) - + return np.mean(prev_ticks_list) + if not prev_ticks_list: - return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] + return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] ewma_ticks = ewma( np.array(prev_ticks_list[-window:], dtype=float), window=window @@ -100,7 +96,7 @@ def __init__( initial_estimate_of_expected_n_ticks_in_bar: int, window_size_for_expected_imbalance_estimation: int, analyse_thresholds: bool = False, - window_size_for_expected_n_ticks_estimation: Optional[int] = None, # Not used + window_size_for_expected_n_ticks_estimation: Optional[int] = None, # Not used ): """ Constructor. @@ -130,4 +126,4 @@ def _expected_number_of_ticks(self) -> float: """ Return the fixed E[T] value. 
""" - return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] \ No newline at end of file + return self.information_driven_bars_statistics[EXPECTED_TICKS_NUMBER] diff --git a/RiskLabAI/data/structures/standard_bars.py b/RiskLabAI/data/structures/standard_bars.py index 993e1ac..8fa1828 100644 --- a/RiskLabAI/data/structures/standard_bars.py +++ b/RiskLabAI/data/structures/standard_bars.py @@ -6,6 +6,7 @@ import numpy as np from RiskLabAI.data.structures.abstract_bars import AbstractBars, TickData + class StandardBars(AbstractBars): """ Concrete class for Standard Bars (Tick, Volume, Dollar). @@ -45,20 +46,20 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: A list of the constructed standard bars. """ bars_list = [] - + # Keep track of last timestamp for final bar - date_time = None - + date_time = None + for tick_data in data: self.tick_counter += 1 # Unpack data date_time, price, volume = tick_data[0], tick_data[1], tick_data[2] - + # Update common fields tick_rule = self._tick_rule(price) self.update_base_fields(price, tick_rule, volume) - self.close_price = price # Update close price continuously + self.close_price = price # Update close price continuously # Check if bar construction condition is met if self._bar_construction_condition(self.threshold): @@ -71,7 +72,7 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: self.threshold, ) bars_list.append(next_bar) - + # Reset cached fields for the next bar self._reset_cached_fields() @@ -82,4 +83,4 @@ def _bar_construction_condition(self, threshold: float) -> bool: Check if the cumulative value of the `bar_type` has exceeded the threshold. 
""" - return self.base_statistics[self.bar_type] >= threshold \ No newline at end of file + return self.base_statistics[self.bar_type] >= threshold diff --git a/RiskLabAI/data/structures/time_bars.py b/RiskLabAI/data/structures/time_bars.py index b116bfb..855291d 100644 --- a/RiskLabAI/data/structures/time_bars.py +++ b/RiskLabAI/data/structures/time_bars.py @@ -7,6 +7,7 @@ import pandas as pd from RiskLabAI.data.structures.abstract_bars import AbstractBars, TickData + class TimeBars(AbstractBars): """ Concrete class for Time Bars. @@ -36,14 +37,16 @@ def __init__(self, resolution_type: str, resolution_units: int): } if resolution_type.upper() not in self.resolution_to_n_seconds: - raise ValueError(f"Invalid resolution_type. Use one of {list(self.resolution_to_n_seconds.keys())}") - + raise ValueError( + f"Invalid resolution_type. Use one of {list(self.resolution_to_n_seconds.keys())}" + ) + self.resolution_type = resolution_type.upper() self.resolution_units = resolution_units self.threshold_in_seconds = ( self.resolution_units * self.resolution_to_n_seconds[self.resolution_type] ) - + self.current_bar_timestamp = np.nan self.current_bar_end_timestamp = np.nan @@ -55,7 +58,7 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: ---------- data : Iterable[TickData] An iterable (list, tuple, generator) of tick data. - Each tick is (date_time, price, volume). + Each tick is (date_time, price, volume). `date_time` must be a pandas Timestamp. Returns @@ -69,7 +72,7 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: # Unpack data date_time, price, volume = tick_data[0], tick_data[1], tick_data[2] - + # Get tick timestamp in seconds try: tick_timestamp_sec = date_time.timestamp() @@ -78,29 +81,29 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: "TimeBars require `date_time` to be a pandas Timestamp " "or datetime object with a .timestamp() method." 
) - + # Determine the "floor" timestamp for this bar bar_start_timestamp_sec = ( int(tick_timestamp_sec // self.threshold_in_seconds) * self.threshold_in_seconds ) - + # Initialize first bar if np.isnan(self.current_bar_timestamp): self.current_bar_timestamp = bar_start_timestamp_sec self.current_bar_end_timestamp = ( bar_start_timestamp_sec + self.threshold_in_seconds ) - + # Check if this tick belongs to a new bar if self._bar_construction_condition(tick_timestamp_sec): # Construct the *previous* bar - bar_end_time = pd.to_datetime(self.current_bar_end_timestamp, unit='s') - + bar_end_time = pd.to_datetime(self.current_bar_end_timestamp, unit="s") + next_bar = self._construct_next_bar( bar_end_time, - self.tick_counter - 1, # Index of the *previous* tick - self.close_price, # Close price from *previous* tick + self.tick_counter - 1, # Index of the *previous* tick + self.close_price, # Close price from *previous* tick self.high_price, self.low_price, self.current_bar_end_timestamp, @@ -118,7 +121,7 @@ def construct_bars_from_data(self, data: Iterable[TickData]) -> List[List[Any]]: tick_rule = self._tick_rule(price) self.update_base_fields(price, tick_rule, volume) self.close_price = price - + return bars_list def _bar_construction_condition(self, tick_timestamp_sec: float) -> bool: @@ -126,4 +129,4 @@ def _bar_construction_condition(self, tick_timestamp_sec: float) -> bool: Check if the current tick's timestamp has crossed the end-time of the current bar. 
""" - return tick_timestamp_sec >= self.current_bar_end_timestamp \ No newline at end of file + return tick_timestamp_sec >= self.current_bar_end_timestamp diff --git a/RiskLabAI/data/synthetic_data/__init__.py b/RiskLabAI/data/synthetic_data/__init__.py index e1148dd..8960510 100644 --- a/RiskLabAI/data/synthetic_data/__init__.py +++ b/RiskLabAI/data/synthetic_data/__init__.py @@ -32,4 +32,4 @@ "form_block_matrix", "form_true_matrix", "simulates_cov_mu", -] \ No newline at end of file +] diff --git a/RiskLabAI/data/synthetic_data/drift_burst_hypothesis.py b/RiskLabAI/data/synthetic_data/drift_burst_hypothesis.py index 3ff3b71..f5522f4 100644 --- a/RiskLabAI/data/synthetic_data/drift_burst_hypothesis.py +++ b/RiskLabAI/data/synthetic_data/drift_burst_hypothesis.py @@ -8,6 +8,7 @@ import numpy as np from typing import Tuple + def drift_volatility_burst( bubble_length: int, a_before: float, @@ -89,6 +90,6 @@ def drift_volatility_burst( volatilities[nan_mask] = volatilities[nan_index - 1] else: # Handle case where midpoint is the first element - volatilities[nan_mask] = b_before # Fallback + volatilities[nan_mask] = b_before # Fallback - return drifts, volatilities \ No newline at end of file + return drifts, volatilities diff --git a/RiskLabAI/data/synthetic_data/simulation.py b/RiskLabAI/data/synthetic_data/simulation.py index 3a71bc6..bd7ced9 100644 --- a/RiskLabAI/data/synthetic_data/simulation.py +++ b/RiskLabAI/data/synthetic_data/simulation.py @@ -13,10 +13,8 @@ # Import the utility from the denoising module from RiskLabAI.data.denoise.denoising import corr_to_cov -def random_cov( - num_columns: int, - num_factors: int -) -> np.ndarray: + +def random_cov(num_columns: int, num_factors: int) -> np.ndarray: """ Generate a random covariance matrix. 
@@ -34,9 +32,7 @@ def random_cov( def form_block_matrix( - n_blocks: int, - block_size: int, - block_correlation: float + n_blocks: int, block_size: int, block_correlation: float ) -> np.ndarray: """ Forms a block diagonal correlation matrix. @@ -57,9 +53,7 @@ def form_block_matrix( def form_true_matrix( - n_blocks: int, - block_size: int, - block_correlation: float + n_blocks: int, block_size: int, block_correlation: float ) -> Tuple[np.ndarray, np.ndarray]: """ Forms a shuffled block diagonal correlation matrix and the @@ -79,18 +73,15 @@ def form_true_matrix( cols = corr0.columns.tolist() np.random.shuffle(cols) corr0 = corr0[cols].loc[cols].copy(deep=True).values - - std0 = np.random.uniform(.05, .2, corr0.shape[0]) + + std0 = np.random.uniform(0.05, 0.2, corr0.shape[0]) cov0 = corr_to_cov(corr0, std0) mu0 = np.random.normal(std0, std0, cov0.shape[0]).reshape(-1, 1) return mu0, cov0 def simulates_cov_mu( - mu0: np.ndarray, - cov0: np.ndarray, - n_obs: int, - shrink: bool = False + mu0: np.ndarray, cov0: np.ndarray, n_obs: int, shrink: bool = False ) -> Tuple[np.ndarray, np.ndarray]: """ Simulates multivariate normal observations and computes the @@ -110,4 +101,4 @@ def simulates_cov_mu( x = np.random.multivariate_normal(mu0.flatten(), cov0, size=n_obs) mu1 = x.mean(axis=0).reshape(-1, 1) cov1 = LedoitWolf().fit(x).covariance_ if shrink else np.cov(x, rowvar=0) - return mu1, cov1 \ No newline at end of file + return mu1, cov1 diff --git a/RiskLabAI/data/synthetic_data/synthetic_controlled_environment.py b/RiskLabAI/data/synthetic_data/synthetic_controlled_environment.py index 16f2e35..5488361 100644 --- a/RiskLabAI/data/synthetic_data/synthetic_controlled_environment.py +++ b/RiskLabAI/data/synthetic_data/synthetic_controlled_environment.py @@ -13,6 +13,7 @@ # Type hint for regime parameters RegimeParams = Dict[str, Union[float, List[float]]] + @jit(nopython=True) def compute_log_returns( n_steps: int, @@ -94,7 +95,7 @@ def compute_log_returns( # Ensure v[i] is 
non-negative for sqrt v_i_safe = max(v[i], 0.0) - + # Volatility process (Heston) v[i + 1] = ( v[i] @@ -157,12 +158,18 @@ def heston_merton_log_returns( """ # Ensure vectors are of length n_steps params = [ - mu_vector, kappa_vector, theta_vector, xi_vector, rho_vector, - lambda_vector, m_vector, v_vector + mu_vector, + kappa_vector, + theta_vector, + xi_vector, + rho_vector, + lambda_vector, + m_vector, + v_vector, ] if not all(len(p) == n_steps for p in params): raise ValueError("All parameter vectors must have length `n_steps`") - + rng = np.random.default_rng(random_state) dt = total_time / n_steps sqrt_dt = np.sqrt(dt) @@ -171,14 +178,14 @@ def heston_merton_log_returns( z = np.zeros((n_steps, 3)) n = np.zeros(n_steps) for i in range(n_steps): - cov_matrix = np.array([ - [1.0, rho_vector[i], 0.0], - [rho_vector[i], 1.0, 0.0], - [0.0, 0.0, v_vector[i] ** 2], - ]) - z[i] = rng.multivariate_normal( - [0.0, 0.0, m_vector[i]], cov_matrix + cov_matrix = np.array( + [ + [1.0, rho_vector[i], 0.0], + [rho_vector[i], 1.0, 0.0], + [0.0, 0.0, v_vector[i] ** 2], + ] ) + z[i] = rng.multivariate_normal([0.0, 0.0, m_vector[i]], cov_matrix) n[i] = rng.poisson(lambda_vector[i] * dt) dw_stock = z[:, 0] @@ -225,9 +232,7 @@ def align_params_length( - The aligned regime parameter dictionary. - The maximum length (number of steps) for this regime. 
""" - max_len = max( - len(v) if isinstance(v, list) else 1 for v in regime_params.values() - ) + max_len = max(len(v) if isinstance(v, list) else 1 for v in regime_params.values()) aligned_params: Dict[str, List[float]] = {} for key, value in regime_params.items(): @@ -236,7 +241,7 @@ def align_params_length( # Extend list by repeating last value aligned_params[key] = value + [value[-1]] * (max_len - len(value)) else: - aligned_params[key] = value[:max_len] # Truncate if too long + aligned_params[key] = value[:max_len] # Truncate if too long else: # Broadcast float to list aligned_params[key] = [value] * max_len @@ -274,34 +279,38 @@ def generate_prices_from_regimes( - The array of simulated regime names for each step. """ rng = np.random.default_rng(random_state) - + # 1. Simulate the Markov Chain import quantecon.markov as qe # optional dependency: RiskLabAI[synth] regime_names = list(regimes.keys()) markov_chain = qe.MarkovChain(transition_matrix, state_values=regime_names) - simulated_regimes = markov_chain.simulate( - ts_length=n_steps, random_state=rng - ) + simulated_regimes = markov_chain.simulate(ts_length=n_steps, random_state=rng) # 2. Unpack parameters based on simulated regimes param_lists: Dict[str, List[float]] = { - "mu": [], "kappa": [], "theta": [], "xi": [], - "rho": [], "lam": [], "m": [], "v": [], + "mu": [], + "kappa": [], + "theta": [], + "xi": [], + "rho": [], + "lam": [], + "m": [], + "v": [], } - + regime_path_expanded = [] - + current_step = 0 while current_step < n_steps: regime_name = simulated_regimes[current_step] params, regime_len = align_params_length(regimes[regime_name].copy()) - + steps_to_take = min(regime_len, n_steps - current_step) - + for key in param_lists: param_lists[key].extend(params[key][:steps_to_take]) - + regime_path_expanded.extend([regime_name] * steps_to_take) current_step += steps_to_take @@ -335,16 +344,15 @@ def generate_prices_from_regimes( # 6. 
Create price series with a Business Day index start_day = "2000-01-01" - business_days = pd.date_range( - start=start_day, periods=n_steps, freq="B" - ) - + business_days = pd.date_range(start=start_day, periods=n_steps, freq="B") + price_series = pd.Series(log_returns, index=business_days).ffill() prices = 100 * np.exp(price_series.cumsum()) prices.name = "Price" return prices, simulated_regimes_final + def parallel_generate_prices( number_of_paths: int, regimes: Dict[str, RegimeParams], @@ -383,7 +391,7 @@ def parallel_generate_prices( rng = np.random.default_rng(random_state) # Generate unique seeds for each parallel job random_states = rng.integers(0, 10 * number_of_paths, size=number_of_paths) - + results = Parallel(n_jobs=n_jobs)( delayed(generate_prices_from_regimes)( regimes, @@ -399,9 +407,9 @@ def parallel_generate_prices( prices_df = pd.concat(prices, axis=1) prices_df.columns = range(number_of_paths) - + simulated_regimes_df = pd.DataFrame(simulated_regimes).T simulated_regimes_df.columns = range(number_of_paths) simulated_regimes_df.index = prices_df.index - return prices_df, simulated_regimes_df \ No newline at end of file + return prices_df, simulated_regimes_df diff --git a/RiskLabAI/data/weights/__init__.py b/RiskLabAI/data/weights/__init__.py index 0b30168..87c8883 100644 --- a/RiskLabAI/data/weights/__init__.py +++ b/RiskLabAI/data/weights/__init__.py @@ -21,4 +21,4 @@ "calculate_average_uniqueness", "sample_weight_absolute_return_meta_labeling", "calculate_time_decay", -] \ No newline at end of file +] diff --git a/RiskLabAI/data/weights/sample_weights.py b/RiskLabAI/data/weights/sample_weights.py index d6ebd1a..23971b8 100644 --- a/RiskLabAI/data/weights/sample_weights.py +++ b/RiskLabAI/data/weights/sample_weights.py @@ -11,6 +11,7 @@ import pandas as pd from typing import Optional + def expand_label_for_meta_labeling( close_index: pd.Index, timestamp: pd.Series, @@ -45,7 +46,7 @@ def expand_label_for_meta_labeling( ts = 
timestamp.fillna(close_index[-1]) ts = ts[ts.index.isin(molecule)] ts = ts[ts > molecule[0]] - + if ts.empty: # Return an empty series; align in the caller will handle it return pd.Series(dtype=float) @@ -53,7 +54,7 @@ def expand_label_for_meta_labeling( # Find min/max index locations iloc_min = close_index.searchsorted(ts.index[0]) iloc_max = close_index.searchsorted(ts.max()) - + # Create a count series over the relevant time span count = pd.Series(0, index=close_index[iloc_min : iloc_max + 1]) @@ -91,23 +92,23 @@ def calculate_average_uniqueness( """ # c_t: Concurrency at each timestamp concurrency = index_matrix.sum(axis=1) - + # 1/c_t: Uniqueness at each timestamp # This is a (T x N) DataFrame, 0 where event is not active uniqueness = index_matrix.div(concurrency, axis=0).fillna(0) - + # Sum of 1/c_t for each event total_uniqueness = uniqueness.sum(axis=0) - + # Number of active periods for each event event_duration = (index_matrix > 0).sum(axis=0) - + # Average uniqueness: sum(1/c_t) / sum(I) average_uniqueness = total_uniqueness / event_duration - + # Handle events that never occurred (duration 0) average_uniqueness = average_uniqueness.fillna(0) - + return average_uniqueness @@ -146,10 +147,10 @@ def sample_weight_absolute_return_meta_labeling( concurrency_events = expand_label_for_meta_labeling( price.index, timestamp, molecule ) - + # 2. Compute absolute log returns log_return = np.log(price).diff().abs() - + # Align returns and concurrency # Use 'left' join to keep the log_return (price) index log_return, concurrency_events = log_return.align( @@ -161,38 +162,36 @@ def sample_weight_absolute_return_meta_labeling( # 3. 
Calculate weighted returns for t_in, t_out in timestamp.loc[weight.index].items(): if t_out not in log_return.index: - # Find the closest preceding index - t_out = log_return.index[log_return.index.searchsorted(t_out) - 1] - + # Find the closest preceding index + t_out = log_return.index[log_return.index.searchsorted(t_out) - 1] + # r_t / c_t # Filter concurrency > 0 to avoid division by zero relevant_concurrency = concurrency_events.loc[t_in:t_out] relevant_log_return = log_return.loc[t_in:t_out] - + active_periods = relevant_concurrency > 0 if active_periods.any(): weighted_return = ( - relevant_log_return[active_periods] / - relevant_concurrency[active_periods] + relevant_log_return[active_periods] + / relevant_concurrency[active_periods] ) weight.loc[t_in] = weighted_return.sum() else: weight.loc[t_in] = 0.0 weight = weight.abs() - + # 4. Normalize if weight.sum() == 0: # Avoid division by zero if all weights are 0 return pd.Series(1.0, index=molecule) - + weight *= len(weight) / weight.sum() return weight -def calculate_time_decay( - weight: pd.Series, clf_last_weight: float = 1.0 -) -> pd.Series: +def calculate_time_decay(weight: pd.Series, clf_last_weight: float = 1.0) -> pd.Series: """ Apply a time-decay factor to sample weights. @@ -218,7 +217,7 @@ def calculate_time_decay( The new weights with time decay applied. 
""" clf_weight = weight.sort_index().cumsum() - + if clf_last_weight < 0 or clf_last_weight > 1: raise ValueError("clf_last_weight must be between 0 and 1") @@ -229,8 +228,8 @@ def calculate_time_decay( else: slope = (1.0 - clf_last_weight) / clf_weight.iloc[-1] const = 1.0 - slope * clf_weight.iloc[-1] - + clf_weight = const + slope * clf_weight - clf_weight[clf_weight < 0] = 0.0 # Should not happen if clf_last_weight >= 0 - - return clf_weight \ No newline at end of file + clf_weight[clf_weight < 0] = 0.0 # Should not happen if clf_last_weight >= 0 + + return clf_weight diff --git a/RiskLabAI/ensemble/__init__.py b/RiskLabAI/ensemble/__init__.py index d0104fb..8ff0b5c 100644 --- a/RiskLabAI/ensemble/__init__.py +++ b/RiskLabAI/ensemble/__init__.py @@ -12,7 +12,7 @@ from .empirical_bagging_accuracy import ( BaggingClassifierAccuracy, calculate_bootstrap_accuracy, - plot_bootstrap_accuracy_distribution + plot_bootstrap_accuracy_distribution, ) # Expose all names for import @@ -20,5 +20,5 @@ "bagging_classifier_accuracy", # The function "BaggingClassifierAccuracy", # The class "calculate_bootstrap_accuracy", - "plot_bootstrap_accuracy_distribution" -] \ No newline at end of file + "plot_bootstrap_accuracy_distribution", +] diff --git a/RiskLabAI/ensemble/bagging_classifier_accuracy.py b/RiskLabAI/ensemble/bagging_classifier_accuracy.py index bd33c40..82f60a5 100644 --- a/RiskLabAI/ensemble/bagging_classifier_accuracy.py +++ b/RiskLabAI/ensemble/bagging_classifier_accuracy.py @@ -7,6 +7,7 @@ # import numpy as np <-- Removed unused import from scipy.stats import binom + def bagging_classifier_accuracy(N: int, p: float) -> float: """ Calculates the theoretical accuracy of a bagging classifier @@ -29,15 +30,15 @@ def bagging_classifier_accuracy(N: int, p: float) -> float: """ if N % 2 == 0: raise ValueError(f"Number of estimators N must be odd. Got {N}.") - + # The majority threshold. # e.g., if N=101, k=50. We need 51 or more correct votes. 
k = (N - 1) // 2 - + # Probability of k or fewer successes (P(X <= k)) prob_majority_wrong = binom.cdf(k, N, p) - + # Probability of more than k successes (P(X > k)) prob_majority_correct = 1.0 - prob_majority_wrong - - return prob_majority_correct \ No newline at end of file + + return prob_majority_correct diff --git a/RiskLabAI/ensemble/empirical_bagging_accuracy.py b/RiskLabAI/ensemble/empirical_bagging_accuracy.py index 5556d19..fdc8daf 100644 --- a/RiskLabAI/ensemble/empirical_bagging_accuracy.py +++ b/RiskLabAI/ensemble/empirical_bagging_accuracy.py @@ -15,7 +15,6 @@ from typing import List, Optional, Tuple, Dict, Any - class BaggingClassifierAccuracy: """ Evaluates a bagging classifier's accuracy using different @@ -34,7 +33,7 @@ def __init__( n_estimators: int = 1000, max_samples: int = 100, max_features: float = 1.0, - random_state: Optional[int] = None + random_state: Optional[int] = None, ): """ Initializes the BaggingClassifier. @@ -60,19 +59,19 @@ def __init__( self.random_state = random_state self.base_estimator = DecisionTreeClassifier( - criterion='entropy', + criterion="entropy", max_features=1, # Trees vote on one feature - class_weight='balanced' + class_weight="balanced", ) - + self.clf = BaggingClassifier( estimator=self.base_estimator, n_estimators=self.n_estimators, max_samples=self.max_samples, max_features=self.max_features, - random_state=self.random_state + random_state=self.random_state, ) - + self.estimators_ = None self.weights_ = None self.c_i_scores_ = None @@ -80,7 +79,7 @@ def __init__( self.class_0_ = None self.class_1_ = None - def fit(self, X: pd.DataFrame, y: pd.Series) -> 'BaggingClassifierAccuracy': + def fit(self, X: pd.DataFrame, y: pd.Series) -> "BaggingClassifierAccuracy": """ Fits the bagging classifier on the training data. 
@@ -98,14 +97,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series) -> 'BaggingClassifierAccuracy': """ self.clf.fit(X, y) self.estimators_ = self.clf.estimators_ - + # <-- ADDED: Check for binary classification and store classes if len(self.clf.classes_) != 2: raise ValueError("This class only supports binary classification.") - + self.class_0_ = self.clf.classes_[0] self.class_1_ = self.clf.classes_[1] - + return self def calculate_c_i(self, X: pd.DataFrame, y: pd.Series) -> np.ndarray: @@ -127,21 +126,17 @@ def calculate_c_i(self, X: pd.DataFrame, y: pd.Series) -> np.ndarray: """ if self.estimators_ is None: raise NotFittedError("Classifier must be fitted first. Call .fit()") - + c_i_scores = [] for tree in self.estimators_: y_pred = tree.predict(X) acc = accuracy_score(y, y_pred) c_i_scores.append(acc) - + self.c_i_scores_ = np.array(c_i_scores) return self.c_i_scores_ - def calculate_weights( - self, - X: pd.DataFrame, - y: pd.Series - ) -> Dict[str, np.ndarray]: + def calculate_weights(self, X: pd.DataFrame, y: pd.Series) -> Dict[str, np.ndarray]: """ Calculates weights for each estimator based on three schemes: 1. Uniform (w_i = 1/N) @@ -163,7 +158,7 @@ def calculate_weights( if self.c_i_scores_ is None: # calculate_c_i also checks if model is fitted self.calculate_c_i(X, y) - + c_i = self.c_i_scores_ n = len(c_i) @@ -176,22 +171,14 @@ def calculate_weights( # 3. 1 - c_i^2 weights (proportional to variance) c_i_squared = c_i**2 - w_variance = 1. 
- c_i_squared + w_variance = 1.0 - c_i_squared sum_w_var = np.sum(w_variance) w_variance = w_variance / sum_w_var if sum_w_var != 0 else w_uniform - - self.weights_ = { - 'uniform': w_uniform, - 'c_i': w_c_i, - 'variance': w_variance - } + + self.weights_ = {"uniform": w_uniform, "c_i": w_c_i, "variance": w_variance} return self.weights_ - def predict( - self, - X: pd.DataFrame, - weight_scheme: str = 'uniform' - ) -> np.ndarray: + def predict(self, X: pd.DataFrame, weight_scheme: str = "uniform") -> np.ndarray: """ Predicts class labels for X using the specified weighting scheme. @@ -210,33 +197,37 @@ def predict( # <-- UPDATED: Check fit status first if self.estimators_ is None: raise NotFittedError("Classifier must be fitted first. Call .fit()") - + if self.weights_ is None: # <-- UPDATED: More specific error - raise NotFittedError("Weights must be calculated first. Call .calculate_weights()") - + raise NotFittedError( + "Weights must be calculated first. Call .calculate_weights()" + ) + if weight_scheme not in self.weights_: - raise ValueError(f"Unknown weight_scheme: {weight_scheme}. " - f"Must be one of {list(self.weights_.keys())}") + raise ValueError( + f"Unknown weight_scheme: {weight_scheme}. 
" + f"Must be one of {list(self.weights_.keys())}" + ) weights = self.weights_[weight_scheme] - + # Get predictions from each tree # (N_samples, N_estimators) tree_preds = np.array([tree.predict(X) for tree in self.estimators_]).T - + # <-- UPDATED: Convert labels {class_0, class_1} to {-1, 1} # Map class_1 to 1, and class_0 to -1 tree_preds_signed = np.where(tree_preds == self.class_1_, 1, -1) - + # Calculate weighted average vote # (N_samples, N_estimators) * (N_estimators,) -> (N_samples,) weighted_votes = np.dot(tree_preds_signed, weights) - + # <-- UPDATED: Convert vote back to {class_0, class_1} # Positive vote -> class_1, Negative or Zero vote -> class_0 y_pred = np.where(weighted_votes > 0, self.class_1_, self.class_0_) - + return y_pred def evaluate_all_schemes( @@ -244,7 +235,7 @@ def evaluate_all_schemes( X_test: pd.DataFrame, y_test: pd.Series, X_train: pd.DataFrame, - y_train: pd.Series + y_train: pd.Series, ) -> Dict[str, float]: """ Fits, calculates weights, and evaluates accuracy for all @@ -268,26 +259,24 @@ def evaluate_all_schemes( """ # Fit classifier self.fit(X_train, y_train) - + # Calculate weights (uses X_train, y_train implicitly) self.calculate_weights(X_train, y_train) - + accuracies = {} for scheme in self.weights_.keys(): y_pred = self.predict(X_test, weight_scheme=scheme) acc = accuracy_score(y_test, y_pred) accuracies[scheme] = acc - + return accuracies # --- Standalone Functions for Bootstrap Analysis --- + def calculate_bootstrap_accuracy( - clf: BaggingClassifier, - X: pd.DataFrame, - y: pd.Series, - n_bootstraps: int = 1000 + clf: BaggingClassifier, X: pd.DataFrame, y: pd.Series, n_bootstraps: int = 1000 ) -> Tuple[np.ndarray, float, float]: """ Calculates the accuracy of a bagging classifier over multiple @@ -315,27 +304,27 @@ def calculate_bootstrap_accuracy( """ a_n_values = [] n_samples = len(y) - + # Use indices from the original X/y DataFrames/Series indices = X.index - + # --- CHANGE: Fixed typo n_bootstraMps -> 
n_bootstraps --- for _ in range(n_bootstraps): - # --- END CHANGE --- + # --- END CHANGE --- # Sample test set with replacement boot_indices = np.random.choice(indices, n_samples, replace=True) X_boot = X.loc[boot_indices] y_boot = y.loc[boot_indices] - + # Predict on the bootstrapped sample y_pred = clf.predict(X_boot) acc = accuracy_score(y_boot, y_pred) a_n_values.append(acc) - + a_n_values = np.array(a_n_values) a_n_mean = np.mean(a_n_values) a_n_std = np.std(a_n_values, ddof=1) - + return a_n_values, a_n_mean, a_n_std @@ -343,7 +332,7 @@ def plot_bootstrap_accuracy_distribution( a_n_values: np.ndarray, a_n_mean: float, a_n_std: float, - ax: Optional["plt.Axes"] = None + ax: Optional["plt.Axes"] = None, ) -> "plt.Axes": """ Plots the distribution of bootstrapped accuracy scores. @@ -370,13 +359,15 @@ def plot_bootstrap_accuracy_distribution( if ax is None: fig, ax = plt.subplots(figsize=(10, 6)) - sns.histplot(a_n_values, kde=True, ax=ax, stat='density', label='Empirical Distribution') - + sns.histplot( + a_n_values, kde=True, ax=ax, stat="density", label="Empirical Distribution" + ) + # Overlay a normal distribution x_min, x_max = ax.get_xlim() x = np.linspace(x_min, x_max, 100) p = norm.pdf(x, a_n_mean, a_n_std) - ax.plot(x, p, 'k', linewidth=2, label=f'Normal(ΞΌ={a_n_mean:.3f}, Οƒ={a_n_std:.3f})') - + ax.plot(x, p, "k", linewidth=2, label=f"Normal(ΞΌ={a_n_mean:.3f}, Οƒ={a_n_std:.3f})") + ax.legend() - return ax \ No newline at end of file + return ax diff --git a/RiskLabAI/features/entropy_features/__init__.py b/RiskLabAI/features/entropy_features/__init__.py index d9b2679..b725cf1 100644 --- a/RiskLabAI/features/entropy_features/__init__.py +++ b/RiskLabAI/features/entropy_features/__init__.py @@ -18,4 +18,4 @@ "plug_in_entropy_estimator", "kontoyiannis_entropy", "longest_match_length", -] \ No newline at end of file +] diff --git a/RiskLabAI/features/entropy_features/kontoyiannis.py b/RiskLabAI/features/entropy_features/kontoyiannis.py index 
7353a1d..84fe66e 100644 --- a/RiskLabAI/features/entropy_features/kontoyiannis.py +++ b/RiskLabAI/features/entropy_features/kontoyiannis.py @@ -5,9 +5,8 @@ from math import log2 from typing import Tuple, Optional -def longest_match_length( - message: str, i: int, n: int -) -> Tuple[int, str]: + +def longest_match_length(message: str, i: int, n: int) -> Tuple[int, str]: """ Find the length of the longest match for the substring starting at `i`. @@ -33,11 +32,11 @@ def longest_match_length( # Iterate through possible lengths `l` for l in range(1, n + 1): pattern = message[i : i + l] - + # Stop if pattern goes beyond message length if i + l > len(message): break - + found = False # Look back in the window [max(0, i-n), i-1] for j in range(max(0, i - n), i): @@ -46,7 +45,7 @@ def longest_match_length( longest_match = pattern found = True break - + # If pattern of length `l` was not found, the # longest match was of length `l-1`. if not found: @@ -55,9 +54,7 @@ def longest_match_length( return len(longest_match) + 1, longest_match -def kontoyiannis_entropy( - message: str, window: Optional[int] = None -) -> float: +def kontoyiannis_entropy(message: str, window: Optional[int] = None) -> float: r""" Calculate Kontoyiannis Entropy (an LZ78-based estimator). @@ -98,14 +95,15 @@ def kontoyiannis_entropy( for i in points: n = i if window is None else window - if n == 0: continue # Avoid log2(0) - + if n == 0: + continue # Avoid log2(0) + l_i, _ = longest_match_length(message, i, n) - + sum_h += log2(n) / l_i num_points += 1 if num_points == 0: return 0.0 - return sum_h / num_points \ No newline at end of file + return sum_h / num_points diff --git a/RiskLabAI/features/entropy_features/lempel_ziv.py b/RiskLabAI/features/entropy_features/lempel_ziv.py index 2ec8595..24f375c 100644 --- a/RiskLabAI/features/entropy_features/lempel_ziv.py +++ b/RiskLabAI/features/entropy_features/lempel_ziv.py @@ -2,6 +2,7 @@ Implements the Lempel-Ziv (LZ) Entropy estimator. 
""" + def lempel_ziv_entropy(message: str) -> float: """ Calculate the Lempel-Ziv (LZ) complexity as an entropy estimator. @@ -32,9 +33,9 @@ def lempel_ziv_entropy(message: str) -> float: # Find the longest substring starting at `i` that is *not* in the library while j < message_length and message[i : j + 1] in library: j += 1 - + # Add the new, unseen substring to the library library.add(message[i : j + 1]) i = j + 1 - return len(library) / message_length \ No newline at end of file + return len(library) / message_length diff --git a/RiskLabAI/features/entropy_features/plug_in.py b/RiskLabAI/features/entropy_features/plug_in.py index 238935c..f30b020 100644 --- a/RiskLabAI/features/entropy_features/plug_in.py +++ b/RiskLabAI/features/entropy_features/plug_in.py @@ -6,9 +6,8 @@ from typing import Dict from .pmf import probability_mass_function -def plug_in_entropy_estimator( - message: str, approximate_word_length: int = 1 -) -> float: + +def plug_in_entropy_estimator(message: str, approximate_word_length: int = 1) -> float: """ Calculate the Plug-in Entropy Estimator (based on n-gram PMF). 
@@ -29,14 +28,12 @@ def plug_in_entropy_estimator( """ if not message: return 0.0 - + pmf = probability_mass_function(message, approximate_word_length) if not pmf: return 0.0 - plug_in_entropy = -sum( - p * log2(p) for p in pmf.values() if p > 0 - ) - + plug_in_entropy = -sum(p * log2(p) for p in pmf.values() if p > 0) + # Normalize by word length - return plug_in_entropy / approximate_word_length \ No newline at end of file + return plug_in_entropy / approximate_word_length diff --git a/RiskLabAI/features/entropy_features/pmf.py b/RiskLabAI/features/entropy_features/pmf.py index 7831c7f..434e184 100644 --- a/RiskLabAI/features/entropy_features/pmf.py +++ b/RiskLabAI/features/entropy_features/pmf.py @@ -5,6 +5,7 @@ from collections import Counter from typing import Dict + def probability_mass_function( message: str, approximate_word_length: int ) -> Dict[str, float]: @@ -25,7 +26,7 @@ def probability_mass_function( """ if not message or len(message) < approximate_word_length: return {} - + # Find all n-grams (words) library = Counter( message[i : i + approximate_word_length] @@ -38,8 +39,6 @@ def probability_mass_function( return {} # Calculate probability for each n-gram - pmf = { - key: count / num_windows for key, count in library.items() - } + pmf = {key: count / num_windows for key, count in library.items()} - return pmf \ No newline at end of file + return pmf diff --git a/RiskLabAI/features/entropy_features/shannon.py b/RiskLabAI/features/entropy_features/shannon.py index 6194d0c..cfc5ce9 100644 --- a/RiskLabAI/features/entropy_features/shannon.py +++ b/RiskLabAI/features/entropy_features/shannon.py @@ -5,6 +5,7 @@ from collections import Counter from math import log2 + def shannon_entropy(message: str) -> float: """ Calculate the Shannon Entropy of a message. 
@@ -30,4 +31,4 @@ def shannon_entropy(message: str) -> float: for count in character_counts.values() ) - return entropy \ No newline at end of file + return entropy diff --git a/RiskLabAI/features/feature_importance/__init__.py b/RiskLabAI/features/feature_importance/__init__.py index 2f9028a..816d2c0 100644 --- a/RiskLabAI/features/feature_importance/__init__.py +++ b/RiskLabAI/features/feature_importance/__init__.py @@ -40,22 +40,19 @@ "FeatureImportanceStrategy", "FeatureImportanceFactory", "FeatureImportanceController", - # Strategy Implementations "FeatureImportanceMDI", "ClusteredFeatureImportanceMDI", "FeatureImportanceMDA", "ClusteredFeatureImportanceMDA", "FeatureImportanceSFI", - # Imported utilities from *other* modules "cluster_k_means_top", "random_block_correlation", "form_true_matrix", "simulates_cov_mu", - # Utilities *from this* module "get_test_dataset", "orthogonal_features", "calculate_weighted_tau", -] \ No newline at end of file +] diff --git a/RiskLabAI/features/feature_importance/clustered_feature_importance_mda.py b/RiskLabAI/features/feature_importance/clustered_feature_importance_mda.py index 83eae73..785cf0b 100644 --- a/RiskLabAI/features/feature_importance/clustered_feature_importance_mda.py +++ b/RiskLabAI/features/feature_importance/clustered_feature_importance_mda.py @@ -13,6 +13,7 @@ logger = logging.getLogger(__name__) + class ClusteredFeatureImportanceMDA(FeatureImportanceStrategy): """ Computes clustered feature importance using MDA. @@ -21,7 +22,6 @@ class ClusteredFeatureImportanceMDA(FeatureImportanceStrategy): and measures the decrease in model performance. """ - def __init__( self, classifier: object, @@ -48,7 +48,6 @@ def __init__( self.n_splits = n_splits self.random_state = random_state - def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: """ Compute Clustered MDA feature importance. 
@@ -69,8 +68,8 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: DataFrame with "Mean" and "StandardDeviation" of importance for each *cluster*. """ - train_weights = kwargs.get('train_sample_weights') - score_weights = kwargs.get('score_sample_weights') + train_weights = kwargs.get("train_sample_weights") + score_weights = kwargs.get("score_sample_weights") if train_weights is None: train_weights = np.ones(x.shape[0]) @@ -81,7 +80,7 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: n_splits=self.n_splits, shuffle=True, random_state=self.random_state ) baseline_scores = pd.Series(dtype=float) - shuffled_scores = pd.DataFrame(columns=self.clusters.keys(), dtype=float) + shuffled_scores = pd.DataFrame(columns=self.clusters.keys(), dtype=float) for i, (train_idx, test_idx) in enumerate(cv_generator.split(X=x)): logger.debug("Fold %d start ...", i) @@ -113,31 +112,30 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: rng = np.random.default_rng(self.random_state + i) for cluster_name in shuffled_scores.columns: x_test_shuffled = x_test.copy(deep=True) - + # --- CORRECTED SHUFFLING LOGIC --- # Get all feature names for this cluster cluster_cols = self.clusters[cluster_name] - - if not cluster_cols: # Skip if cluster is empty + + if not cluster_cols: # Skip if cluster is empty shuffled_scores.loc[i, cluster_name] = baseline_scores.loc[i] continue - + # Get the underlying numpy array for these columns cluster_data = x_test_shuffled[cluster_cols].values.copy() - + # Shuffle the rows of this array in-place. # This applies the *same* permutation to all features # in the cluster, preserving intra-cluster correlation. 
rng.shuffle(cluster_data) - + # Assign the shuffled data back x_test_shuffled[cluster_cols] = cluster_data # --- END CORRECTION --- - + prob = classifier_fit.predict_proba(x_test_shuffled) shuffled_scores.loc[i, cluster_name] = -log_loss( - y_test, prob, labels=self.classifier.classes_, - sample_weight=w_test + y_test, prob, labels=self.classifier.classes_, sample_weight=w_test ) # Calculate importance as the simple drop in score @@ -154,7 +152,5 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: axis=1, ) - importances_summary.index = [ - f"C_{i}" for i in importances_summary.index - ] - return importances_summary \ No newline at end of file + importances_summary.index = [f"C_{i}" for i in importances_summary.index] + return importances_summary diff --git a/RiskLabAI/features/feature_importance/clustered_feature_importance_mdi.py b/RiskLabAI/features/feature_importance/clustered_feature_importance_mdi.py index 18e723d..73fea1e 100644 --- a/RiskLabAI/features/feature_importance/clustered_feature_importance_mdi.py +++ b/RiskLabAI/features/feature_importance/clustered_feature_importance_mdi.py @@ -8,6 +8,7 @@ from typing import Dict, List, Any from .feature_importance_strategy import FeatureImportanceStrategy + class ClusteredFeatureImportanceMDI(FeatureImportanceStrategy): """ Computes Clustered MDI feature importance. 
@@ -47,13 +48,13 @@ def _group_mean_std( for cluster_name, feature_names in clusters.items(): # Sum importance for all features in the cluster cluster_data = dataframe[feature_names].sum(axis=1) - + cluster_mean = cluster_data.mean() cluster_std = cluster_data.std() - + output.loc[f"C_{cluster_name}", "Mean"] = cluster_mean - output.loc[f"C_{cluster_name}", "StandardDeviation"] = ( - cluster_std * (cluster_data.shape[0] ** -0.5) + output.loc[f"C_{cluster_name}", "StandardDeviation"] = cluster_std * ( + cluster_data.shape[0] ** -0.5 ) return output @@ -77,29 +78,29 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: DataFrame with "Mean" and "StandardDeviation" of importance for each *cluster*. """ - train_sample_weights = kwargs.get('sample_weight') - + train_sample_weights = kwargs.get("sample_weight") + # Fit the classifier self.classifier.fit(x, y, sample_weight=train_sample_weights) - + # Get importance from each tree importances_dict = { i: tree.feature_importances_ for i, tree in enumerate(self.classifier.estimators_) } importances_df = pd.DataFrame.from_dict(importances_dict, orient="index") - - if hasattr(self.classifier, 'feature_names_in_'): - importances_df.columns = self.classifier.feature_names_in_ + + if hasattr(self.classifier, "feature_names_in_"): + importances_df.columns = self.classifier.feature_names_in_ else: - importances_df.columns = x.columns + importances_df.columns = x.columns # Replace 0 with NaN importances_df.replace(0, np.nan, inplace=True) # Group by cluster aggregated_importances = self._group_mean_std(importances_df, self.clusters) - + # Normalize aggregated_importances /= aggregated_importances["Mean"].sum() - return aggregated_importances \ No newline at end of file + return aggregated_importances diff --git a/RiskLabAI/features/feature_importance/feature_importance_controller.py b/RiskLabAI/features/feature_importance/feature_importance_controller.py index ddc7988..8eb656f 100644 --- 
a/RiskLabAI/features/feature_importance/feature_importance_controller.py +++ b/RiskLabAI/features/feature_importance/feature_importance_controller.py @@ -7,6 +7,7 @@ from .feature_importance_factory import FeatureImportanceFactory from .feature_importance_strategy import FeatureImportanceStrategy + class FeatureImportanceController: """ Controller class to manage and execute feature importance strategies. @@ -16,14 +17,14 @@ class FeatureImportanceController: .. code-block:: python from sklearn.ensemble import RandomForestClassifier - + my_classifier = RandomForestClassifier(n_estimators=10, seed=42) my_clusters = {'cluster_0': ['feat_0', 'feat_1']} # Initialize the controller controller = FeatureImportanceController( 'ClusteredMDA', - classifier=my_classifier, + classifier=my_classifier, clusters=my_clusters, n_splits=10 ) @@ -45,9 +46,7 @@ def __init__(self, strategy_type: str, **kwargs: Any): constructor (e.g., `classifier`, `clusters`, `n_splits`). """ self.strategy_instance: FeatureImportanceStrategy = ( - FeatureImportanceFactory.create_feature_importance( - strategy_type, **kwargs - ) + FeatureImportanceFactory.create_feature_importance(strategy_type, **kwargs) ) def calculate_importance( @@ -65,10 +64,10 @@ def calculate_importance( **kwargs : Any Additional arguments to pass to the strategy's `compute` method (e.g., `sample_weight`). - + Returns ------- pd.DataFrame Feature importance results. 
""" - return self.strategy_instance.compute(x, y, **kwargs) \ No newline at end of file + return self.strategy_instance.compute(x, y, **kwargs) diff --git a/RiskLabAI/features/feature_importance/feature_importance_factory.py b/RiskLabAI/features/feature_importance/feature_importance_factory.py index 40d49f4..510660b 100644 --- a/RiskLabAI/features/feature_importance/feature_importance_factory.py +++ b/RiskLabAI/features/feature_importance/feature_importance_factory.py @@ -10,6 +10,7 @@ from .clustered_feature_importance_mda import ClusteredFeatureImportanceMDA from .feature_importance_sfi import FeatureImportanceSFI + class FeatureImportanceFactory: """ Factory class to create feature importance strategy instances. @@ -42,7 +43,7 @@ def create_feature_importance( ValueError If an invalid `strategy_type` is provided. """ - + strategies: Dict[str, Type[FeatureImportanceStrategy]] = { "MDI": FeatureImportanceMDI, "ClusteredMDI": ClusteredFeatureImportanceMDI, @@ -50,20 +51,19 @@ def create_feature_importance( "ClusteredMDA": ClusteredFeatureImportanceMDA, "SFI": FeatureImportanceSFI, } - + strategy_class = strategies.get(strategy_type) - + if strategy_class: # Pass only the relevant arguments to the constructor # This uses introspection to be robust import inspect + sig = inspect.signature(strategy_class.__init__) - valid_kwargs = { - k: v for k, v in kwargs.items() if k in sig.parameters - } + valid_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters} return strategy_class(**valid_kwargs) - + raise ValueError( f"Invalid strategy_type: {strategy_type}. 
" f"Valid types are: {list(strategies.keys())}" - ) \ No newline at end of file + ) diff --git a/RiskLabAI/features/feature_importance/feature_importance_mda.py b/RiskLabAI/features/feature_importance/feature_importance_mda.py index ec003ac..4c5bd59 100644 --- a/RiskLabAI/features/feature_importance/feature_importance_mda.py +++ b/RiskLabAI/features/feature_importance/feature_importance_mda.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + class FeatureImportanceMDA(FeatureImportanceStrategy): """ Computes feature importance using Mean Decrease Accuracy (MDA). @@ -20,7 +21,6 @@ class FeatureImportanceMDA(FeatureImportanceStrategy): much the model's performance (e.g., log loss) decreases. """ - def __init__(self, classifier: object, n_splits: int = 10, random_state: int = 42): """ Initialize the strategy. @@ -36,7 +36,6 @@ def __init__(self, classifier: object, n_splits: int = 10, random_state: int = 4 self.n_splits = n_splits self.random_state = random_state - def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: """ Compute MDA feature importance. @@ -56,16 +55,17 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: pd.DataFrame DataFrame with "Mean" and "StandardDeviation" of importance. 
""" - train_weights = kwargs.get('train_sample_weights') - score_weights = kwargs.get('score_sample_weights') - + train_weights = kwargs.get("train_sample_weights") + score_weights = kwargs.get("score_sample_weights") + if train_weights is None: train_weights = np.ones(x.shape[0]) if score_weights is None: score_weights = np.ones(x.shape[0]) - - cv_generator = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state) + cv_generator = KFold( + n_splits=self.n_splits, shuffle=True, random_state=self.random_state + ) baseline_scores = pd.Series(dtype=float) shuffled_scores = pd.DataFrame(columns=x.columns, dtype=float) @@ -97,7 +97,7 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: ) # Get scores for each shuffled feature - rng = np.random.default_rng(self.random_state + i) + rng = np.random.default_rng(self.random_state + i) for feature in x.columns: x_test_shuffled = x_test.copy(deep=True) # Shuffle a copy and assign back: with pandas copy-on-write @@ -108,17 +108,17 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: x_test_shuffled[feature] = shuffled_values shuffled_proba = fitted_classifier.predict_proba(x_test_shuffled) - + shuffled_scores.loc[i, feature] = -log_loss( y_test, shuffled_proba, labels=self.classifier.classes_, - sample_weight=w_test + sample_weight=w_test, ) # Calculate importance as the simple drop in score importances = shuffled_scores.rsub(baseline_scores, axis=0) - + # Calculate mean and std dev importances_summary = pd.concat( { @@ -130,4 +130,4 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: axis=1, ) - return importances_summary \ No newline at end of file + return importances_summary diff --git a/RiskLabAI/features/feature_importance/feature_importance_mdi.py b/RiskLabAI/features/feature_importance/feature_importance_mdi.py index 44563eb..0a0f92b 100644 --- 
a/RiskLabAI/features/feature_importance/feature_importance_mdi.py +++ b/RiskLabAI/features/feature_importance/feature_importance_mdi.py @@ -6,9 +6,10 @@ import numpy as np from typing import List, Optional, Union, Any from sklearn.ensemble import BaseEnsemble -from sklearn.ensemble import BaseEnsemble +from sklearn.ensemble import BaseEnsemble from .feature_importance_strategy import FeatureImportanceStrategy + class FeatureImportanceMDI(FeatureImportanceStrategy): """ Computes feature importance using Mean Decrease Impurity (MDI). @@ -50,8 +51,8 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: pd.DataFrame DataFrame with "Mean" and "StandardDeviation" of importance. """ - train_sample_weights = kwargs.get('sample_weight') - + train_sample_weights = kwargs.get("sample_weight") + # Fit the classifier self.classifier.fit(x, y, sample_weight=train_sample_weights) @@ -61,12 +62,12 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: for i, tree in enumerate(self.classifier.estimators_) } importances_df = pd.DataFrame.from_dict(importances_dict, orient="index") - + # Ensure correct feature names - if hasattr(self.classifier, 'feature_names_in_'): - importances_df.columns = self.classifier.feature_names_in_ + if hasattr(self.classifier, "feature_names_in_"): + importances_df.columns = self.classifier.feature_names_in_ else: - importances_df.columns = x.columns + importances_df.columns = x.columns # Replace 0 with NaN (as per user's original code) importances_df.replace(0, np.nan, inplace=True) @@ -75,8 +76,7 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: { "Mean": importances_df.mean(), "StandardDeviation": ( - importances_df.std() - * (importances_df.shape[0] ** -0.5) + importances_df.std() * (importances_df.shape[0] ** -0.5) ), }, axis=1, @@ -84,4 +84,4 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: # Normalize importances /= 
importances["Mean"].sum() - return importances \ No newline at end of file + return importances diff --git a/RiskLabAI/features/feature_importance/feature_importance_sfi.py b/RiskLabAI/features/feature_importance/feature_importance_sfi.py index bb498cd..b0ec307 100644 --- a/RiskLabAI/features/feature_importance/feature_importance_sfi.py +++ b/RiskLabAI/features/feature_importance/feature_importance_sfi.py @@ -9,6 +9,7 @@ from typing import List, Optional, Union, Any from .feature_importance_strategy import FeatureImportanceStrategy + class FeatureImportanceSFI(FeatureImportanceStrategy): """ Computes Single Feature Importance (SFI). @@ -59,9 +60,9 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: DataFrame with "FeatureName", "Mean", and "StandardDeviation" of the SFI scores. """ - train_sample_weights = kwargs.get('train_sample_weights') - score_sample_weights = kwargs.get('score_sample_weights') - + train_sample_weights = kwargs.get("train_sample_weights") + score_sample_weights = kwargs.get("score_sample_weights") + if train_sample_weights is None: train_sample_weights = np.ones(x.shape[0]) if score_sample_weights is None: @@ -99,9 +100,7 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: ) elif self.scoring == "accuracy": pred = self.classifier.predict(x_test) - score = accuracy_score( - y_test, pred, sample_weight=w_test - ) + score = accuracy_score(y_test, pred, sample_weight=w_test) else: raise ValueError(f"'{self.scoring}' method not defined.") @@ -111,9 +110,8 @@ def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: { "FeatureName": feature_name, "Mean": np.mean(scores), - "StandardDeviation": np.std(scores, ddof=1) - * (len(scores) ** -0.5), + "StandardDeviation": np.std(scores, ddof=1) * (len(scores) ** -0.5), } ) - return pd.DataFrame(importances).set_index("FeatureName") \ No newline at end of file + return pd.DataFrame(importances).set_index("FeatureName") 
diff --git a/RiskLabAI/features/feature_importance/feature_importance_strategy.py b/RiskLabAI/features/feature_importance/feature_importance_strategy.py index da6140a..c06c548 100644 --- a/RiskLabAI/features/feature_importance/feature_importance_strategy.py +++ b/RiskLabAI/features/feature_importance/feature_importance_strategy.py @@ -6,6 +6,7 @@ import pandas as pd from typing import Any + class FeatureImportanceStrategy(ABC): """ Abstract Base Class for computing feature importance. @@ -14,9 +15,7 @@ class FeatureImportanceStrategy(ABC): """ @abstractmethod - def compute( - self, x: pd.DataFrame, y: pd.Series, **kwargs: Any - ) -> pd.DataFrame: + def compute(self, x: pd.DataFrame, y: pd.Series, **kwargs: Any) -> pd.DataFrame: """ Abstract method to compute feature importance. @@ -34,4 +33,4 @@ def compute( pd.DataFrame A DataFrame containing feature importances. """ - pass \ No newline at end of file + pass diff --git a/RiskLabAI/features/feature_importance/generate_synthetic_data.py b/RiskLabAI/features/feature_importance/generate_synthetic_data.py index 5624776..3637656 100644 --- a/RiskLabAI/features/feature_importance/generate_synthetic_data.py +++ b/RiskLabAI/features/feature_importance/generate_synthetic_data.py @@ -9,6 +9,7 @@ from sklearn.datasets import make_classification from typing import Tuple + def get_test_dataset( n_features: int = 100, n_informative: int = 25, @@ -63,19 +64,18 @@ def get_test_dataset( # 3. 
Create redundant features # Randomly pick informative features to copy - redundant_indices = rng.choice( - range(n_informative), size=n_redundant, replace=True - ) - + redundant_indices = rng.choice(range(n_informative), size=n_redundant, replace=True) + for i, orig_idx in enumerate(redundant_indices): orig_feature_name = f"I_{orig_idx}" new_feature_name = f"R_{i}" - + # Add noise to the original informative feature - noise = rng.normal( - loc=0.0, scale=sigma_std, size=n_samples - ) * x_df[orig_feature_name].std() - + noise = ( + rng.normal(loc=0.0, scale=sigma_std, size=n_samples) + * x_df[orig_feature_name].std() + ) + x_df[new_feature_name] = x_df[orig_feature_name] + noise - return x_df[sorted(x_df.columns)], y_series \ No newline at end of file + return x_df[sorted(x_df.columns)], y_series diff --git a/RiskLabAI/features/feature_importance/orthogonal_features.py b/RiskLabAI/features/feature_importance/orthogonal_features.py index fe0be68..37a659c 100644 --- a/RiskLabAI/features/feature_importance/orthogonal_features.py +++ b/RiskLabAI/features/feature_importance/orthogonal_features.py @@ -9,6 +9,7 @@ import numpy as np from typing import Tuple + def _compute_eigenvectors( dot_product: np.ndarray, explained_variance_threshold: float ) -> pd.DataFrame: @@ -48,7 +49,7 @@ def _compute_eigenvectors( # Find the index where cumulative variance crosses the threshold index = cumulative_variance.searchsorted(explained_variance_threshold) - + # Keep components up to and including the one that crosses the threshold eigen_dataframe = eigen_dataframe.iloc[: index + 1, :] @@ -76,11 +77,11 @@ def orthogonal_features( """ # 1. Normalize features (z-score) normalized_features = (features - features.mean(axis=0)) / features.std(axis=0) - normalized_features = normalized_features.dropna(axis=1) # Drop constant cols - + normalized_features = normalized_features.dropna(axis=1) # Drop constant cols + # 2. 
Compute dot product (proportional to covariance) dot_product = normalized_features.T @ normalized_features - + # 3. Get principal components eigen_dataframe = _compute_eigenvectors(dot_product, variance_threshold) @@ -89,11 +90,11 @@ def orthogonal_features( # 5. Transform features orthogonal_features_arr = normalized_features.values @ transformation_matrix - + orthogonal_features_df = pd.DataFrame( orthogonal_features_arr, index=features.index, columns=eigen_dataframe.index, ) - return orthogonal_features_df, eigen_dataframe \ No newline at end of file + return orthogonal_features_df, eigen_dataframe diff --git a/RiskLabAI/features/feature_importance/weighted_tau.py b/RiskLabAI/features/feature_importance/weighted_tau.py index 8d680c7..e024624 100644 --- a/RiskLabAI/features/feature_importance/weighted_tau.py +++ b/RiskLabAI/features/feature_importance/weighted_tau.py @@ -5,6 +5,7 @@ import scipy.stats as stats import numpy as np + def calculate_weighted_tau( feature_importances: np.ndarray, principal_component_ranks: np.ndarray ) -> float: @@ -30,6 +31,6 @@ def calculate_weighted_tau( """ # Weights are the inverse of the rank weights = 1.0 / principal_component_ranks - + tau, _ = stats.weightedtau(feature_importances, weights) - return tau \ No newline at end of file + return tau diff --git a/RiskLabAI/features/microstructural_features/__init__.py b/RiskLabAI/features/microstructural_features/__init__.py index 60dd6b3..f1f2f57 100644 --- a/RiskLabAI/features/microstructural_features/__init__.py +++ b/RiskLabAI/features/microstructural_features/__init__.py @@ -23,4 +23,4 @@ "corwin_schultz_estimator", "sigma_estimates", "bekker_parkinson_volatility_estimates", -] \ No newline at end of file +] diff --git a/RiskLabAI/features/microstructural_features/bekker_parkinson_volatility_estimator.py b/RiskLabAI/features/microstructural_features/bekker_parkinson_volatility_estimator.py index daf8826..d0afdb7 100644 --- 
a/RiskLabAI/features/microstructural_features/bekker_parkinson_volatility_estimator.py +++ b/RiskLabAI/features/microstructural_features/bekker_parkinson_volatility_estimator.py @@ -10,11 +10,8 @@ from math import pi import pandas as pd import numpy as np -from .corwin_schultz import ( - beta_estimates, - gamma_estimates, - _DENOMINATOR -) +from .corwin_schultz import beta_estimates, gamma_estimates, _DENOMINATOR + def sigma_estimates(beta: pd.Series, gamma: pd.Series) -> pd.Series: r""" @@ -42,7 +39,7 @@ def sigma_estimates(beta: pd.Series, gamma: pd.Series) -> pd.Series: term1 = (2**0.5 - 1) * (beta**0.5) / _DENOMINATOR term2 = (gamma / (k2**2 * _DENOMINATOR)) ** 0.5 - + # Floor at zero sigma = np.maximum(term1 + term2, 0) @@ -75,4 +72,4 @@ def bekker_parkinson_volatility_estimates( beta = beta_estimates(high_prices, low_prices, window_span) gamma = gamma_estimates(high_prices, low_prices) - return sigma_estimates(beta, gamma) \ No newline at end of file + return sigma_estimates(beta, gamma) diff --git a/RiskLabAI/features/microstructural_features/corwin_schultz.py b/RiskLabAI/features/microstructural_features/corwin_schultz.py index 416955a..b773650 100644 --- a/RiskLabAI/features/microstructural_features/corwin_schultz.py +++ b/RiskLabAI/features/microstructural_features/corwin_schultz.py @@ -40,10 +40,10 @@ def beta_estimates( The estimated \(\beta\) vector. 
""" log_ratios_sq = np.log(high_prices / low_prices) ** 2 - + # Sum of current and previous day's squared log-ratio beta = log_ratios_sq.rolling(window=2).sum() - + # Average over the window span beta = beta.rolling(window=window_span).mean() return beta @@ -97,7 +97,7 @@ def alpha_estimates(beta: pd.Series, gamma: pd.Series) -> pd.Series: """ term1 = ((2**0.5) - 1) * (beta**0.5) / _DENOMINATOR term2 = (gamma / _DENOMINATOR) ** 0.5 - + # Floor at zero alpha = np.maximum(term1 - term2, 0) return alpha @@ -129,7 +129,7 @@ def corwin_schultz_estimator( beta = beta_estimates(high_prices, low_prices, window_span) gamma = gamma_estimates(high_prices, low_prices) alpha = alpha_estimates(beta, gamma) - + # Calculate spread spread = 2 * (np.exp(alpha) - 1) / (1 + np.exp(alpha)) - return spread \ No newline at end of file + return spread diff --git a/RiskLabAI/features/structural_breaks/__init__.py b/RiskLabAI/features/structural_breaks/__init__.py index de850ae..6752482 100644 --- a/RiskLabAI/features/structural_breaks/__init__.py +++ b/RiskLabAI/features/structural_breaks/__init__.py @@ -20,4 +20,4 @@ "compute_beta", "get_expanding_window_adf", "get_bsadf_statistic", -] \ No newline at end of file +] diff --git a/RiskLabAI/features/structural_breaks/structural_breaks.py b/RiskLabAI/features/structural_breaks/structural_breaks.py index 7a7f02f..0f1f405 100644 --- a/RiskLabAI/features/structural_breaks/structural_breaks.py +++ b/RiskLabAI/features/structural_breaks/structural_breaks.py @@ -11,6 +11,7 @@ import pandas as pd from typing import List, Union, Tuple, Dict, Any + def lag_dataframe( market_data: pd.DataFrame, lags: Union[int, List[int]] ) -> pd.DataFrame: @@ -34,7 +35,7 @@ def lag_dataframe( A DataFrame with lagged columns, e.g., 'price_0', 'price_1', ... 
""" lagged_parts = [] - + if isinstance(lags, int): lags_list = range(lags + 1) else: @@ -74,7 +75,7 @@ def prepare_data( """ # <-- ADDED: Convert univariate series to frame for internal processing log_price = log_price_series.to_frame() - + price_diff = log_price.diff().dropna() y_df = price_diff @@ -86,12 +87,12 @@ def prepare_data( if lags > 0: lagged_deltas = price_diff.shift(1) lagged_deltas.columns = ["delta_l1"] - + if lags > 1: for i in range(2, lags + 1): - lagged_deltas[f'delta_l{i}'] = price_diff.shift(i) + lagged_deltas[f"delta_l{i}"] = price_diff.shift(i) - x_df = x_df.join(lagged_deltas, how='outer') + x_df = x_df.join(lagged_deltas, how="outer") # 3. Add constants if constant == "c": @@ -103,13 +104,13 @@ def prepare_data( x_df["constant"] = 1 x_df["trend"] = np.arange(1, len(x_df) + 1) x_df["trend_sq"] = x_df["trend"] ** 2 - + # Align y and X by dropping NaNs created by lagging - combined = y_df.join(x_df, how='inner').dropna() - + combined = y_df.join(x_df, how="inner").dropna() + y_df = combined.iloc[:, [0]] x_df = combined.iloc[:, 1:] - + return y_df, x_df @@ -138,19 +139,20 @@ def compute_beta( try: xt_x_inv = np.linalg.inv(x_window.T @ x_window) xt_y = x_window.T @ y_window - + beta_mean = xt_x_inv @ xt_y - + error = y_window - (x_window @ beta_mean) variance_e = (error.T @ error) / (x_window.shape[0] - x_window.shape[1]) beta_variance = variance_e * xt_x_inv - + return beta_mean, beta_variance - + except np.linalg.LinAlgError: # Handle singular matrix - return np.full((x_window.shape[1], 1), np.nan), \ - np.full((x_window.shape[1], x_window.shape[1]), np.nan) + return np.full((x_window.shape[1], 1), np.nan), np.full( + (x_window.shape[1], x_window.shape[1]), np.nan + ) def get_expanding_window_adf( @@ -185,16 +187,16 @@ def get_expanding_window_adf( """ # <-- CHANGED: Pass Series directly to prepare_data y_df, x_df = prepare_data(log_price, constant=constant, lags=lags) - + adf_stats = [] timestamps = [] - + for i in range(min_sample_length, 
y_df.shape[0] + 1): y_window = y_df.iloc[:i].values x_window = x_df.iloc[:i].values - + beta_mean, beta_variance = compute_beta(y_window, x_window) - + if np.isnan(beta_variance[0, 0]): t_stat = np.nan else: @@ -203,7 +205,7 @@ def get_expanding_window_adf( t_stat = -np.inf if beta_mean[0, 0] < 0 else np.inf else: t_stat = beta_mean[0, 0] / beta_std_level - + adf_stats.append(t_stat) timestamps.append(y_df.index[i - 1]) @@ -212,9 +214,9 @@ def get_expanding_window_adf( def get_bsadf_statistic( log_price: pd.Series, # <-- CHANGED: Accept Series - min_sample_length: int, - constant: str, - lags: int + min_sample_length: int, + constant: str, + lags: int, ) -> Dict[str, Any]: """ Compute the Backward Supremum ADF (BSADF) statistic. @@ -247,34 +249,34 @@ def get_bsadf_statistic( # 1. Prepare the full X, y matrices # <-- CHANGED: Pass Series directly y, x = prepare_data(log_price, constant=constant, lags=lags) - + # 2. Define all possible start points # <-- BUG FIX: Removed '+ lags' from the range start_points = range(0, y.shape[0] - min_sample_length + 1) bsadf = -np.inf # Supremum ADF - + y_np, x_np = y.values, x.values # 3. Loop over all expanding windows for start in start_points: y_window, x_window = y_np[start:], x_np[start:] - + # 4. Compute ADF regression for this window beta_mean, beta_variance = compute_beta(y_window, x_window) - + if np.isnan(beta_variance[0, 0]): continue # 5. 
Get t-statistic for the first coefficient (the level) beta_mean_level = beta_mean[0, 0] beta_std_level = beta_variance[0, 0] ** 0.5 - + if beta_std_level == 0: t_stat = -np.inf if beta_mean_level < 0 else np.inf else: t_stat = beta_mean_level / beta_std_level - + if t_stat > bsadf: bsadf = t_stat - return {"Time": log_price.index[-1], "bsadf": bsadf} \ No newline at end of file + return {"Time": log_price.index[-1], "bsadf": bsadf} diff --git a/RiskLabAI/hpc/__init__.py b/RiskLabAI/hpc/__init__.py index f6f5ad5..a1d61b9 100644 --- a/RiskLabAI/hpc/__init__.py +++ b/RiskLabAI/hpc/__init__.py @@ -12,7 +12,7 @@ linear_partitions, nested_partitions, mp_pandas_obj, - parallel_run + parallel_run, ) __all__ = [ @@ -23,5 +23,5 @@ "linear_partitions", "nested_partitions", "mp_pandas_obj", - "parallel_run" -] \ No newline at end of file + "parallel_run", +] diff --git a/RiskLabAI/hpc/hpc.py b/RiskLabAI/hpc/hpc.py index 284c272..8456e6d 100644 --- a/RiskLabAI/hpc/hpc.py +++ b/RiskLabAI/hpc/hpc.py @@ -16,12 +16,13 @@ logger = logging.getLogger(__name__) + def parallel_run( func: Callable[..., Any], iterable: Iterable[Any], num_cpus: int = -1, lin_partition: bool = False, - **kwargs + **kwargs, ) -> List[Any]: """ Executes a function in parallel over an iterable using Joblib. 
@@ -61,7 +62,7 @@ def parallel_run( except TypeError: iterable = list(iterable) iterable_len = len(iterable) - + num_atoms = num_cpus iterable_partition = np.array_split(range(iterable_len), num_atoms) jobs = (iterable_partition[i] for i in range(num_atoms)) @@ -69,7 +70,7 @@ def parallel_run( results = joblib.Parallel(n_jobs=num_cpus)( joblib.delayed(func)(job, **kwargs) for job in jobs ) - + # Flatten the list of lists return [item for sublist in results for item in sublist] @@ -103,18 +104,18 @@ def report_progress( """ progress = job_number / total_jobs elapsed_time_min = (time.time() - start_time) / 60.0 - + # Avoid division by zero if progress is 0 remaining_time_min = 0.0 if progress > 0: remaining_time_min = elapsed_time_min * (1 / progress - 1) - - timestamp = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") message = ( f"{timestamp} {progress*100:.2f}% {task} done after " f"{elapsed_time_min:.2f} minutes. Remaining {remaining_time_min:.2f} minutes." ) - + # Progress is emitted via logging (configure the 'RiskLabAI' logger to see # it; the library is silent by default). logger.info(message) @@ -138,7 +139,7 @@ def expand_call(kargs: Dict[str, Any]) -> Any: Any The output of the callback function. """ - func = kargs.pop('func') + func = kargs.pop("func") return func(**kargs) @@ -167,7 +168,7 @@ def process_jobs( A list containing the results from all jobs. 
""" if task is None: - task = jobs[0]['func'].__name__ + task = jobs[0]["func"].__name__ # <-- Handle sequential case for debugging if num_threads == 1: @@ -181,15 +182,15 @@ def process_jobs( with mp.Pool(processes=num_threads) as pool: outputs = [] start_time = time.time() - + # Use imap_unordered for efficient processing imap_results = pool.imap_unordered(expand_call, jobs) - + # Process results as they complete for i, result in enumerate(imap_results, 1): outputs.append(result) report_progress(i, len(jobs), start_time, task) - + return outputs @@ -211,13 +212,13 @@ def process_jobs_sequential(jobs: List[Dict[str, Any]]) -> List[Any]: start_time = time.time() task = "Sequential Processing" if jobs: - task = jobs[0].get('func', lambda: None).__name__ + task = jobs[0].get("func", lambda: None).__name__ for i, job in enumerate(jobs, 1): output_ = expand_call(job) output.append(output_) report_progress(i, len(jobs), start_time, task) - + return output @@ -242,7 +243,7 @@ def linear_partitions(num_atoms: int, num_threads: int) -> np.ndarray: n_parts = min(num_threads, num_atoms) if n_parts == 0: return np.array([0]) - + partitions = np.linspace(0, num_atoms, n_parts + 1) partitions = np.ceil(partitions).astype(int) return partitions @@ -277,7 +278,7 @@ def nested_partitions( """ partitions = [0] n_threads_ = min(num_threads, num_atoms) - + if n_threads_ == 0: return np.array([0]) @@ -288,13 +289,13 @@ def nested_partitions( ) part_val = (-1 + part_size**0.5) / 2.0 partitions.append(part_val) - + partitions = np.round(partitions).astype(int) - + if upper_triangle: # The first rows are the heaviest partitions = np.cumsum(np.diff(partitions)[::-1]) partitions = np.append(np.array([0]), partitions) - + return partitions @@ -304,7 +305,7 @@ def mp_pandas_obj( num_threads: int = -1, mp_batches: int = 1, linear_partition: bool = True, - **kwargs: Any + **kwargs: Any, ) -> Union[pd.DataFrame, pd.Series, List[Any]]: """ Parallelize a function call on a pandas object 
(DataFrame/Series). @@ -343,17 +344,13 @@ def mp_pandas_obj( # <-- Resolve num_threads here to correctly handle mp_batches if num_threads == -1: num_threads = mp.cpu_count() - + total_parts = num_threads * mp_batches - + if linear_partition: - parts = linear_partitions( - len(pandas_object[1]), total_parts - ) + parts = linear_partitions(len(pandas_object[1]), total_parts) else: - parts = nested_partitions( - len(pandas_object[1]), total_parts - ) + parts = nested_partitions(len(pandas_object[1]), total_parts) jobs = [] for i in range(1, len(parts)): @@ -366,15 +363,15 @@ def mp_pandas_obj( # <-- Pass num_threads to process_jobs, which now handles 1 correctly out = process_jobs(jobs, num_threads=num_threads) - + if not out: - return pd.DataFrame() # Return empty DataFrame if no results + return pd.DataFrame() # Return empty DataFrame if no results if isinstance(out[0], pd.DataFrame): result_df = pd.concat(out) elif isinstance(out[0], pd.Series): result_df = pd.concat(out) else: - return out # Return list of other objects + return out # Return list of other objects - return result_df.sort_index() \ No newline at end of file + return result_df.sort_index() diff --git a/RiskLabAI/optimization/__init__.py b/RiskLabAI/optimization/__init__.py index 2ef50d2..f5d7809 100644 --- a/RiskLabAI/optimization/__init__.py +++ b/RiskLabAI/optimization/__init__.py @@ -38,15 +38,12 @@ "quasi_diagonal", "recursive_bisection", "hrp", - # nco.py "get_optimal_portfolio_weights", "get_optimal_portfolio_weights_nco", - # hedging.py "pca_weights", - # hyper_parameter_tuning.py "MyPipeline", "clf_hyper_fit", -] \ No newline at end of file +] diff --git a/RiskLabAI/optimization/hedging.py b/RiskLabAI/optimization/hedging.py index ae5bb27..2164c1b 100644 --- a/RiskLabAI/optimization/hedging.py +++ b/RiskLabAI/optimization/hedging.py @@ -5,6 +5,7 @@ import numpy as np from typing import Optional + def pca_weights( cov: np.ndarray, risk_distribution: Optional[np.ndarray] = None, @@ -48,7 +49,7 
@@ def pca_weights( """ # Calculate eigenvalues and eigenvectors eigen_values, eigen_vectors = np.linalg.eigh(cov) - + # Sort in descending order indices = eigen_values.argsort()[::-1] eigen_values, eigen_vectors = eigen_values[indices], eigen_vectors[:, indices] @@ -60,8 +61,8 @@ def pca_weights( # Compute loads (allocation in the orthogonal basis) loads = risk_target * (risk_distribution / eigen_values) ** 0.5 - + # Calculate weights in the original basis weights = np.dot(eigen_vectors, np.reshape(loads, (-1, 1))) - return weights.flatten() \ No newline at end of file + return weights.flatten() diff --git a/RiskLabAI/optimization/hrp.py b/RiskLabAI/optimization/hrp.py index 7959800..208bff5 100644 --- a/RiskLabAI/optimization/hrp.py +++ b/RiskLabAI/optimization/hrp.py @@ -12,6 +12,7 @@ import scipy.spatial.distance as scd from typing import List + def inverse_variance_weights(covariance_matrix: pd.DataFrame) -> np.ndarray: """ Compute the inverse-variance portfolio weights. @@ -51,7 +52,7 @@ def cluster_variance( """ cov_slice = covariance_matrix.loc[clustered_items, clustered_items] weights = inverse_variance_weights(cov_slice).reshape(-1, 1) - + # V_cluster = w' * C * w cluster_var = np.dot(np.dot(weights.T, cov_slice), weights)[0, 0] return cluster_var @@ -129,20 +130,20 @@ def recursive_bisection( for i in range(0, len(clustered_items), 2): cluster_0 = clustered_items[i] cluster_1 = clustered_items[i + 1] - + # 1. Calculate variance for each cluster variance_0 = cluster_variance(covariance_matrix, cluster_0) variance_1 = cluster_variance(covariance_matrix, cluster_1) - + # 2. Calculate allocation factor (alpha) if variance_0 + variance_1 == 0: - alpha = 0.5 # Default to equal weight if both variances are zero + alpha = 0.5 # Default to equal weight if both variances are zero else: alpha = 1 - variance_0 / (variance_0 + variance_1) - + # 3. 
Apply weights weights[cluster_0] *= alpha - weights[cluster_1] *= (1 - alpha) + weights[cluster_1] *= 1 - alpha return weights @@ -193,16 +194,16 @@ def hrp(cov: pd.DataFrame, corr: pd.DataFrame) -> pd.Series: # 1. Calculate distance distance = distance_corr(corr_df.values) - dist_condensed = scd.squareform(distance, force='tovector') + dist_condensed = scd.squareform(distance, force="tovector") # 2. Cluster link = sch.linkage(dist_condensed, "single") - + # 3. Quasi-diagonalize sorted_items_idx = quasi_diagonal(link) sorted_items_names = corr_df.index[sorted_items_idx].tolist() - + # 4. Recursive bisection hrp_portfolio = recursive_bisection(cov_df, sorted_items_names) - - return hrp_portfolio.sort_index() \ No newline at end of file + + return hrp_portfolio.sort_index() diff --git a/RiskLabAI/optimization/hyper_parameter_tuning.py b/RiskLabAI/optimization/hyper_parameter_tuning.py index eb36f3b..5752c62 100644 --- a/RiskLabAI/optimization/hyper_parameter_tuning.py +++ b/RiskLabAI/optimization/hyper_parameter_tuning.py @@ -50,7 +50,7 @@ def fit( # Add sample_weight to fit_params for the *last step* step_name = self.steps[-1][0] fit_params[f"{step_name}__sample_weight"] = sample_weight - + return super().fit(X, y, **fit_params) @@ -106,7 +106,7 @@ def clf_hyper_fit( The fitted grid search object, or a fitted Bagging pipeline. """ if bagging is None: - bagging = [0, 0.0, 1.0] # Default to no bagging + bagging = [0, 0.0, 1.0] # Default to no bagging if set(label.unique()) == {0, 1}: scoring = "f1" # F1-score for meta-labeling @@ -121,8 +121,8 @@ def clf_hyper_fit( } else: # Ensure 'times' is passed if not already present - if 'times' not in validator_params: - validator_params['times'] = times + if "times" not in validator_params: + validator_params["times"] = times # 1. Set up the custom cross-validator inner_cv = CrossValidatorController( @@ -147,17 +147,17 @@ def clf_hyper_fit( n_jobs=n_jobs, n_iter=rnd_search_iter, ) - + # 3. 
Fit the search gs = gs.fit(feature_data, label, **fit_params) # 4. (Optional) Fit bagging classifier on the best model if bagging[0] > 0: best_estimator = gs.best_estimator_ - + # Create a new pipeline with the best estimator's steps bag_pipe = MyPipeline(best_estimator.steps) - + bag_clf = BaggingClassifier( estimator=bag_pipe, n_estimators=int(bagging[0]), @@ -165,12 +165,12 @@ def clf_hyper_fit( max_features=float(bagging[2]), n_jobs=n_jobs, ) - + # Fit the bagging classifier bag_clf = bag_clf.fit(feature_data, label, **fit_params) - + # Return as a pipeline return Pipeline([("bag", bag_clf)]) - + # 5. Return the best estimator found - return gs.best_estimator_ \ No newline at end of file + return gs.best_estimator_ diff --git a/RiskLabAI/optimization/nco.py b/RiskLabAI/optimization/nco.py index 49cb9ed..8c92d2d 100644 --- a/RiskLabAI/optimization/nco.py +++ b/RiskLabAI/optimization/nco.py @@ -10,9 +10,8 @@ from typing import Optional, Tuple, Dict, List # Import canonical implementations instead of duplicating -from RiskLabAI.cluster.clustering import ( - cluster_k_means_base, covariance_to_correlation -) +from RiskLabAI.cluster.clustering import cluster_k_means_base, covariance_to_correlation + def get_optimal_portfolio_weights( covariance: np.ndarray, mu: Optional[np.ndarray] = None @@ -40,7 +39,7 @@ def get_optimal_portfolio_weights( if mu is None: mu = ones # For GMV portfolio - + weights = np.dot(inverse_covariance, mu) weights /= np.dot(ones.T, weights) # Normalize weights to sum to 1 return weights @@ -77,13 +76,13 @@ def get_optimal_portfolio_weights_nco( """ covariance = pd.DataFrame(covariance) correlation = covariance_to_correlation(covariance.to_numpy()) - correlation = pd.DataFrame(correlation, - index=covariance.index, - columns=covariance.columns) - + correlation = pd.DataFrame( + correlation, index=covariance.index, columns=covariance.columns + ) + if mu is not None: mu = pd.Series(mu.flatten(), index=covariance.index) - + if number_clusters is 
None: number_clusters = int(correlation.shape[0] / 2) @@ -98,14 +97,14 @@ def get_optimal_portfolio_weights_nco( ) for i, cluster_assets in clusters.items(): cov_intra = covariance.loc[cluster_assets, cluster_assets].values - + mu_intra = None if mu is not None: mu_intra = mu.loc[cluster_assets].values.reshape(-1, 1) - - weights_intra_cluster.loc[cluster_assets, i] = ( - get_optimal_portfolio_weights(cov_intra, mu_intra).flatten() - ) + + weights_intra_cluster.loc[cluster_assets, i] = get_optimal_portfolio_weights( + cov_intra, mu_intra + ).flatten() # 3. Compute inter-cluster weights # Reduce covariance matrix using intra-cluster weights @@ -126,7 +125,5 @@ def get_optimal_portfolio_weights_nco( ) # 4. Combine weights - weights_nco = weights_intra_cluster.mul(weights_inter_cluster, axis=1).sum( - axis=1 - ) - return weights_nco.values.reshape(-1, 1) \ No newline at end of file + weights_nco = weights_intra_cluster.mul(weights_inter_cluster, axis=1).sum(axis=1) + return weights_nco.values.reshape(-1, 1) diff --git a/RiskLabAI/pde/__init__.py b/RiskLabAI/pde/__init__.py index 7c29f93..f1a94ec 100644 --- a/RiskLabAI/pde/__init__.py +++ b/RiskLabAI/pde/__init__.py @@ -43,7 +43,7 @@ from .solver import ( initialize_weights, FBSDESolver, - FBSNNolver, # Note: Typo in original filename? + FBSNNolver, # Note: Typo in original filename? 
) __all__ = [ @@ -53,20 +53,21 @@ "HJBLQ", "BlackScholesBarenblatt", "PricingDiffRate", - # Models "TimeNet", "Net1", - "MAB", "SAB", "ISAB", "PMA", + "MAB", + "SAB", + "ISAB", + "PMA", "TimeNetForSet", "DeepTimeSetTransformer", "FBSNNNetwork", "DeepBSDE", "TimeDependentNetwork", "TimeDependentNetworkMonteCarlo", - # Solvers "initialize_weights", "FBSDESolver", "FBSNNolver", -] \ No newline at end of file +] diff --git a/RiskLabAI/pde/equation.py b/RiskLabAI/pde/equation.py index 5d676a1..1d01c19 100644 --- a/RiskLabAI/pde/equation.py +++ b/RiskLabAI/pde/equation.py @@ -11,6 +11,7 @@ import numpy as np from typing import Tuple, Optional, Union + class Equation: """ Base class for defining PDE-related functions. @@ -28,9 +29,9 @@ def __init__(self, eqn_config: dict): - 'total_time': Total time horizon (float) - 'num_time_interval': Number of time steps (int) """ - self.dim: int = eqn_config['dim'] - self.total_time: float = eqn_config['total_time'] - self.num_time_interval: int = eqn_config['num_time_interval'] + self.dim: int = eqn_config["dim"] + self.total_time: float = eqn_config["total_time"] + self.num_time_interval: int = eqn_config["num_time_interval"] self.delta_t: float = self.total_time / self.num_time_interval self.sqrt_delta_t: float = np.sqrt(self.delta_t) self.y_init: Optional[float] = None @@ -115,11 +116,11 @@ def terminal(self, t: float, x: Tensor) -> Tensor: Terminal payoff value [batch_size, 1]. """ raise NotImplementedError - + def sigma_matrix(self, x: Tensor) -> Tensor: """Helper to get the volatility matrix sigma(x).""" raise NotImplementedError - + def terminal_for_sample(self, x: Tensor) -> Tensor: """Terminal condition for a multi-sample path.""" raise NotImplementedError @@ -129,11 +130,12 @@ class PricingDefaultRisk(Equation): """ PDE for pricing with default risk. 
""" + def __init__(self, eqn_config: dict): super(PricingDefaultRisk, self).__init__(eqn_config) self.x_init = np.ones(self.dim) * 100.0 self.sigma = 0.2 - self.rate = 0.02 # R + self.rate = 0.02 # R self.delta = 2.0 / 3 self.gammah = 0.2 self.gammal = 0.02 @@ -144,9 +146,10 @@ def __init__(self, eqn_config: dict): self.slope = (self.gammah - self.gammal) / (self.vh - self.vl) def sample(self, num_sample: int) -> Tuple[np.ndarray, np.ndarray]: - dw_sample = np.random.normal( - size=[num_sample, self.dim, self.num_time_interval] - ) * self.sqrt_delta_t + dw_sample = ( + np.random.normal(size=[num_sample, self.dim, self.num_time_interval]) + * self.sqrt_delta_t + ) x_sample = np.zeros([num_sample, self.dim, self.num_time_interval + 1]) x_sample[:, :, 0] = np.ones([num_sample, self.dim]) * self.x_init for i in range(self.num_time_interval): @@ -156,9 +159,10 @@ def sample(self, num_sample: int) -> Tuple[np.ndarray, np.ndarray]: return dw_sample, x_sample def r_u(self, t: float, x: Tensor, y: Tensor, z: Tensor) -> Tensor: - piecewise_linear = nn.ReLU()( - nn.ReLU()(y - self.vh) * self.slope + self.gammah - self.gammal - ) + self.gammal + piecewise_linear = ( + nn.ReLU()(nn.ReLU()(y - self.vh) * self.slope + self.gammah - self.gammal) + + self.gammal + ) return (1 - self.delta) * piecewise_linear + self.rate def h_z(self, t: float, x: Tensor, y: Tensor, z: Tensor) -> Tensor: @@ -179,6 +183,7 @@ class HJBLQ(Equation): """ Hamilton-Jacobi-Bellman (HJB) equation with Linear-Quadratic (LQ) cost. 
""" + def __init__(self, eqn_config: dict): super(HJBLQ, self).__init__(eqn_config) self.x_init = np.zeros(self.dim) @@ -186,9 +191,10 @@ def __init__(self, eqn_config: dict): self.lambd = 1.0 def sample(self, num_sample: int) -> Tuple[np.ndarray, np.ndarray]: - dw_sample = np.random.normal( - size=[num_sample, self.dim, self.num_time_interval] - ) * self.sqrt_delta_t + dw_sample = ( + np.random.normal(size=[num_sample, self.dim, self.num_time_interval]) + * self.sqrt_delta_t + ) x_sample = np.zeros([num_sample, self.dim, self.num_time_interval + 1]) x_sample[:, :, 0] = np.ones([num_sample, self.dim]) * self.x_init for i in range(self.num_time_interval): @@ -202,31 +208,35 @@ def h_z(self, t: float, x: Tensor, y: Tensor, z: Tensor) -> Tensor: return torch.sum(torch.square(z), dim=1, keepdim=True) / (self.sigma**2) def terminal(self, t: float, x: Tensor) -> Tensor: - return torch.log(0.5 * (1 + torch.norm(x, p=2, dim=1, keepdim=True)**2)) + return torch.log(0.5 * (1 + torch.norm(x, p=2, dim=1, keepdim=True) ** 2)) def sigma_matrix(self, x: Union[np.ndarray, Tensor]) -> float: return self.sigma - + def terminal_for_sample(self, x: Tensor) -> Tensor: # Used by Monte-Carlo solver, needs to match HJB terminal - return torch.log(0.5 * (1 + torch.norm(x, p=2, dim=2, keepdim=True)**2)) + return torch.log(0.5 * (1 + torch.norm(x, p=2, dim=2, keepdim=True) ** 2)) class BlackScholesBarenblatt(Equation): """ Black-Scholes-Barenblatt equation. 
""" + def __init__(self, eqn_config: dict): super(BlackScholesBarenblatt, self).__init__(eqn_config) - self.x_init = np.ones(self.dim) * np.array([1.0 / (1.0 + i % 2) for i in range(self.dim)]) + self.x_init = np.ones(self.dim) * np.array( + [1.0 / (1.0 + i % 2) for i in range(self.dim)] + ) self.sigma = 0.4 - self.rate = 0.05 # interest rate R + self.rate = 0.05 # interest rate R self.mu_bar = 0.0 def sample(self, num_sample: int) -> Tuple[np.ndarray, np.ndarray]: - dw_sample = np.random.normal( - size=(num_sample, self.dim, self.num_time_interval) - ) * self.sqrt_delta_t + dw_sample = ( + np.random.normal(size=(num_sample, self.dim, self.num_time_interval)) + * self.sqrt_delta_t + ) x_sample = np.zeros((num_sample, self.dim, self.num_time_interval + 1)) x_sample[:, :, 0] = np.ones((num_sample, self.dim)) * self.x_init for i in range(self.num_time_interval): @@ -242,13 +252,13 @@ def h_z(self, t: float, x: Tensor, y: Tensor, z: Tensor) -> Tensor: return -1 * torch.sum(z, dim=1, keepdim=True) * self.rate / self.sigma def terminal(self, t: float, x: Tensor) -> Tensor: - return torch.sum(x ** 2, dim=1, keepdim=True) + return torch.sum(x**2, dim=1, keepdim=True) def sigma_matrix(self, x: Union[np.ndarray, Tensor]) -> Union[np.ndarray, Tensor]: return self.sigma * x def terminal_for_sample(self, x: Tensor) -> Tensor: - return torch.sum(x ** 2, dim=2, keepdim=True) + return torch.sum(x**2, dim=2, keepdim=True) class PricingDiffRate(Equation): @@ -256,6 +266,7 @@ class PricingDiffRate(Equation): Nonlinear Black-Scholes with different interest rates for borrowing and lending. 
""" + def __init__(self, eqn_config: dict): super(PricingDiffRate, self).__init__(eqn_config) self.x_init = np.ones(self.dim) * 100 @@ -266,9 +277,10 @@ def __init__(self, eqn_config: dict): self.alpha = 1.0 / self.dim def sample(self, num_sample: int) -> Tuple[np.ndarray, np.ndarray]: - dw_sample = np.random.normal( - size=[num_sample, self.dim, self.num_time_interval] - ) * self.sqrt_delta_t + dw_sample = ( + np.random.normal(size=[num_sample, self.dim, self.num_time_interval]) + * self.sqrt_delta_t + ) x_sample = np.zeros([num_sample, self.dim, self.num_time_interval + 1]) x_sample[:, :, 0] = np.ones([num_sample, self.dim]) * self.x_init factor = np.exp((self.mu_bar - (self.sigma**2) / 2) * self.delta_t) @@ -300,4 +312,4 @@ def sigma_matrix(self, x: Union[np.ndarray, Tensor]) -> Union[np.ndarray, Tensor def terminal_for_sample(self, x: Tensor) -> Tensor: temp = torch.max(x, 2, keepdim=True)[0] - return torch.maximum(temp - 120, torch.tensor(0.0, device=x.device)) \ No newline at end of file + return torch.maximum(temp - 120, torch.tensor(0.0, device=x.device)) diff --git a/RiskLabAI/pde/model.py b/RiskLabAI/pde/model.py index 60d1a9e..d7d118c 100644 --- a/RiskLabAI/pde/model.py +++ b/RiskLabAI/pde/model.py @@ -19,9 +19,13 @@ # --- Set Transformer Components --- + class MAB(Module): """Multi-Head Attention Block (MAB).""" - def __init__(self, dim_q: int, dim_k: int, dim_v: int, num_heads: int, ln: bool = False): + + def __init__( + self, dim_q: int, dim_k: int, dim_v: int, num_heads: int, ln: bool = False + ): super(MAB, self).__init__() self.dim_v = dim_v self.num_heads = num_heads @@ -43,7 +47,7 @@ def forward(self, q: Tensor, k: Tensor) -> Tensor: attention = torch.softmax(q_.bmm(k_.transpose(1, 2)) / math.sqrt(self.dim_v), 2) out = torch.cat((q_ + attention.bmm(v_)).split(q.size(0), 0), 2) - + out = self.ln0(out) if self.ln0 is not None else out out = out + F.relu(self.fc_o(out)) out = self.ln1(out) if self.ln1 is not None else out @@ -52,6 +56,7 @@ def 
forward(self, q: Tensor, k: Tensor) -> Tensor: class SAB(Module): """Self-Attention Block (SAB).""" + def __init__(self, dim_in: int, dim_out: int, num_heads: int, ln: bool = False): super(SAB, self).__init__() self.mab = MAB(dim_in, dim_in, dim_out, num_heads, ln=ln) @@ -62,7 +67,10 @@ def forward(self, x: Tensor) -> Tensor: class ISAB(Module): """Induced Self-Attention Block (ISAB).""" - def __init__(self, dim_in: int, dim_out: int, num_heads: int, num_inds: int, ln: bool = False): + + def __init__( + self, dim_in: int, dim_out: int, num_heads: int, num_inds: int, ln: bool = False + ): super(ISAB, self).__init__() self.i = nn.Parameter(torch.Tensor(1, num_inds, dim_out)) nn.init.xavier_uniform_(self.i) @@ -76,6 +84,7 @@ def forward(self, x: Tensor) -> Tensor: class PMA(Module): """Pooling Multi-Head Attention (PMA).""" + def __init__(self, dim: int, num_heads: int, num_seeds: int, ln: bool = False): super(PMA, self).__init__() self.s = nn.Parameter(torch.Tensor(1, num_seeds, dim)) @@ -89,19 +98,20 @@ def forward(self, x: Tensor) -> Tensor: class TimeNetForSet(Module): """ Time-dependent feature transformation for SetTransformer. - + Applies separate linear layers to time (t) and features (x), then combines them as exp(t) * x. 
""" + def __init__(self, in_features: int = 1, out_features: int = 64): super(TimeNetForSet, self).__init__() self.feature_layer = Linear(in_features, out_features) - + self.time_layer1 = Linear(1, 10) self.time_layer2 = Linear(10, 10) self.time_layer3 = Linear(10, 10) self.time_layer4 = Linear(10, out_features) - + self.relu_stack = nn.Sequential(ReLU(), ReLU(), ReLU()) def forward(self, t: Tensor, x: Tensor) -> Tensor: @@ -109,9 +119,9 @@ def forward(self, t: Tensor, x: Tensor) -> Tensor: t_feat = self.relu_stack[1](self.time_layer2(t_feat)) t_feat = self.relu_stack[2](self.time_layer3(t_feat)) t_feat = self.time_layer4(t_feat) - + x_feat = self.feature_layer(x) - + return torch.exp(t_feat) * x_feat @@ -119,9 +129,10 @@ class DeepTimeSetTransformer(Module): """ Full Deep Time Set Transformer model. """ + def __init__(self, input_dim: int): super(DeepTimeSetTransformer, self).__init__() - + # Feature extractor layers self.layer1 = Linear(input_dim, 32) self.layer2 = Linear(32, 32) @@ -142,18 +153,18 @@ def __init__(self, input_dim: int): def forward(self, t: Tensor, x: Tensor) -> Tensor: x = self.activation(self.layer1(x)) x = x - torch.mean(x, 2, keepdim=True) - + x = self.activation(self.layer2(x)) x = x - torch.mean(x, 2, keepdim=True) - + x = self.activation(self.layer3(x)) x = x - torch.mean(x, 2, keepdim=True) - + x = self.activation(self.layer4(x)) x = x - torch.mean(x, 2, keepdim=True) - + x = self.activation(self.layer5(x)) - + output = self.regressor(x) output = torch.squeeze(output) @@ -167,18 +178,22 @@ def forward(self, t: Tensor, x: Tensor) -> Tensor: # --- Standard Networks --- + class TimeNet(Module): """Simple feed-forward network for time features.""" + def __init__(self, output_dim: int): super(TimeNet, self).__init__() - self.layers = nn.ModuleList([ - Linear(4, 100), - Linear(100, 150), - Linear(150, 200), - Linear(200, 300), - Linear(300, 200), - Linear(200, output_dim) - ]) + self.layers = nn.ModuleList( + [ + Linear(4, 100), + Linear(100, 
150), + Linear(150, 200), + Linear(200, 300), + Linear(300, 200), + Linear(200, output_dim), + ] + ) self.tanh_stack = nn.ModuleList([Tanh() for _ in range(6)]) def forward(self, x: Tensor) -> Tensor: @@ -189,23 +204,27 @@ def forward(self, x: Tensor) -> Tensor: class Net1(Module): """A simple Linear + BatchNorm layer.""" + def __init__(self, input_dim: int, output_dim: int): super(Net1, self).__init__() self.layer = Linear(input_dim, output_dim) - self.bn = BatchNorm1d(output_dim) # Note: bn is not used in forward + self.bn = BatchNorm1d(output_dim) # Note: bn is not used in forward def forward(self, x: Tensor) -> Tensor: return self.layer(x) + # --- BSDE Solver Networks --- + class FBSNNNetwork(Module): """Feed-forward network for the FBSNN solver.""" + def __init__(self, layer_sizes: List[int]): super(FBSNNNetwork, self).__init__() self.n_layer = len(layer_sizes) - 1 self.layers = nn.ModuleList([]) - + for i in range(self.n_layer): self.layers.append(Linear(layer_sizes[i], layer_sizes[i + 1])) @@ -219,6 +238,7 @@ def forward(self, x: Tensor) -> Tensor: class DeepBSDE(Module): """Network for the Deep BSDE method (one net per time step).""" + def __init__(self, layer_sizes: List[int]): super(DeepBSDE, self).__init__() self.n_layer = len(layer_sizes) - 1 @@ -227,8 +247,10 @@ def __init__(self, layer_sizes: List[int]): for i in range(self.n_layer): self.layers.append(Linear(layer_sizes[i], layer_sizes[i + 1], bias=False)) - self.batch_layer.append(BatchNorm1d(layer_sizes[i], eps=1e-06, momentum=0.01)) - + self.batch_layer.append( + BatchNorm1d(layer_sizes[i], eps=1e-06, momentum=0.01) + ) + self.batch_layer.append(BatchNorm1d(layer_sizes[-1], eps=1e-06, momentum=0.01)) self.activation = ReLU() @@ -238,7 +260,7 @@ def forward(self, x: Tensor) -> Tensor: x = self.layers[i](x) # x = self.batch_layer[i+1](x) # Original code commented this out x = self.activation(x) - + x = self.layers[-1](x) # x = self.batch_layer[-1](x) # Original code commented this out return x @@ 
-246,6 +268,7 @@ def forward(self, x: Tensor) -> Tensor: class TimeDependentNetwork(Module): """Time-dependent network for BSDE solver.""" + def __init__(self, indim: int, layersize: List[int], outdim: int): super(TimeDependentNetwork, self).__init__() self.n_layer = len(layersize) @@ -263,7 +286,9 @@ def __init__(self, indim: int, layersize: List[int], outdim: int): for i in range(len(layersize) - 1): self.layers.append(Net1(layersize[i], layersize[i + 1])) self.time_layer.append(TimeNet(layersize[i])) - self.batch_layer.append(BatchNorm1d(layersize[i + 1], eps=1e-06, momentum=0.01)) + self.batch_layer.append( + BatchNorm1d(layersize[i + 1], eps=1e-06, momentum=0.01) + ) # Output layer self.time_layer.append(TimeNet(outdim)) @@ -275,7 +300,7 @@ def forward(self, t: Tensor, x: Tensor) -> Tensor: # (t, t**2, t**3, exp(t)) time_features = torch.cat((t, t**2, t**3, torch.exp(t)), 1) time_weight = self.time_layer[i](time_features) - + x = self.batch_layer[i](x) x = x * (1 + time_weight) x = self.layers[i](x) @@ -288,6 +313,7 @@ def forward(self, t: Tensor, x: Tensor) -> Tensor: class TimeDependentNetworkMonteCarlo(Module): """Time-dependent network for BSDE solver with Monte Carlo gradient.""" + def __init__(self, indim: int, layersize: List[int], outdim: int, sigma: float): super(TimeDependentNetworkMonteCarlo, self).__init__() self.n_layer = len(layersize) @@ -316,7 +342,7 @@ def __init__(self, indim: int, layersize: List[int], outdim: int, sigma: float): def forward(self, t: Tensor, x: Tensor, y_mc: Tensor) -> Tensor: """ Forward pass. - + Parameters ---------- t : Tensor @@ -326,12 +352,12 @@ def forward(self, t: Tensor, x: Tensor, y_mc: Tensor) -> Tensor: y_mc : Tensor Monte Carlo estimate of the gradient (Z). 
""" - x_prim = x # Store original input + x_prim = x # Store original input for i in range(self.n_layer): time_features = torch.cat((t, t**2, t**3, torch.exp(t)), 1) time_weight = self.time_layer[i](time_features) - + x = x * (1 + time_weight) x = self.layers[i](x) x = self.activation(x) @@ -339,8 +365,8 @@ def forward(self, t: Tensor, x: Tensor, y_mc: Tensor) -> Tensor: # Output layer time_features = torch.cat((t, t**2, t**3, torch.exp(t)), 1) time_weight = self.time_layer[self.n_layer](time_features) - + # Combine network output with MC estimate # Z = sigma * S * Network(t,S) + (1 - time_weight) * Z_MC - x = (self.sigma * x_prim * self.linear(x) + (1 - time_weight) * y_mc) - return x \ No newline at end of file + x = self.sigma * x_prim * self.linear(x) + (1 - time_weight) * y_mc + return x diff --git a/RiskLabAI/pde/solver.py b/RiskLabAI/pde/solver.py index d13f16a..177fefe 100644 --- a/RiskLabAI/pde/solver.py +++ b/RiskLabAI/pde/solver.py @@ -16,6 +16,7 @@ logger = logging.getLogger(__name__) + def initialize_weights(m: nn.Module) -> None: """ Initializes the weights of a Linear layer. @@ -34,6 +35,7 @@ class FBSDESolver: """ Solver for FBSDEs using various deep learning methods. """ + def __init__( self, pde: Equation, @@ -66,17 +68,17 @@ def __init__( self.device = device # 1. 
Initialize the correct network model - if solving_method == 'Monte-Carlo': + if solving_method == "Monte-Carlo": self.solver = TimeDependentNetworkMonteCarlo( pde.dim, self.layer_size, pde.dim, pde.sigma ).to(device) - elif solving_method == 'Deep-Time-SetTransformer': - self.solver = DeepTimeSetTransformer(1).to(device) # Assumes dim=1 - elif solving_method == 'DTNN': - self.solver = TimeDependentNetwork( - pde.dim, self.layer_size, pde.dim - ).to(device) - elif solving_method == 'DeepBSDE': + elif solving_method == "Deep-Time-SetTransformer": + self.solver = DeepTimeSetTransformer(1).to(device) # Assumes dim=1 + elif solving_method == "DTNN": + self.solver = TimeDependentNetwork(pde.dim, self.layer_size, pde.dim).to( + device + ) + elif solving_method == "DeepBSDE": # Create a separate network for each time step self.solver = nn.ModuleList( [DeepBSDE(layer_sizes).to(device) for _ in range(pde.num_time_interval)] @@ -85,7 +87,7 @@ def __init__( raise ValueError(f"Unknown solving_method: {solving_method}") # 2. 
Initialize the optimizer - if solving_method != 'DeepBSDE': + if solving_method != "DeepBSDE": self.optimizer = torch.optim.Adam( self.solver.parameters(), lr=self.learning_rate, betas=(0.9, 0.99) ) @@ -102,7 +104,7 @@ def compute_loss( """ batch_size = y_path.size()[0] y_terminal = init_y.expand(batch_size, 1) - + coef = torch.ones((batch_size, 1), device=self.device) dw_coef = torch.zeros((batch_size, 1), device=self.device) @@ -110,7 +112,7 @@ def compute_loss( S0 = y_path[:, :, z] t0 = t * self.pde.delta_t * z - if self.method == 'DeepBSDE': + if self.method == "DeepBSDE": if z > 0: out_z = self.solver[z](S0) else: @@ -118,38 +120,41 @@ def compute_loss( out_z = init_z.expand(batch_size, self.pde.dim) else: # Other models are time-dependent - out_z = self.solver(t0, S0) # This assumes other models return Z - + out_z = self.solver(t0, S0) # This assumes other models return Z + samp_dw = dw_path[:, :, z] - + # Get driver components interest_rate = self.pde.r_u(t0, S0, y_terminal, out_z) hz = self.pde.h_z(t0, S0, y_terminal, out_z) - + # Update path y_terminal = ( y_terminal * (1 + interest_rate * self.pde.delta_t) + hz * self.pde.delta_t + torch.sum(out_z * samp_dw, dim=1, keepdim=True) ) - + # Update coefficients for loss calculation if z > 0: - dw_coef = dw_coef * (1 + interest_rate * self.pde.delta_t) + torch.sum( - out_z * samp_dw, dim=1, keepdim=True - ) + hz * self.pde.delta_t + dw_coef = ( + dw_coef * (1 + interest_rate * self.pde.delta_t) + + torch.sum(out_z * samp_dw, dim=1, keepdim=True) + + hz * self.pde.delta_t + ) else: - dw_coef = torch.sum(out_z * samp_dw, dim=1, keepdim=True) + hz * self.pde.delta_t - + dw_coef = ( + torch.sum(out_z * samp_dw, dim=1, keepdim=True) + + hz * self.pde.delta_t + ) + coef = coef * (1 + interest_rate * self.pde.delta_t) # 4. 
Calculate terminal payoff and loss - payoff = self.pde.terminal( - t * self.pde.total_time, y_path[:, :, -1] - ) - + payoff = self.pde.terminal(t * self.pde.total_time, y_path[:, :, -1]) + loss = torch.mean(torch.square(payoff - y_terminal)) - + return loss, coef, dw_coef, payoff def solve( @@ -175,12 +180,12 @@ def solve( """ losses = [] inits = [] - + # Y_0 and Z_0 are trainable parameters for DeepBSDE y0 = torch.tensor([init_y], device=self.device).requires_grad_(True) z0 = torch.zeros(1, self.pde.dim, device=self.device).requires_grad_(True) - - if self.method == 'DeepBSDE': + + if self.method == "DeepBSDE": init_opt = torch.optim.Adam([y0], lr=0.1, betas=(0.9, 0.99)) init_grad_opt = torch.optim.Adam([z0], lr=0.001, betas=(0.9, 0.99)) @@ -191,7 +196,7 @@ def solve( t_val = torch.ones((128, 1), device=self.device) # Set model to train mode - if self.method != 'DeepBSDE': + if self.method != "DeepBSDE": self.solver.train() else: for model in self.solver: @@ -206,18 +211,16 @@ def solve( t_train = torch.ones((batch_size, 1), device=self.device) self.optimizer.zero_grad() - if self.method == 'DeepBSDE': + if self.method == "DeepBSDE": init_opt.zero_grad() init_grad_opt.zero_grad() - loss, _, _, _ = self.compute_loss( - y_train, dw_train, t_train, y0, z0 - ) - + loss, _, _, _ = self.compute_loss(y_train, dw_train, t_train, y0, z0) + loss.backward() - + self.optimizer.step() - if self.method == 'DeepBSDE': + if self.method == "DeepBSDE": init_opt.step() init_grad_opt.step() @@ -226,8 +229,8 @@ def solve( val_loss, coef, dw_coef, payoff = self.compute_loss( y_val, dw_val, t_val, y0, z0 ) - - if self.method != 'DeepBSDE': + + if self.method != "DeepBSDE": # For non-DeepBSDE, Y_0 is not a param but computed # from the loss function's components. 
y0_new = torch.mean((payoff - dw_coef) / coef) @@ -236,7 +239,7 @@ def solve( else: inits.append(y0.item()) logger.info("Loss: %.4f, Y_0: %.4f", val_loss.item(), y0.item()) - + losses.append(val_loss.item()) return losses, inits @@ -245,17 +248,18 @@ def solve( class FBSNNolver: """ Solver for FBSNN (Forward-Backward Stochastic Neural Network). - + This method solves the PDE by learning the solution Y_t directly at each time step and minimizing the difference between the predicted Y_{t+1} and the one-step-ahead approximation. """ + def __init__( self, pde: Equation, layer_sizes: List[int], learning_rate: float, - device: torch.device + device: torch.device, ): """ Initialize the FBSNN Solver. @@ -288,7 +292,7 @@ def compute_loss( Compute the loss for the FBSNN method. """ batch_size = y_path.size()[0] - y_terminal = init_y.expand(batch_size, 1) # Not used, but kept for signature + y_terminal = init_y.expand(batch_size, 1) # Not used, but kept for signature loss = torch.tensor(0.0, device=self.device) for z in range(self.pde.num_time_interval): @@ -304,9 +308,7 @@ def compute_loss( # 2. Calculate the gradient Z_t = dY_t/dS_t * sigma(S_t) grad_y0 = autograd.grad( - outputs=torch.sum(y_0), - inputs=S0, - create_graph=True + outputs=torch.sum(y_0), inputs=S0, create_graph=True )[0] Z = grad_y0 * self.pde.sigma_matrix(S0) @@ -315,33 +317,33 @@ def compute_loss( # 3. Get driver components interest_rate = self.pde.r_u(t0, S0, y_0, Z) hz = self.pde.h_z(t0, S0, y_0, Z) - + # 4. Calculate the one-step approximation Y_hat_{t+1} y_1_hat = ( y_0 * (1 + interest_rate * self.pde.delta_t) + hz * self.pde.delta_t + torch.sum(Z * samp_dw, dim=1, keepdim=True) ) - + # 5. Add to total loss loss += torch.mean(torch.square(y_1_hat - y_1)) # 6. 
Add terminal condition loss S_T = y_path[:, :, -1] t_T = t * self.pde.total_time - + payoff = self.pde.terminal(t_T, S_T) y_T = self.solver(torch.cat((t_T, S_T), dim=1)) - + loss += torch.mean(torch.square(payoff - y_T)) - + return loss def solve( self, num_iterations: int, batch_size: int, - init_y: float, # Note: init_y is not used by FBSNN, but kept for API + init_y: float, # Note: init_y is not used by FBSNN, but kept for API ) -> Tuple[List[float], List[float]]: """ Solves the PDE using the FBSNN method. @@ -363,9 +365,9 @@ def solve( """ losses = [] inits = [] - + self.solver.train() - + # Validation data dw_val, y_val = self.pde.sample(128) y_val = torch.tensor(y_val, dtype=torch.float32, device=self.device) @@ -381,10 +383,10 @@ def solve( t_train = torch.ones((batch_size, 1), device=self.device) self.optimizer.zero_grad() - + # init_y is not used, pass a dummy tensor dummy_init = torch.tensor(0.0, device=self.device) - + loss = self.compute_loss(y_train, dw_train, t_train, dummy_init) loss.backward() self.optimizer.step() @@ -398,10 +400,10 @@ def solve( S0_val = y_val[:, :, 0] t0_val = t_val * self.pde.delta_t * 0 y0_val = self.solver(torch.cat((t0_val, S0_val), dim=1)) - + y0_mean = torch.mean(y0_val).item() inits.append(y0_mean) logger.info("Loss: %.4f, Y_0: %.4f", val_loss.item(), y0_mean) - return losses, inits \ No newline at end of file + return losses, inits diff --git a/RiskLabAI/utils/__init__.py b/RiskLabAI/utils/__init__.py index acd1ea2..0aec361 100644 --- a/RiskLabAI/utils/__init__.py +++ b/RiskLabAI/utils/__init__.py @@ -36,6 +36,7 @@ def __getattr__(name): return value raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + # --- Alias for backward compatibility --- # The historical `compute_exponential_weighted_moving_average` name now maps to # the canonical, numba-jitted `ewma` (the former `smoothing_average.py` @@ -75,22 +76,17 @@ def __getattr__(name): "PREVIOUS_TICK_IMBALANCES_SELL_LIST", 
"PREVIOUS_BARS_BUY_TICKS_PROPORTIONS_LIST", "N_PREVIOUS_BARS_FOR_EXPECTED_N_TICKS_ESTIMATION", - # ewma "ewma", "compute_exponential_weighted_moving_average", # Alias - # progress "progress_bar", - # momentum_mean_reverting_strategy_sides "determine_strategy_side", - # update_figure_layout "update_figure_layout", - # publication_plots "setup_publication_style", "apply_plot_style", "finalize_plot", -] \ No newline at end of file +] diff --git a/RiskLabAI/utils/constants.py b/RiskLabAI/utils/constants.py index d2372b4..1289705 100644 --- a/RiskLabAI/utils/constants.py +++ b/RiskLabAI/utils/constants.py @@ -1,5 +1,5 @@ DATE_TIME = "Date Time" -TIMESTAMP = 'Timestamp' +TIMESTAMP = "Timestamp" TICK_NUMBER = "Tick Number" OPEN_PRICE = "Open" HIGH_PRICE = "High" @@ -40,4 +40,3 @@ PREVIOUS_BARS_BUY_TICKS_PROPORTIONS_LIST = "List of previous bars buy ticks proportion" N_PREVIOUS_BARS_FOR_EXPECTED_N_TICKS_ESTIMATION = "Window size for E[T]" - diff --git a/RiskLabAI/utils/ewma.py b/RiskLabAI/utils/ewma.py index 603885e..ffa36ca 100644 --- a/RiskLabAI/utils/ewma.py +++ b/RiskLabAI/utils/ewma.py @@ -7,6 +7,7 @@ from numba import jit, float64, int64 from typing import Union + @jit(nopython=True) def ewma(array: np.ndarray, window: int) -> np.ndarray: r""" @@ -44,7 +45,7 @@ def ewma(array: np.ndarray, window: int) -> np.ndarray: alpha = 2.0 / (float(window) + 1.0) multiplier = 1.0 - alpha - + # Handle the sum of weights (denominator) weight_sum = 1.0 current_weighted_sum = array[0] @@ -55,4 +56,4 @@ def ewma(array: np.ndarray, window: int) -> np.ndarray: current_weighted_sum = current_weighted_sum * multiplier + array[i] result_ewma_array[i] = current_weighted_sum / weight_sum - return result_ewma_array \ No newline at end of file + return result_ewma_array diff --git a/RiskLabAI/utils/momentum_mean_reverting_strategy_sides.py b/RiskLabAI/utils/momentum_mean_reverting_strategy_sides.py index 1c352a2..208f23f 100644 --- a/RiskLabAI/utils/momentum_mean_reverting_strategy_sides.py 
+++ b/RiskLabAI/utils/momentum_mean_reverting_strategy_sides.py @@ -4,6 +4,7 @@ import pandas as pd + def determine_strategy_side( prices: pd.Series, fast_window: int = 20, @@ -61,9 +62,9 @@ def determine_strategy_side( # Explicit int64: plain `int` maps to int32 on Windows, which makes the # returned dtype platform-dependent. signal = (fast_ma >= slow_ma).astype("int64") * 2 - 1 - + if mean_reversion: # Invert the signal for mean reversion return -signal - - return signal \ No newline at end of file + + return signal diff --git a/RiskLabAI/utils/progress.py b/RiskLabAI/utils/progress.py index 0f48bac..3727ed8 100644 --- a/RiskLabAI/utils/progress.py +++ b/RiskLabAI/utils/progress.py @@ -6,6 +6,7 @@ import time from typing import Optional + def progress_bar( current_progress: int, total_progress: int, @@ -30,7 +31,7 @@ def progress_bar( """ if total_progress == 0: return - + # Check if task is completed if current_progress == total_progress: sys.stdout.write( @@ -38,7 +39,7 @@ def progress_bar( ) else: percentage = current_progress / total_progress - + # Handle the very first iteration if percentage == 0: arrow = "" @@ -46,7 +47,7 @@ def progress_bar( # Safely calculate arrow length, ensuring it doesn't go below -1 arrow_length = int(round(percentage * bar_length) - 1) arrow = "-" * max(0, arrow_length) + ">" - + spaces = " " * (bar_length - len(arrow)) elapsed_time_sec = time.time() - start_time @@ -64,5 +65,5 @@ def progress_bar( sys.stdout.write( f"\rCompleted: [{arrow + spaces}] {percentage*100:.0f}% - {remaining_time_str}." 
) - - sys.stdout.flush() \ No newline at end of file + + sys.stdout.flush() diff --git a/RiskLabAI/utils/publication_plots.py b/RiskLabAI/utils/publication_plots.py index d482fe6..2dcf7b8 100644 --- a/RiskLabAI/utils/publication_plots.py +++ b/RiskLabAI/utils/publication_plots.py @@ -17,61 +17,59 @@ # [THEMES dictionary remains the same] THEMES: Dict[str, Dict[str, Any]] = { - 'light': { - 'figure.facecolor': '#FFFFFF', - 'axes.facecolor': '#FFFFFF', - 'text.color': '#000000', - 'axes.labelcolor': '#000000', - 'axes.edgecolor': '#000000', - 'xtick.color': '#000000', - 'ytick.color': '#000000', - 'grid.color': '#CCCCCC', - 'legend.facecolor': '#FFFFFF', - 'legend.edgecolor': '#B0B0B0', + "light": { + "figure.facecolor": "#FFFFFF", + "axes.facecolor": "#FFFFFF", + "text.color": "#000000", + "axes.labelcolor": "#000000", + "axes.edgecolor": "#000000", + "xtick.color": "#000000", + "ytick.color": "#000000", + "grid.color": "#CCCCCC", + "legend.facecolor": "#FFFFFF", + "legend.edgecolor": "#B0B0B0", }, - 'medium': { - 'figure.facecolor': '#B0B0B0', # A more solid, medium grey - 'axes.facecolor': '#B0B0B0', - 'text.color': '#FFFFFF', # White text (like the dark theme) - 'axes.labelcolor': '#FFFFFF', - 'axes.edgecolor': '#FFFFFF', - 'xtick.color': '#FFFFFF', - 'ytick.color': '#FFFFFF', - 'grid.color': '#E0E0E0', # Lighter grid lines on medium bg - 'legend.facecolor': '#B0B0B0', - 'legend.edgecolor': '#FFFFFF', + "medium": { + "figure.facecolor": "#B0B0B0", # A more solid, medium grey + "axes.facecolor": "#B0B0B0", + "text.color": "#FFFFFF", # White text (like the dark theme) + "axes.labelcolor": "#FFFFFF", + "axes.edgecolor": "#FFFFFF", + "xtick.color": "#FFFFFF", + "ytick.color": "#FFFFFF", + "grid.color": "#E0E0E0", # Lighter grid lines on medium bg + "legend.facecolor": "#B0B0B0", + "legend.edgecolor": "#FFFFFF", + }, + "dark": { + "figure.facecolor": "#2E2E2E", + "axes.facecolor": "#2E2E2E", + "text.color": "#F0F0F0", + "axes.labelcolor": "#F0F0F0", + 
"axes.edgecolor": "#F0F0F0", + "xtick.color": "#F0F0F0", + "ytick.color": "#F0F0F0", + "grid.color": "#6A6A6A", + "legend.facecolor": "#2E2E2E", + "legend.edgecolor": "#F0F0F0", }, - 'dark': { - 'figure.facecolor': '#2E2E2E', - 'axes.facecolor': '#2E2E2E', - 'text.color': '#F0F0F0', - 'axes.labelcolor': '#F0F0F0', - 'axes.edgecolor': '#F0F0F0', - 'xtick.color': '#F0F0F0', - 'ytick.color': '#F0F0F0', - 'grid.color': '#6A6A6A', - 'legend.facecolor': '#2E2E2E', - 'legend.edgecolor': '#F0F0F0', - } } # --- MODULE-LEVEL CONFIGURATION --- # This dictionary will store the settings from setup_publication_style -_CONFIG = { - 'save_plots': False, - 'save_dir': 'figs' -} +_CONFIG = {"save_plots": False, "save_dir": "figs"} + # --- UPDATED FUNCTION --- def setup_publication_style( - theme: str = 'light', + theme: str = "light", quality: int = 300, save_plots: bool = False, # <-- New parameter - save_dir: str = 'figs' # <-- New parameter + save_dir: str = "figs", # <-- New parameter ) -> None: """ Sets the global Matplotlib rcParams and saving configuration. - + Call this function once at the beginning of your notebook. Parameters @@ -85,45 +83,54 @@ def setup_publication_style( save_dir : str, optional The directory to save figures in. Defaults to 'figs'. """ - + # [All the theme parsing and styling code remains the same] # ... (omitted for brevity) ... 
is_transparent = False base_theme_name = theme - if theme.endswith('-transparent'): + if theme.endswith("-transparent"): is_transparent = True - base_theme_name = theme.replace('-transparent', '') + base_theme_name = theme.replace("-transparent", "") if base_theme_name not in THEMES: - base_theme_name = 'light' + base_theme_name = "light" params = THEMES[base_theme_name].copy() common_params = { - 'font.size': 12, 'axes.labelsize': 12, 'axes.titlesize': 14, - 'axes.titleweight': 'bold', 'xtick.labelsize': 12, 'ytick.labelsize': 12, - 'legend.fontsize': 12, 'legend.title_fontsize': 13, - 'figure.dpi': quality, 'savefig.dpi': quality, 'axes.grid': True, - 'grid.linestyle': '--', 'grid.alpha': 0.7, 'axes.linewidth': 1.2, + "font.size": 12, + "axes.labelsize": 12, + "axes.titlesize": 14, + "axes.titleweight": "bold", + "xtick.labelsize": 12, + "ytick.labelsize": 12, + "legend.fontsize": 12, + "legend.title_fontsize": 13, + "figure.dpi": quality, + "savefig.dpi": quality, + "axes.grid": True, + "grid.linestyle": "--", + "grid.alpha": 0.7, + "axes.linewidth": 1.2, } params.update(common_params) if is_transparent: - params['figure.facecolor'] = (0, 0, 0, 0) - params['axes.facecolor'] = (0, 0, 0, 0) - params['savefig.transparent'] = True - params['legend.facecolor'] = (0, 0, 0, 0) + params["figure.facecolor"] = (0, 0, 0, 0) + params["axes.facecolor"] = (0, 0, 0, 0) + params["savefig.transparent"] = True + params["legend.facecolor"] = (0, 0, 0, 0) else: - params['savefig.transparent'] = False + params["savefig.transparent"] = False try: - plt.rc('font', family='Times New Roman') + plt.rc("font", family="Times New Roman") except Exception: logger.warning("Times New Roman not found. 
Defaulting to serif.") - plt.rc('font', family='serif') + plt.rc("font", family="serif") plt.rcParams.update(params) - sns_style = "darkgrid" if base_theme_name == 'dark' else "whitegrid" + sns_style = "darkgrid" if base_theme_name == "dark" else "whitegrid" sns.set_style(sns_style, rc=params) - + # --- Store saving configuration --- - _CONFIG['save_plots'] = save_plots - _CONFIG['save_dir'] = save_dir - + _CONFIG["save_plots"] = save_plots + _CONFIG["save_dir"] = save_dir + logger.info( "Matplotlib style updated. Theme: '%s', Quality: %s DPI.", theme, quality ) @@ -132,13 +139,14 @@ def setup_publication_style( else: logger.info("Plot saving disabled.") + # [apply_plot_style function remains exactly the same] def apply_plot_style( ax: plt.Axes, title: str, xlabel: str, ylabel: str, - legend_title: Optional[str] = None + legend_title: Optional[str] = None, ) -> None: ax.set_title(title) ax.set_xlabel(xlabel) @@ -146,11 +154,9 @@ def apply_plot_style( if ax.get_legend() and legend_title is not None: ax.legend(title=legend_title) + # --- UPDATED FUNCTION --- -def finalize_plot( - fig: fig.Figure, - filename: str -) -> None: +def finalize_plot(fig: fig.Figure, filename: str) -> None: """ Shows the plot and saves it *if* saving was enabled in setup_publication_style. @@ -163,24 +169,24 @@ def finalize_plot( The name of the file (e.g., 'model_performance.png'). This is required, but only used if saving is enabled. """ - + # --- 1. Save the figure if global switch is on --- - if _CONFIG['save_plots']: - save_dir = _CONFIG['save_dir'] - + if _CONFIG["save_plots"]: + save_dir = _CONFIG["save_dir"] + # Create the directory if it doesn't exist os.makedirs(save_dir, exist_ok=True) - + # Construct the full path full_path = os.path.join(save_dir, filename) - + # Save the figure - fig.savefig(full_path, bbox_inches='tight') - + fig.savefig(full_path, bbox_inches="tight") + logger.info("Figure saved to: %s", full_path) - + # --- 2. Always show the plot --- plt.show() - + # --- 3. 
Close the figure object --- - plt.close(fig) \ No newline at end of file + plt.close(fig) diff --git a/RiskLabAI/utils/update_figure_layout.py b/RiskLabAI/utils/update_figure_layout.py index 9ca5f5e..749f9fa 100644 --- a/RiskLabAI/utils/update_figure_layout.py +++ b/RiskLabAI/utils/update_figure_layout.py @@ -6,6 +6,7 @@ import plotly.graph_objects as go from typing import Optional + def update_figure_layout( fig: go.Figure, title: str, @@ -45,7 +46,7 @@ def update_figure_layout( yaxis_title=yaxis_title, template="plotly_dark", plot_bgcolor="rgba(0,0,0,0)", # Transparent background - paper_bgcolor="rgba(0,0,0,0)", # Transparent background + paper_bgcolor="rgba(0,0,0,0)", # Transparent background legend=dict( x=legend_x, y=legend_y, @@ -53,4 +54,4 @@ def update_figure_layout( yanchor="auto", ), ) - return fig \ No newline at end of file + return fig diff --git a/STRUCTURE.md b/STRUCTURE.md deleted file mode 100644 index 65d0f9b..0000000 --- a/STRUCTURE.md +++ /dev/null @@ -1,211 +0,0 @@ -# Project File Structure - -``` -πŸ“ RiskLabAI.py/ -β”œβ”€β”€ πŸ“ docs/ -β”‚ └── πŸ“„ delete -β”œβ”€β”€ πŸ“ RiskLabAI/ -β”‚ β”œβ”€β”€ πŸ“ backtest/ -β”‚ β”‚ β”œβ”€β”€ πŸ“ validation/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ adaptive_combinatorial_purged.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ bagged_combinatorial_purged.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ combinatorial_purged.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ cross_validator_controller.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ cross_validator_factory.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ cross_validator_interface.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ kfold.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ purged_kfold.py -β”‚ β”‚ β”‚ └── πŸ“„ walk_forward.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ backtest_overfitting_simulation.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ backtest_statistics.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ backtest_synthetic_data.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ bet_sizing.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ probabilistic_sharpe_ratio.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ 
probability_of_backtest_overfitting.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ strategy_risk.py -β”‚ β”‚ └── πŸ“„ test_set_overfitting.py -β”‚ β”œβ”€β”€ πŸ“ cluster/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ └── πŸ“„ clustering.py -β”‚ β”œβ”€β”€ πŸ“ controller/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ bars_initializer.py -β”‚ β”‚ └── πŸ“„ data_structure_controller.py -β”‚ β”œβ”€β”€ πŸ“ data/ -β”‚ β”‚ β”œβ”€β”€ πŸ“ denoise/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ └── πŸ“„ denoising.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ differentiation/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ └── πŸ“„ differentiation.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ distance/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ └── πŸ“„ distance_metric.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ labeling/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ financial_labels.py -β”‚ β”‚ β”‚ └── πŸ“„ labeling.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ structures/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_imbalance_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_information_driven_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ abstract_run_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ imbalance_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ run_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ standard_bars.py -β”‚ β”‚ β”‚ └── πŸ“„ time_bars.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ synthetic_data/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ drift_burst_hypothesis.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ simulation.py -β”‚ β”‚ β”‚ └── πŸ“„ synthetic_controlled_environment.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ weights/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ └── πŸ“„ sample_weights.py -β”‚ β”‚ └── πŸ“„ __init__.py -β”‚ β”œβ”€β”€ πŸ“ ensemble/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ bagging_classifier_accuracy.py -β”‚ β”‚ └── πŸ“„ empirical_bagging_accuracy.py -β”‚ β”œβ”€β”€ πŸ“ features/ -β”‚ β”‚ β”œβ”€β”€ πŸ“ entropy_features/ -β”‚ β”‚ β”‚ 
β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ entropy.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ kontoyiannis.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ lempel_ziv.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ plug_in.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ pmf.py -β”‚ β”‚ β”‚ └── πŸ“„ shannon.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ feature_importance/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ clustered_feature_importance_mda.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ clustered_feature_importance_mdi.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_controller.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_factory.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_mda.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_mdi.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_sfi.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ feature_importance_strategy.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ generate_synthetic_data.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ orthogonal_features.py -β”‚ β”‚ β”‚ └── πŸ“„ weighted_tau.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ microstructural_features/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ bekker_parkinson_volatility_estimator.py -β”‚ β”‚ β”‚ └── πŸ“„ corwin_schultz.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ structural_breaks/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”‚ └── πŸ“„ structural_breaks.py -β”‚ β”‚ └── πŸ“„ __init__.py -β”‚ β”œβ”€β”€ πŸ“ hpc/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ └── πŸ“„ hpc.py -β”‚ β”œβ”€β”€ πŸ“ optimization/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ hedging.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ hrp.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ hyper_parameter_tuning.py -β”‚ β”‚ └── πŸ“„ nco.py -β”‚ β”œβ”€β”€ πŸ“ pde/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ equation.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ model.py -β”‚ β”‚ └── πŸ“„ solver.py -β”‚ β”œβ”€β”€ πŸ“ utils/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ __init__.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ constants.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ ewma.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ momentum_mean_reverting_strategy_sides.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ progress.py 
-β”‚ β”‚ β”œβ”€β”€ πŸ“„ publication_plots.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ smoothing_average.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ update_figure_layout.py -β”‚ β”‚ └── πŸ“„ utilities_lopez.py -β”‚ └── πŸ“„ __init__.py -β”œβ”€β”€ πŸ“ test/ -β”‚ β”œβ”€β”€ πŸ“ backtest/ -β”‚ β”‚ β”œβ”€β”€ πŸ“ validation/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_adaptive_combinatorial_purged.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_bagged_combinatorial_purged.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_combinatorial_purged.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_cross_validator_controller.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_cross_validator_factory.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_kfold.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_purged_kfold.py -β”‚ β”‚ β”‚ └── πŸ“„ test_walk_forward.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_backtest_statistics.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_backtest_synthetic_data.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_bet_sizing.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_probabilistic_sharpe_ratio.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_probability_of_backtest_overfitting.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_strategy_risk.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_test_set_overfitting.py -β”‚ β”‚ └── πŸ“„ teste_backtest_overfitting_simulation.py -β”‚ β”œβ”€β”€ πŸ“ cluster/ -β”‚ β”‚ └── πŸ“„ test_clustering.py -β”‚ β”œβ”€β”€ πŸ“ controller/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_bars_initializer.py -β”‚ β”‚ └── πŸ“„ test_data_structure_controller.py -β”‚ β”œβ”€β”€ πŸ“ data/ -β”‚ β”‚ β”œβ”€β”€ πŸ“ denoise/ -β”‚ β”‚ β”‚ └── πŸ“„ test_denoising.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ differentiation/ -β”‚ β”‚ β”‚ └── πŸ“„ test_differentiation.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ distance/ -β”‚ β”‚ β”‚ └── πŸ“„ test_distance_metric.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ labeling/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_financial_labels.py -β”‚ β”‚ β”‚ └── πŸ“„ test_labeling.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ structures/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_imbalance_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_run_bars.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_standard_bars.py -β”‚ β”‚ β”‚ └── πŸ“„ test_time_bars.py -β”‚ β”‚ 
β”œβ”€β”€ πŸ“ synthetic_data/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_drift_burst_hypothesis.py -β”‚ β”‚ β”‚ └── πŸ“„ test_synthetic_controlled_environment.py -β”‚ β”‚ └── πŸ“ weights/ -β”‚ β”‚ └── πŸ“„ test_sample_weights.py -β”‚ β”œβ”€β”€ πŸ“ ensemble/ -β”‚ β”‚ └── πŸ“„ test_bagging_classifier_accuracy.py -β”‚ β”œβ”€β”€ πŸ“ features/ -β”‚ β”‚ β”œβ”€β”€ πŸ“ entropy_features/ -β”‚ β”‚ β”‚ └── πŸ“„ test_entropy.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ feature_importance/ -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_feature_importance.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_generate_synthetic_data.py -β”‚ β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_orthogonal_features.py -β”‚ β”‚ β”‚ └── πŸ“„ test_weighted_tau.py -β”‚ β”‚ β”œβ”€β”€ πŸ“ microstructural_features/ -β”‚ β”‚ β”‚ └── πŸ“„ test_microstructure.py -β”‚ β”‚ └── πŸ“ structural_breaks/ -β”‚ β”‚ └── πŸ“„ test_structural_breaks.py -β”‚ β”œβ”€β”€ πŸ“ hpc/ -β”‚ β”‚ └── πŸ“„ test_hpc.py -β”‚ β”œβ”€β”€ πŸ“ optimization/ -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_hedging.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_hrp.py -β”‚ β”‚ β”œβ”€β”€ πŸ“„ test_hyper_parameter_tuning.py -β”‚ β”‚ └── πŸ“„ test_nco.py -β”‚ β”œβ”€β”€ πŸ“ pde/ -β”‚ β”‚ └── πŸ“„ test_pde_solver.py -β”‚ └── πŸ“ utils/ -β”‚ β”œβ”€β”€ πŸ“„ test_ewma.py -β”‚ β”œβ”€β”€ πŸ“„ test_momentum_mean_reverting_strategy_sides.py -β”‚ └── πŸ“„ test_progress.py -β”œβ”€β”€ πŸ“„ .gitignore -β”œβ”€β”€ πŸ“„ .pypirc -β”œβ”€β”€ πŸ“„ desktop.ini -β”œβ”€β”€ πŸ“„ DOCUMENTATION.md -β”œβ”€β”€ πŸ“„ documenter.py -β”œβ”€β”€ πŸ“„ INSTALLATION.md -β”œβ”€β”€ πŸ“„ LICENSE -β”œβ”€β”€ πŸ“„ pyproject.toml -β”œβ”€β”€ πŸ“„ README.md -β”œβ”€β”€ πŸ“„ STRUCTURE.md -β”œβ”€β”€ πŸ“„ style_guide.md -└── πŸ“„ tree.py -``` diff --git a/test/backtest/test_backtest_overfitting_simulation.py b/test/backtest/test_backtest_overfitting_simulation.py index 1c87d6b..9c50a48 100644 --- a/test/backtest/test_backtest_overfitting_simulation.py +++ b/test/backtest/test_backtest_overfitting_simulation.py @@ -15,21 +15,23 @@ get_cpu_info, ) + @pytest.fixture def sample_prices(): 
"""Fixture for a sample price series.""" return pd.Series( np.cumprod(1 + np.random.normal(0.001, 0.01, 300)), - index=pd.date_range("2020-01-01", periods=300) + index=pd.date_range("2020-01-01", periods=300), ) + def test_local_metric_functions(): """Test the locally defined metric functions.""" returns = pd.Series([0.01, 0.01, 0.01, 0.01]) # Test Sharpe # mean=0.01, std=0 -> SR=0 - assert np.isclose(sharpe_ratio(returns, 0.0), 0.0) - + assert np.isclose(sharpe_ratio(returns, 0.0), 0.0) + returns_var = pd.Series([0.02, -0.01, 0.02, -0.01]) # Test Sortino # mean=0.005, rf=0 @@ -46,8 +48,6 @@ def test_local_metric_functions(): assert np.isclose(es, -0.1) - - def test_financial_features_generation(sample_prices): """Test the financial_features function.""" features = financial_features_backtest_overfitting_simulation( @@ -61,9 +61,9 @@ def test_financial_features_generation(sample_prices): assert features_dropped.shape[0] < sample_prices.shape[0] assert "FracDiff" in features.columns assert "Volatility" in features.columns - assert "Log MACD Histogram" in features.columns + assert "Log MACD Histogram" in features.columns assert "Kumo Breakout" in features.columns - + # Check that noise is applied features_noised = financial_features_backtest_overfitting_simulation( sample_prices, noise_scale=1.0, random_state=42 @@ -71,10 +71,11 @@ def test_financial_features_generation(sample_prices): # Volatility should be different assert not features["Volatility"].equals(features_noised["Volatility"]) + @pytest.mark.skipif(platform.system() == "Windows", reason="lscpu not on Windows") def test_get_cpu_info(): """Test the CPU info function (on non-Windows).""" info = get_cpu_info() assert isinstance(info, dict) assert "Model name" in info - assert "CPU(s)" in info \ No newline at end of file + assert "CPU(s)" in info diff --git a/test/backtest/test_backtest_statistics.py b/test/backtest/test_backtest_statistics.py index 1babc7c..ff111c6 100644 --- 
a/test/backtest/test_backtest_statistics.py +++ b/test/backtest/test_backtest_statistics.py @@ -13,19 +13,27 @@ compute_drawdowns_time_under_water, ) + @pytest.fixture def sample_positions(): """Fixture for a sample position series.""" dates = pd.to_datetime( [ - "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04", - "2020-01-05", "2020-01-06", "2020-01-07", "2020-01-08", + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-04", + "2020-01-05", + "2020-01-06", + "2020-01-07", + "2020-01-08", ] ) # [0, 1, 1, 0, -1, -1, 1, 0] pos = [0, 1, 1, 0, -1, -1, 1, 0] return pd.Series(pos, index=dates) + def test_bet_timing(sample_positions): """Test the bet_timing function.""" # Bets should be at: @@ -33,7 +41,7 @@ def test_bet_timing(sample_positions): # 2020-01-05 (0 -> -1, no, this is not a flip *at* 0) # 2020-01-07 (-1 -> 1, sign flip) # 2020-01-08 (1 -> 0) - + # Rerunning logic: # zero_positions = ['01-01', '01-04', '01-08'] # lagged_non_zero = ['01-02', '01-03', '01-05', '01-06', '01-07'] @@ -47,31 +55,31 @@ def test_bet_timing(sample_positions): # sign_flips < 0 -> ['01-07'] # bets (union) = ['01-04', '01-07', '01-08'] # last day ('01-08') is already in bets. - + expected_dates = pd.to_datetime(["2020-01-04", "2020-01-07", "2020-01-08"]) - + bet_times = bet_timing(sample_positions) pd.testing.assert_index_equal(bet_times, expected_dates) + def test_calculate_holding_period(): """Test calculate_holding_period.""" - dates = pd.to_datetime( - ["2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04"] - ) + dates = pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04"]) # [0, 1, 1, 0] pos = pd.Series([0, 1, 1, 0], index=dates) - + # t=1: pos=1, prev=0, diff=1. diff*prev=0. time_entry = (0*0 + 1*1)/1 = 1 # t=2: pos=1, prev=1, diff=0. diff*prev=0. time_entry = (1*1 + 2*0)/1 = 1 # t=3: pos=0, prev=1, diff=-1. diff*prev=-1. 
(close) # hold_period = (dT=3-1=2, w=abs(-1)=1) - + df, mean_hold = calculate_holding_period(pos) - + assert np.isclose(mean_hold, 2.0) assert df.shape == (1, 2) - assert np.isclose(df['dT'].iloc[0], 2.0) - assert np.isclose(df['w'].iloc[0], 1.0) + assert np.isclose(df["dT"].iloc[0], 2.0) + assert np.isclose(df["w"].iloc[0], 1.0) + def test_calculate_hhi(): """Test the HHI calculation.""" @@ -87,6 +95,7 @@ def test_calculate_hhi(): # n = 4, hhi_norm = (1 - 1/4) / (1 - 1/4) = 1 assert np.isclose(calculate_hhi(returns_conc), 1.0) + def test_compute_drawdowns_time_under_water(): """Test drawdown and time under water calculation.""" dates = pd.date_range("2020-01-01", periods=10) @@ -99,19 +108,19 @@ def test_compute_drawdowns_time_under_water(): # Start='01-05', Stop='01-06', HWM=11, Min=10 # Group 3 (HWM=12): # Start='01-07', Stop='01-10', HWM=12, Min=10 - + dd, tuw = compute_drawdowns_time_under_water(pnl, dollars=True) - + assert len(dd) == 3 assert len(tuw) == 3 - + # Check drawdowns (dollars) - assert np.isclose(dd.loc['2020-01-01'], 2.0) # 10 - 8 - assert np.isclose(dd.loc['2020-01-05'], 1.0) # 11 - 10 - assert np.isclose(dd.loc['2020-01-07'], 2.0) # 12 - 10 - + assert np.isclose(dd.loc["2020-01-01"], 2.0) # 10 - 8 + assert np.isclose(dd.loc["2020-01-05"], 1.0) # 11 - 10 + assert np.isclose(dd.loc["2020-01-07"], 2.0) # 12 - 10 + # Check time under water (in years) days_in_year = 365.25 - assert np.isclose(tuw.loc['2020-01-01'], 3.0 / days_in_year) # 01-04 - 01-01 - assert np.isclose(tuw.loc['2020-01-05'], 1.0 / days_in_year) # 01-06 - 01-05 - assert np.isclose(tuw.loc['2020-01-07'], 3.0 / days_in_year) # 01-10 - 01-07 \ No newline at end of file + assert np.isclose(tuw.loc["2020-01-01"], 3.0 / days_in_year) # 01-04 - 01-01 + assert np.isclose(tuw.loc["2020-01-05"], 1.0 / days_in_year) # 01-06 - 01-05 + assert np.isclose(tuw.loc["2020-01-07"], 3.0 / days_in_year) # 01-10 - 01-07 diff --git a/test/backtest/test_backtest_synthetic_data.py 
b/test/backtest/test_backtest_synthetic_data.py index 8dcbee7..5be229d 100644 --- a/test/backtest/test_backtest_synthetic_data.py +++ b/test/backtest/test_backtest_synthetic_data.py @@ -6,13 +6,14 @@ import pytest from RiskLabAI.backtest.backtest_synthetic_data import synthetic_back_testing + def test_synthetic_back_testing_structure(): """ Test the output structure with minimal iterations. """ pt_range = np.linspace(1, 2, 2) sl_range = np.linspace(1, 2, 2) - + results = synthetic_back_testing( forecast=10.0, half_life=10.0, @@ -21,25 +22,26 @@ def test_synthetic_back_testing_structure(): maximum_holding_period=10, profit_taking_range=pt_range, stop_loss_range=sl_range, - seed=10 + seed=10, ) - + # Expected number of results = len(pt_range) * len(sl_range) = 4 assert len(results) == 4 - + # Check the structure of the first result first_result = results[0] assert isinstance(first_result, tuple) assert len(first_result) == 5 - + # Check the first tuple corresponds to the first (pt, sl) combo assert first_result[0] == 1.0 # profit_taking assert first_result[1] == 1.0 # stop_loss - + # Check types - assert isinstance(first_result[2], float) # mean - assert isinstance(first_result[3], float) # std - assert isinstance(first_result[4], float) # sharpe + assert isinstance(first_result[2], float) # mean + assert isinstance(first_result[3], float) # std + assert isinstance(first_result[4], float) # sharpe + def test_synthetic_back_testing_logic(): """ @@ -47,7 +49,7 @@ def test_synthetic_back_testing_logic(): """ pt_range = np.array([0.5]) sl_range = np.array([0.5]) - + # forecast = 10, seed = 5, half_life = 1 -> rho = 0.5 # P_1 = (1-0.5)*10 + 0.5*5 + 0*gauss = 5 + 2.5 = 7.5 # gain = 7.5 - 5 = 2.5 @@ -55,19 +57,19 @@ def test_synthetic_back_testing_logic(): results = synthetic_back_testing( forecast=10.0, half_life=1.0, # rho = 0.5 - sigma=0.0, # No noise + sigma=0.0, # No noise n_iteration=5, maximum_holding_period=10, profit_taking_range=pt_range, stop_loss_range=sl_range, 
- seed=5 + seed=5, ) - + assert len(results) == 1 mean_ret = results[0][2] std_ret = results[0][3] - + # All 5 iterations should yield the same gain of 2.5 assert np.isclose(mean_ret, 2.5) assert np.isclose(std_ret, 0.0) - assert np.isclose(results[0][4], 0.0) # Sharpe is 0 if std is 0 \ No newline at end of file + assert np.isclose(results[0][4], 0.0) # Sharpe is 0 if std is 0 diff --git a/test/backtest/test_bet_sizing.py b/test/backtest/test_bet_sizing.py index 191451e..3a5a749 100644 --- a/test/backtest/test_bet_sizing.py +++ b/test/backtest/test_bet_sizing.py @@ -43,6 +43,7 @@ def test_avg_active_signals(): # At 2020-01-05: only signal 3 -> -1.0 assert np.isclose(out.loc[pd.Timestamp("2020-01-05")], -1.0) + def test_probability_bet_size(): """Test probability_bet_size function.""" # Prob = 0.5 -> CDF(0) = 0.5 -> 2*0.5 - 1 = 0 @@ -52,12 +53,13 @@ def test_probability_bet_size(): assert np.allclose(sizes, [0, 0, 0]) # Prob > 0 -> size > 0 - probs_high = np.array([norm.ppf(0.75), norm.ppf(0.75)]) # CDF = 0.75 + probs_high = np.array([norm.ppf(0.75), norm.ppf(0.75)]) # CDF = 0.75 sides = np.array([1, -1]) # Expected: [1 * (2*0.75 - 1), -1 * (2*0.75 - 1)] = [0.5, -0.5] sizes_high = probability_bet_size(probs_high, sides) assert np.allclose(sizes_high, [0.5, -0.5]) + def test_average_bet_sizes_numba(): """Test the Numba-jitted average_bet_sizes.""" price_dates = np.arange(10) @@ -68,9 +70,7 @@ def test_average_bet_sizes_numba(): end_dates = np.array([5, 7, 9]) bet_sizes = np.array([1.0, 0.5, -1.0]) - avg_sizes = average_bet_sizes( - price_dates, start_dates, end_dates, bet_sizes - ) + avg_sizes = average_bet_sizes(price_dates, start_dates, end_dates, bet_sizes) # Date 0: [0, 5] -> 1.0 # Date 1: [0, 5] -> 1.0 @@ -82,21 +82,17 @@ def test_average_bet_sizes_numba(): # Date 7: [2, 7] -> 0.5 # Date 8: [8, 9] -> -1.0 # Date 9: [8, 9] -> -1.0 - expected = np.array( - [1.0, 1.0, 0.75, 0.75, 0.75, 0.75, 0.5, 0.5, -1.0, -1.0] - ) + expected = np.array([1.0, 1.0, 0.75, 0.75, 
0.75, 0.75, 0.5, 0.5, -1.0, -1.0]) assert np.allclose(avg_sizes, expected) + def test_strategy_bet_sizing(): """Test the strategy_bet_sizing wrapper.""" price_idx = pd.to_datetime(pd.date_range("2020-01-01", periods=10)) - bet_idx = pd.to_datetime( - ["2020-01-01", "2020-01-03", "2020-01-09"] - ) - + bet_idx = pd.to_datetime(["2020-01-01", "2020-01-03", "2020-01-09"]) + times = pd.Series( - pd.to_datetime(["2020-01-06", "2020-01-08", "2020-01-10"]), - index=bet_idx + pd.to_datetime(["2020-01-06", "2020-01-08", "2020-01-10"]), index=bet_idx ) sides = pd.Series([1, 1, -1], index=bet_idx) # Probs -> CDF(0) = 0.5, CDF(0.674) = 0.75, CDF(0.674) = 0.75 @@ -104,7 +100,7 @@ def test_strategy_bet_sizing(): probs = pd.Series([0.0, norm.ppf(0.75), norm.ppf(0.75)], index=bet_idx) avg_sizes = strategy_bet_sizing(price_idx, times, sides, probs) - + # Dates: 1 2 3 4 5 6 7 8 9 10 # Bet 1 (0): [--------] # Bet 2 (0.5): [----------] @@ -121,24 +117,25 @@ def test_strategy_bet_sizing(): # 8: 0.5 # 9: (-0.5) / 1 = -0.5 # 10: (-0.5) / 1 = -0.5 - + expected_vals = [0, 0, 0.25, 0.25, 0.25, 0.25, 0.5, 0.5, -0.5, -0.5] assert np.allclose(avg_sizes.values, expected_vals) + def test_desprado_bet_sizing_snippets(): """Test snippets 10.4 from de Prado.""" w = 1.0 - x = 0.5 # divergence + x = 0.5 # divergence m = betSize(w, x) # m = 0.5 / sqrt(1 + 0.25) = 0.5 / sqrt(1.25) = 0.4472 assert np.isclose(m, 0.44721359) - + # Test getW w_calc = getW(x, m) assert np.isclose(w, w_calc) - + # Test TPos pos = TPos(w=1.0, f=10.5, acctualPrice=10.0, maximumPositionSize=100) # x = 0.5, m = 0.4472... 
# pos = int(0.4472 * 100) = 44 - assert pos == 44 \ No newline at end of file + assert pos == 44 diff --git a/test/backtest/test_probabilistic_sharpe_ratio.py b/test/backtest/test_probabilistic_sharpe_ratio.py index 19efe9b..6affbf1 100644 --- a/test/backtest/test_probabilistic_sharpe_ratio.py +++ b/test/backtest/test_probabilistic_sharpe_ratio.py @@ -11,6 +11,7 @@ benchmark_sharpe_ratio, ) + def test_probabilistic_sharpe_ratio_normal(): """ Test PSR with normal parameters (skew=0, kurtosis=3). @@ -39,6 +40,7 @@ def test_probabilistic_sharpe_ratio_normal(): ) assert psr < 0.5 + def test_probabilistic_sharpe_ratio_non_normal(): """ Test PSR with non-normal parameters. @@ -57,14 +59,15 @@ def test_probabilistic_sharpe_ratio_non_normal(): benchmark_sharpe_ratio=1.0, number_of_returns=100, skewness_of_returns=-1.0, # Negative skew - kurtosis_of_returns=5.0, # High kurtosis + kurtosis_of_returns=5.0, # High kurtosis ) - + # Denominator (normal) = 1 # Denominator (non-normal) = 1 - (-1)*(2) + (5-1)/4 * (2**2) = 1 + 2 + 4 = 7 # Z-stat (non-normal) will be lower, so PSR will be lower. assert psr_non_normal < psr_normal + def test_probabilistic_sharpe_ratio_statistic(): """ Test the return_test_statistic flag. @@ -80,8 +83,9 @@ def test_probabilistic_sharpe_ratio_statistic(): # Z = (1.5 - 1.0) * sqrt(99) / sqrt(1 - 0 + (3-1)/4 * 1.5**2) # Z = 0.5 * 9.9498 / sqrt(1 + 0.5 * 2.25) # Z = 4.9749 / sqrt(2.125) = 4.9749 / 1.4577 = 3.4127... 
- assert np.isclose(z_stat, 3.4127787539671264) - assert np.isclose(ss.norm.cdf(z_stat), 0.99968) + assert np.isclose(z_stat, 3.4127787539671264) + assert np.isclose(ss.norm.cdf(z_stat), 0.99968) + def test_benchmark_sharpe_ratio(): """ @@ -90,19 +94,20 @@ def test_benchmark_sharpe_ratio(): sr_list = [0.5, 1.0, 1.5, 0.8, 1.2] n_estimates = 5 std_dev = np.std(sr_list) - + bsr = benchmark_sharpe_ratio(sr_list) - + # Manual calculation term1 = (1 - np.euler_gamma) * norm.ppf(1 - 1 / n_estimates) term2 = np.euler_gamma * norm.ppf(1 - 1 / (n_estimates * np.e)) expected_bsr = std_dev * (term1 + term2) - + assert np.isclose(bsr, expected_bsr) + def test_benchmark_sharpe_ratio_edge_cases(): """ Test benchmark_sharpe_ratio with 0 or 1 estimate. """ assert np.isclose(benchmark_sharpe_ratio([]), 0.0) - assert np.isclose(benchmark_sharpe_ratio([1.5]), 1.5) \ No newline at end of file + assert np.isclose(benchmark_sharpe_ratio([1.5]), 1.5) diff --git a/test/backtest/test_probability_of_backtest_overfitting.py b/test/backtest/test_probability_of_backtest_overfitting.py index 5177133..ae87281 100644 --- a/test/backtest/test_probability_of_backtest_overfitting.py +++ b/test/backtest/test_probability_of_backtest_overfitting.py @@ -10,6 +10,7 @@ probability_of_backtest_overfitting, ) + @pytest.fixture def sample_performance_matrix(): """ @@ -31,71 +32,74 @@ def sample_performance_matrix(): ) return matrix + def test_sharpe_ratio_numba(): """Test the Numba-jitted Sharpe ratio.""" returns = np.array([0.1, 0.1, 0.1, 0.1]) # std=0, so SR=0 assert np.isclose(sharpe_ratio(returns, 0.0), 0.0) - + returns_var = np.array([0.1, -0.1, 0.1, -0.1]) # mean=0, std > 0, so SR=0 assert np.isclose(sharpe_ratio(returns_var, 0.0), 0.0) - + returns_pos = np.array([0.1, 0.1, 0.2, 0.0]) # mean=0.1, std > 0, so SR > 0 assert sharpe_ratio(returns_pos, 0.0) > 0 + def test_performance_evaluation(sample_performance_matrix): """Test the performance_evaluation function.""" - train_part = 
sample_performance_matrix[:2, :] # [0.2, 0.2, 0.1], [0.2, 0.2, 0.1] + train_part = sample_performance_matrix[:2, :] # [0.2, 0.2, 0.1], [0.2, 0.2, 0.1] test_part = sample_performance_matrix[2:, :] # [0.2, -0.2, 0.2], [0.2, -0.2, 0.2] n_strat = 3 - + # Train SRs: # S0: mean=0.2, std=0 -> SR=0 # S1: mean=0.2, std=0 -> SR=0 # S2: mean=0.1, std=0 -> SR=0 # `np.argmax` will pick the first one, index 0. - + # Test SRs: # S0: mean=0.2, std=0 -> SR=0 # S1: mean=-0.2, std=0 -> SR=0 # S2: mean=0.2, std=0 -> SR=0 - + # This is a bad example. Let's add variance. train_part = np.array([[0.2, 0.3, 0.1], [0.2, 0.3, 0.1]]) test_part = np.array([[0.2, -0.2, 0.3], [0.2, -0.2, 0.3]]) - + # Train SRs (approx, since std=0): # S0: ~inf (mean=0.2) # S1: ~inf (mean=0.3) -> Best strategy is 1 # S2: ~inf (mean=0.1) - + # Test SRs (approx): # S0: ~inf (mean=0.2) # S1: ~-inf (mean=-0.2) # S2: ~inf (mean=0.3) - + # Test Ranks (from lowest to highest): S1, S0, S2 # Ranks (1-based): [2, 1, 3] # Rank of best IS (S1) is 1. - + # w_bar = 1 / (3 + 1) = 0.25 # logit = log(0.25 / 0.75) = log(1/3) < 0 # is_overfit = True - + is_overfit, logit = performance_evaluation( train_part, test_part, n_strat, sharpe_ratio, 0.0 ) - + assert is_overfit assert logit < 0 + def test_probability_of_backtest_overfitting(sample_performance_matrix): """Test the main PBO function.""" # S=2 (T=4 / 2 = 2 rows per partition) # C(2, 1) = 2 combinations - + # Combo 1: Train=[0,1], Test=[2,3] # (This is the test from test_performance_evaluation) # Train: [[0.2, 0.2, 0.1], [0.2, 0.2, 0.1]] -> Best S0 @@ -103,19 +107,19 @@ def test_probability_of_backtest_overfitting(sample_performance_matrix): # Test SRs: S0(0), S1(0), S2(0). # Ranks: [1, 1, 1]. Rank of S0 is 1. # w_bar = 1 / 4 = 0.25. logit < 0. is_overfit = True - + # Combo 2: Train=[2,3], Test=[0,1] # Train: [[0.2, -0.2, 0.2], [0.2, -0.2, 0.2]] -> Best S0 # Test: [[0.2, 0.2, 0.1], [0.2, 0.2, 0.1]] # Test SRs: S0(0), S1(0), S2(0). # Ranks: [1, 1, 1]. Rank of S0 is 1. 
# w_bar = 1 / 4 = 0.25. logit < 0. is_overfit = True - + pbo, logits = probability_of_backtest_overfitting( sample_performance_matrix, n_partitions=2, n_jobs=1 ) - + # Both combos show overfitting assert np.isclose(pbo, 1.0) assert len(logits) == 2 - assert np.all(logits < 0) \ No newline at end of file + assert np.all(logits < 0) diff --git a/test/backtest/test_strategy_risk.py b/test/backtest/test_strategy_risk.py index 5b0bbfd..83999a1 100644 --- a/test/backtest/test_strategy_risk.py +++ b/test/backtest/test_strategy_risk.py @@ -13,6 +13,7 @@ failure_probability, ) + def test_sharpe_ratio_trials(): """Test sharpe_ratio_trials.""" # With p=0.5, mean should be ~0 @@ -20,39 +21,34 @@ def test_sharpe_ratio_trials(): assert np.isclose(mean, 0.0, atol=0.01) assert np.isclose(std, 1.0, atol=0.01) assert np.isclose(sr, 0.0, atol=0.01) - + # With p=1.0, mean=1, std=0, sr=0 (by implementation) mean, std, sr = sharpe_ratio_trials(p=1.0, n_run=100) assert np.isclose(mean, 1.0) assert np.isclose(std, 0.0) assert np.isclose(sr, 0.0) + def test_binomial_sharpe_ratio(): """Test binomial_sharpe_ratio.""" # 50/50 win/loss sr = binomial_sharpe_ratio( - stop_loss=-0.01, - profit_taking=0.01, - frequency=252, - probability=0.5 + stop_loss=-0.01, profit_taking=0.01, frequency=252, probability=0.5 ) # E[R] = 0.5*0.01 + 0.5*(-0.01) = 0 # Stdev = (0.01 - (-0.01)) * sqrt(0.5*0.5) = 0.02 * 0.5 = 0.01 # SR_trade = 0 / 0.01 = 0 assert np.isclose(sr, 0.0) - + # High precision sr_high_p = binomial_sharpe_ratio( - stop_loss=-0.01, - profit_taking=0.01, - frequency=252, - probability=0.6 + stop_loss=-0.01, profit_taking=0.01, frequency=252, probability=0.6 ) # E[R] = 0.6*0.01 + 0.4*(-0.01) = 0.006 - 0.004 = 0.002 # Stdev = (0.01 - (-0.01)) * sqrt(0.6*0.4) = 0.02 * sqrt(0.24) = 0.009798 # SR_trade = 0.002 / 0.009798 = 0.2041 # SR_annual = 0.2041 * sqrt(252) = 3.24037... 
- assert np.isclose(sr_high_p, 3.240370349, atol=1e-5) # <-- CORRECTED VALUE + assert np.isclose(sr_high_p, 3.240370349, atol=1e-5) # <-- CORRECTED VALUE def test_implied_precision_and_bin_frequency(): @@ -65,23 +61,23 @@ def test_implied_precision_and_bin_frequency(): stop_loss=-0.01, profit_taking=0.01, precision=0.6, - target_sharpe_ratio=3.2403703492039306 # <-- USE CORRECT SR + target_sharpe_ratio=3.2403703492039306, # <-- USE CORRECT SR ) - assert np.isclose(freq, 252, atol=0.01) # <-- TIGHTENED TOLERANCE + assert np.isclose(freq, 252, atol=0.01) # <-- TIGHTENED TOLERANCE # Test implied_precision with sl=0.01 (positive) prec = implied_precision( - stop_loss=0.01, # Positive + stop_loss=0.01, # Positive profit_taking=0.01, frequency=252, - target_sharpe_ratio=3.2396 + target_sharpe_ratio=3.2396, ) # This won't match, the formulas are inconsistent. # Let's re-derive implied_precision from binomial_sharpe_ratio # S = ( (pt-sl)*p + sl ) / ( (pt-sl)*sqrt(p(1-p)) ) * sqrt(f) # S^2 = [ (pt-sl)p + sl ]^2 / [ (pt-sl)^2 * p(1-p) ] * f # S^2 * (pt-sl)^2 * (p - p^2) = f * [ (pt-sl)^2 p^2 + 2*sl*(pt-sl)p + sl^2 ] - # [ S^2(pt-sl)^2 + f(pt-sl)^2 ] p^2 + # [ S^2(pt-sl)^2 + f(pt-sl)^2 ] p^2 # + [ -S^2(pt-sl)^2 + 2*f*sl*(pt-sl) ] p # + [ f*sl^2 ] = 0 # a = (S^2 + f)(pt-sl)^2 @@ -89,29 +85,28 @@ def test_implied_precision_and_bin_frequency(): # c = f*sl^2 # This matches the user's `implied_precision` function, # but it assumes `sl` is *negative*. 
- + prec_recalc = implied_precision( - stop_loss=-0.01, # Pass negative + stop_loss=-0.01, # Pass negative profit_taking=0.01, frequency=252, - target_sharpe_ratio=3.2396 + target_sharpe_ratio=3.2396, ) assert np.isclose(prec_recalc, 0.6, atol=1e-4) + def test_mix_gaussians(): """Test the mix_gaussians function.""" n_obs = 1000 p = 0.5 - mix = mix_gaussians( - mu1=10, mu2=-10, sigma1=1, sigma2=1, probability=p, n_obs=n_obs - ) + mix = mix_gaussians(mu1=10, mu2=-10, sigma1=1, sigma2=1, probability=p, n_obs=n_obs) assert len(mix) == n_obs # Mean should be (10*0.5) + (-10*0.5) = 0 assert np.isclose(np.mean(mix), 0.0, atol=0.5) - + p = 0.8 mix_biased = mix_gaussians( mu1=10, mu2=-10, sigma1=1, sigma2=1, probability=p, n_obs=n_obs ) # Mean should be (10*0.8) + (-10*0.2) = 8 - 2 = 6 - assert np.isclose(np.mean(mix_biased), 6.0, atol=0.5) \ No newline at end of file + assert np.isclose(np.mean(mix_biased), 6.0, atol=0.5) diff --git a/test/backtest/test_test_set_overfitting.py b/test/backtest/test_test_set_overfitting.py index 96c1e6b..23f9565 100644 --- a/test/backtest/test_test_set_overfitting.py +++ b/test/backtest/test_test_set_overfitting.py @@ -16,19 +16,19 @@ strategy_type2_error_probability, ) + def test_expected_max_sharpe_ratio(): """Test E[max SR] calculation.""" # With 1 trial, E[max SR] = mean SR - assert np.isclose( - expected_max_sharpe_ratio(1, 0.5, 1.0), 0.5 - ) - + assert np.isclose(expected_max_sharpe_ratio(1, 0.5, 1.0), 0.5) + # With N trials, E[max SR] > mean SR assert expected_max_sharpe_ratio(10, 0.5, 1.0) > 0.5 - + # Test with 0 trials assert np.isclose(expected_max_sharpe_ratio(0, 0.5, 1.0), 0.0) + def test_generate_max_sharpe_ratios(): """Test the simulation of max SRs.""" n_sims = 100 @@ -37,16 +37,19 @@ def test_generate_max_sharpe_ratios(): n_sims=n_sims, n_trials_list=n_trials_list, std_sharpe_ratio=1.0, - mean_sharpe_ratio=0.0 + mean_sharpe_ratio=0.0, ) - + assert df.shape == (n_sims * len(n_trials_list), 2) - assert 
df['n_trials'].value_counts()[10] == n_sims - assert df['n_trials'].value_counts()[20] == n_sims - + assert df["n_trials"].value_counts()[10] == n_sims + assert df["n_trials"].value_counts()[20] == n_sims + # E[max SR] for N=20 should be > E[max SR] for N=10 - assert df[df['n_trials'] == 20]['max_SR'].mean() > \ - df[df['n_trials'] == 10]['max_SR'].mean() + assert ( + df[df["n_trials"] == 20]["max_SR"].mean() + > df[df["n_trials"] == 10]["max_SR"].mean() + ) + def test_mean_std_error(): """Test the mean_std_error function.""" @@ -55,15 +58,16 @@ def test_mean_std_error(): n_sims1=10, n_trials=[10, 20], std_sharpe_ratio=1.0, - mean_sharpe_ratio=0.0 + mean_sharpe_ratio=0.0, ) - + assert df.shape == (2, 2) - assert 'meanErr' in df.columns - assert 'stdErr' in df.columns + assert "meanErr" in df.columns + assert "stdErr" in df.columns assert 10 in df.index assert 20 in df.index + def test_z_statistics_and_errors(): """Test Z-stat and error probability functions.""" # 1. Z-stat (standard normal) @@ -73,22 +77,20 @@ def test_z_statistics_and_errors(): # Z = (1.96 - 0) * sqrt(999) / sqrt(1 - 0 + (3-1)/4 * 1.96**2) # Z = 61.95 / sqrt(1 + 0.5 * 3.8416) # Z = 61.95 / sqrt(2.9208) = 36.248... - assert np.isclose(z, 36.248321866, atol=1e-5) # <-- CORRECTED VALUE + assert np.isclose(z, 36.248321866, atol=1e-5) # <-- CORRECTED VALUE # 2. Type 1 Error # For z=1.96, alpha should be 0.025 (one-sided) alpha_1 = strategy_type1_error_probability(z=1.96, k=1) assert np.isclose(alpha_1, 1 - norm.cdf(1.96), atol=1e-4) assert np.isclose(alpha_1, 0.025, atol=1e-3) - + # For k=2, alpha_k = 1 - (1-0.025)^2 = 0.049375 alpha_2 = strategy_type1_error_probability(z=1.96, k=2) - assert np.isclose(alpha_2, 1 - (1 - alpha_1)**2, atol=1e-4) + assert np.isclose(alpha_2, 1 - (1 - alpha_1) ** 2, atol=1e-4) # 3. 
Theta - theta = theta_for_type2_error( - sharpe_ratio=1.0, t=100, true_sharpe_ratio=0.5 - ) + theta = theta_for_type2_error(sharpe_ratio=1.0, t=100, true_sharpe_ratio=0.5) # Assert against the correct calculated value assert np.isclose(theta, 4.0620192, atol=1e-5) @@ -96,4 +98,4 @@ def test_z_statistics_and_errors(): beta = strategy_type2_error_probability(alpha_k=alpha_2, k=2, theta=theta) # z_alpha = norm.ppf((1 - 0.049375)**0.5) = norm.ppf(0.975) = 1.96 # beta = norm.cdf(1.96 - 4.97) = norm.cdf(-3.01) - assert np.isclose(beta, norm.cdf(1.96 - theta), atol=1e-3) \ No newline at end of file + assert np.isclose(beta, norm.cdf(1.96 - theta), atol=1e-3) diff --git a/test/backtest/validation/test_adaptive_combinatorial_purged.py b/test/backtest/validation/test_adaptive_combinatorial_purged.py index b143738..d5e1088 100644 --- a/test/backtest/validation/test_adaptive_combinatorial_purged.py +++ b/test/backtest/validation/test_adaptive_combinatorial_purged.py @@ -6,12 +6,14 @@ import pandas as pd import pytest from sklearn.linear_model import LogisticRegression -from itertools import combinations +from itertools import combinations -from RiskLabAI.backtest.validation.adaptive_combinatorial_purged import AdaptiveCombinatorialPurged +from RiskLabAI.backtest.validation.adaptive_combinatorial_purged import ( + AdaptiveCombinatorialPurged, +) -from RiskLabAI.backtest.validation.combinatorial_purged import CombinatorialPurged -from itertools import combinations +from RiskLabAI.backtest.validation.combinatorial_purged import CombinatorialPurged +from itertools import combinations # Re-use the purged k-fold fixture and add an external feature @@ -19,26 +21,15 @@ def sample_data_with_times_and_feature(): """Fixture for sample data with 'times' and 'external_feature'.""" n_samples = 120 - idx = pd.date_range('2020-01-01', periods=n_samples, freq='B') - X = pd.DataFrame( - {'feature1': np.arange(n_samples)}, - index=idx - ) - y = pd.Series( - np.random.randint(0, 2, n_samples), - 
index=idx - ) - times = pd.Series( - idx + pd.DateOffset(days=7), - index=idx - ) + idx = pd.date_range("2020-01-01", periods=n_samples, freq="B") + X = pd.DataFrame({"feature1": np.arange(n_samples)}, index=idx) + y = pd.Series(np.random.randint(0, 2, n_samples), index=idx) + times = pd.Series(idx + pd.DateOffset(days=7), index=idx) # External feature: a sine wave to create predictable quantiles - external_feature = pd.Series( - np.sin(np.linspace(0, 10, n_samples)), - index=idx - ) + external_feature = pd.Series(np.sin(np.linspace(0, 10, n_samples)), index=idx) return X, y, times, external_feature + def test_adaptive_init(sample_data_with_times_and_feature): """Test A-CPCV initialization.""" _, _, times, feature = sample_data_with_times_and_feature @@ -48,63 +39,63 @@ def test_adaptive_init(sample_data_with_times_and_feature): times=times, external_feature=feature, lower_quantile=0.3, - upper_quantile=0.7 + upper_quantile=0.7, ) assert cv.n_splits == 6 assert cv.lower_quantile == 0.3 assert cv.external_feature is not None + def test_adaptive_split_segments(sample_data_with_times_and_feature): """Test the adaptive segment splitting.""" X, _, _, feature = sample_data_with_times_and_feature - + cv_adaptive = AdaptiveCombinatorialPurged( - n_splits=6, n_test_groups=2, times=pd.Series(), # Dummy times - external_feature=feature + n_splits=6, + n_test_groups=2, + times=pd.Series(), # Dummy times + external_feature=feature, ) cv_normal = CombinatorialPurged( - n_splits=6, n_test_groups=2, times=pd.Series() # Dummy times + n_splits=6, n_test_groups=2, times=pd.Series() # Dummy times ) - + adaptive_segments = cv_adaptive._get_split_segments(X, feature) normal_segments = cv_normal._get_split_segments(X) - + # Check that segments are different assert len(adaptive_segments) == 6 assert len(normal_segments) == 6 - + # Normal segments should all be size 20 assert all(len(seg) == 20 for seg in normal_segments) - + # Adaptive segments should NOT all be size 20 adaptive_lengths = 
[len(seg) for seg in adaptive_segments] assert not all(length == 20 for length in adaptive_lengths) - + # Check they still cover all indices assert sum(adaptive_lengths) == 120 - + + def test_adaptive_predictions(sample_data_with_times_and_feature): """Test the backtest_predictions method.""" X, y, times, feature = sample_data_with_times_and_feature cv = AdaptiveCombinatorialPurged( - n_splits=6, - n_test_groups=2, - times=times, - external_feature=feature, - embargo=0.01 + n_splits=6, n_test_groups=2, times=times, external_feature=feature, embargo=0.01 ) model = LogisticRegression() - + preds_dict = cv.backtest_predictions(model, X, y, n_jobs=1) - + # 5 paths assert len(preds_dict) == 5 - assert 'Path 1' in preds_dict - + assert "Path 1" in preds_dict + # Each path's predictions should cover the whole dataset - preds1 = preds_dict['Path 1'] + preds1 = preds_dict["Path 1"] assert len(preds1) == X.shape[0] - + # Check total length of predictions total_preds = sum(len(p) for p in preds_dict.values()) - assert total_preds == 5 * 120 \ No newline at end of file + assert total_preds == 5 * 120 diff --git a/test/backtest/validation/test_bagged_combinatorial_purged.py b/test/backtest/validation/test_bagged_combinatorial_purged.py index 9d5599b..3eedb31 100644 --- a/test/backtest/validation/test_bagged_combinatorial_purged.py +++ b/test/backtest/validation/test_bagged_combinatorial_purged.py @@ -6,47 +6,36 @@ import pandas as pd import pytest from sklearn.linear_model import LogisticRegression, LinearRegression -from typing import List +from typing import List + +from RiskLabAI.backtest.validation.bagged_combinatorial_purged import ( + BaggedCombinatorialPurged, +) -from RiskLabAI.backtest.validation.bagged_combinatorial_purged import BaggedCombinatorialPurged # Re-use the purged k-fold fixture @pytest.fixture def sample_data_with_times(): """Fixture for sample data with a 'times' series.""" n_samples = 120 - idx = pd.date_range('2020-01-01', periods=n_samples, freq='B') - 
X = pd.DataFrame( - {'feature1': np.arange(n_samples)}, - index=idx - ) - y_class = pd.Series( - np.random.randint(0, 2, n_samples), - index=idx - ) - y_reg = pd.Series( - np.random.randn(n_samples), - index=idx - ) - times = pd.Series( - idx + pd.DateOffset(days=7), - index=idx - ) + idx = pd.date_range("2020-01-01", periods=n_samples, freq="B") + X = pd.DataFrame({"feature1": np.arange(n_samples)}, index=idx) + y_class = pd.Series(np.random.randint(0, 2, n_samples), index=idx) + y_reg = pd.Series(np.random.randn(n_samples), index=idx) + times = pd.Series(idx + pd.DateOffset(days=7), index=idx) return X, y_class, y_reg, times + def test_bagged_init(sample_data_with_times): """Test B-CPCV initialization.""" _, _, _, times = sample_data_with_times cv = BaggedCombinatorialPurged( - n_splits=6, - n_test_groups=2, - times=times, - classifier=True, - n_estimators=5 + n_splits=6, n_test_groups=2, times=times, classifier=True, n_estimators=5 ) assert cv.classifier is True assert cv.n_estimators == 5 + def test_bagged_predictions_classifier(sample_data_with_times): """Test B-CPCV predictions with a classifier.""" X, y_class, _, times = sample_data_with_times @@ -56,17 +45,18 @@ def test_bagged_predictions_classifier(sample_data_with_times): times=times, classifier=True, n_estimators=5, - random_state=42 + random_state=42, ) model = LogisticRegression() - + preds_dict = cv.backtest_predictions(model, X, y_class, n_jobs=1) - - assert len(preds_dict) == 5 # 5 paths - preds1 = preds_dict['Path 1'] + + assert len(preds_dict) == 5 # 5 paths + preds1 = preds_dict["Path 1"] assert len(preds1) == X.shape[0] assert np.isin(preds1, [0, 1]).all() + def test_bagged_predictions_regressor(sample_data_with_times): """Test B-CPCV predictions with a regressor.""" X, _, y_reg, times = sample_data_with_times @@ -76,19 +66,17 @@ def test_bagged_predictions_regressor(sample_data_with_times): times=times, classifier=False, n_estimators=5, - random_state=42 + random_state=42, ) model = 
LinearRegression() - + preds_dict = cv.backtest_predictions(model, X, y_reg, n_jobs=1) - - assert len(preds_dict) == 5 # 5 paths - preds1 = preds_dict['Path 1'] + + assert len(preds_dict) == 5 # 5 paths + preds1 = preds_dict["Path 1"] assert len(preds1) == X.shape[0] assert preds1.dtype == float - + # Test that predict_proba fails with pytest.raises(ValueError): - cv.backtest_predictions( - model, X, y_reg, predict_probability=True, n_jobs=1 - ) \ No newline at end of file + cv.backtest_predictions(model, X, y_reg, predict_probability=True, n_jobs=1) diff --git a/test/backtest/validation/test_combinatorial_purged.py b/test/backtest/validation/test_combinatorial_purged.py index d70fed7..c9a2c8c 100644 --- a/test/backtest/validation/test_combinatorial_purged.py +++ b/test/backtest/validation/test_combinatorial_purged.py @@ -11,39 +11,34 @@ from RiskLabAI.backtest.validation.combinatorial_purged import CombinatorialPurged from itertools import combinations + # Re-use the purged k-fold fixture @pytest.fixture def sample_data_with_times(): """Fixture for sample data with a 'times' series.""" - n_samples = 120 # Use 120 for easier division + n_samples = 120 # Use 120 for easier division X = pd.DataFrame( - {'feature1': np.arange(n_samples)}, - index=pd.date_range('2020-01-01', periods=n_samples, freq='B') - ) - y = pd.Series( - np.random.randint(0, 2, n_samples), - index=X.index - ) - times = pd.Series( - X.index + pd.DateOffset(days=7), - index=X.index + {"feature1": np.arange(n_samples)}, + index=pd.date_range("2020-01-01", periods=n_samples, freq="B"), ) + y = pd.Series(np.random.randint(0, 2, n_samples), index=X.index) + times = pd.Series(X.index + pd.DateOffset(days=7), index=X.index) return X, y, times + def test_combinatorial_purged_init(sample_data_with_times): """Test CPCV initialization.""" _, _, times = sample_data_with_times n_splits = 6 n_test_groups = 2 cv = CombinatorialPurged( - n_splits=n_splits, - n_test_groups=n_test_groups, - times=times + 
n_splits=n_splits, n_test_groups=n_test_groups, times=times ) - + assert cv.n_splits == n_splits assert cv.n_test_groups == n_test_groups - assert cv.get_n_splits() == comb(n_splits, n_test_groups) # 15 + assert cv.get_n_splits() == comb(n_splits, n_test_groups) # 15 + def test_path_locations(): """Test the static _path_locations method.""" @@ -52,120 +47,115 @@ def test_path_locations(): # C(4, 2) = 6 combinations combinations_list = list(combinations(range(n_splits), n_test_groups)) assert len(combinations_list) == 6 - + locations = CombinatorialPurged._path_locations(n_splits, combinations_list) - + # n_splits=4, n_test_groups=2. # Total test sets = 4 * C(3, 1) = 12 # Number of paths = C(4, 2) * 2 / 4 * 2 = 6? No... # Number of paths = n_splits - n_test_groups + 1 = 4 - 2 + 1 = 3 - assert len(locations) == 3 - + assert len(locations) == 3 + # Path 1 should have 4 segments assert len(locations[1]) == 4 # Path 3 (last) should have 4 segments assert len(locations[3]) == 4 - + # Check coordinates for Path 1 # (group_idx, split_idx) # Path 1, Group 0: (0, 3) -> combo (1, 2) # Path 1, Group 1: (1, 0) -> combo (0, 2) # Path 1, Group 2: (2, 0) -> combo (0, 1) # ... this depends on the `combinations` order. 
- + # Let's check total segments total_segments = sum(len(loc) for loc in locations.values()) assert total_segments == n_splits * comb(n_splits - 1, n_test_groups - 1) - assert total_segments == 4 * comb(3, 1) # 12 - + assert total_segments == 4 * comb(3, 1) # 12 + + def test_combinatorial_split(sample_data_with_times): """Test the split method.""" X, y, times = sample_data_with_times n_splits = 6 n_test_groups = 2 cv = CombinatorialPurged( - n_splits=n_splits, - n_test_groups=n_test_groups, - times=times, - embargo=0.01 + n_splits=n_splits, n_test_groups=n_test_groups, times=times, embargo=0.01 ) - - n_combinations = comb(n_splits, n_test_groups) # 15 + + n_combinations = comb(n_splits, n_test_groups) # 15 splits = list(cv.split(X, y)) assert len(splits) == n_combinations - + # Test one split - train_idx, test_idx = splits[0] # Combo (0, 1) - + train_idx, test_idx = splits[0] # Combo (0, 1) + # Test indices should be groups 0 and 1 # n_samples = 120, n_splits = 6 -> 20 samples/group assert len(test_idx) == 40 np.testing.assert_array_equal(test_idx, np.arange(40)) - + # Train indices must be purged assert 39 not in train_idx assert np.all(train_idx >= 40) - + + def test_combinatorial_backtest_paths(sample_data_with_times): """Test the backtest_paths method.""" X, y, times = sample_data_with_times n_splits = 6 n_test_groups = 2 cv = CombinatorialPurged( - n_splits=n_splits, - n_test_groups=n_test_groups, - times=times + n_splits=n_splits, n_test_groups=n_test_groups, times=times ) - + paths = cv.backtest_paths(X) - + # Num paths = n_splits - n_test_groups + 1 = 6 - 2 + 1 = 5 assert len(paths) == 5 - assert 'Path 1' in paths - assert 'Path 5' in paths - + assert "Path 1" in paths + assert "Path 5" in paths + # Each path should have n_splits = 6 segments - assert len(paths['Path 1']) == n_splits - + assert len(paths["Path 1"]) == n_splits + # Check a segment # Path 1, Segment 0 - segment = paths['Path 1'][0] - train_idx = segment['Train'] - test_idx = segment['Test'] 
- + segment = paths["Path 1"][0] + train_idx = segment["Train"] + test_idx = segment["Test"] + # Test set is just group 0 np.testing.assert_array_equal(test_idx, np.arange(20)) - + # Train set should be purged against its *combination* # (which combination this is depends on path logic) # But it must be a subset of 0..119 assert train_idx.max() < 120 - + + def test_combinatorial_backtest_predictions(sample_data_with_times): """Test the backtest_predictions method.""" X, y, times = sample_data_with_times n_splits = 6 n_test_groups = 2 cv = CombinatorialPurged( - n_splits=n_splits, - n_test_groups=n_test_groups, - times=times, - embargo=0.01 + n_splits=n_splits, n_test_groups=n_test_groups, times=times, embargo=0.01 ) model = LogisticRegression() - + preds_dict = cv.backtest_predictions(model, X, y, n_jobs=1) - + # 5 paths assert len(preds_dict) == 5 - assert 'Path 1' in preds_dict - + assert "Path 1" in preds_dict + # Each path's predictions should cover the whole dataset - preds1 = preds_dict['Path 1'] + preds1 = preds_dict["Path 1"] assert len(preds1) == X.shape[0] - - preds5 = preds_dict['Path 5'] + + preds5 = preds_dict["Path 5"] assert len(preds5) == X.shape[0] - + # Predictions should be binary - assert np.isin(preds1, [0, 1]).all() \ No newline at end of file + assert np.isin(preds1, [0, 1]).all() diff --git a/test/backtest/validation/test_cross_validator_controller.py b/test/backtest/validation/test_cross_validator_controller.py index 8371ae8..9d97287 100644 --- a/test/backtest/validation/test_cross_validator_controller.py +++ b/test/backtest/validation/test_cross_validator_controller.py @@ -7,13 +7,16 @@ from RiskLabAI.backtest.validation.combinatorial_purged import CombinatorialPurged -from RiskLabAI.backtest.validation.cross_validator_controller import CrossValidatorController +from RiskLabAI.backtest.validation.cross_validator_controller import ( + CrossValidatorController, +) from RiskLabAI.backtest.validation.kfold import KFold + @pytest.fixture def 
dummy_args(): """Dummy 'times' args.""" - idx = pd.date_range('2020-01-01', periods=10) + idx = pd.date_range("2020-01-01", periods=10) times = pd.Series(idx, index=idx) return { "n_splits": 5, @@ -21,32 +24,31 @@ def dummy_args(): "times": times, } + def test_controller_creates_kfold(): """Test controller with KFold.""" - controller = CrossValidatorController( - validator_type='kfold', n_splits=10 - ) + controller = CrossValidatorController(validator_type="kfold", n_splits=10) validator = controller.get_validator() - + assert isinstance(validator, KFold) assert validator.n_splits == 10 + def test_controller_creates_cpcv(dummy_args): """Test controller with CombinatorialPurged.""" controller = CrossValidatorController( - validator_type='combinatorialpurged', - **dummy_args + validator_type="combinatorialpurged", **dummy_args ) validator = controller.get_validator() - + assert isinstance(validator, CombinatorialPurged) assert validator.n_splits == 5 assert validator.n_test_groups == 2 + def test_controller_public_attribute(dummy_args): """Test accessing the validator as a public attribute.""" controller = CrossValidatorController( - validator_type='combinatorialpurged', - **dummy_args + validator_type="combinatorialpurged", **dummy_args ) - assert isinstance(controller.cross_validator, CombinatorialPurged) \ No newline at end of file + assert isinstance(controller.cross_validator, CombinatorialPurged) diff --git a/test/backtest/validation/test_cross_validator_factory.py b/test/backtest/validation/test_cross_validator_factory.py index 8e12155..f4b7290 100644 --- a/test/backtest/validation/test_cross_validator_factory.py +++ b/test/backtest/validation/test_cross_validator_factory.py @@ -7,70 +7,83 @@ import pytest # Import all validator classes -from RiskLabAI.backtest.validation.adaptive_combinatorial_purged import AdaptiveCombinatorialPurged -from RiskLabAI.backtest.validation.bagged_combinatorial_purged import BaggedCombinatorialPurged +from 
RiskLabAI.backtest.validation.adaptive_combinatorial_purged import ( + AdaptiveCombinatorialPurged, +) +from RiskLabAI.backtest.validation.bagged_combinatorial_purged import ( + BaggedCombinatorialPurged, +) from RiskLabAI.backtest.validation.combinatorial_purged import CombinatorialPurged from RiskLabAI.backtest.validation.cross_validator_factory import CrossValidatorFactory from RiskLabAI.backtest.validation.kfold import KFold from RiskLabAI.backtest.validation.purged_kfold import PurgedKFold from RiskLabAI.backtest.validation.walk_forward import WalkForward + @pytest.fixture def dummy_args(): """Dummy 'times' and 'feature' args for complex validators.""" - idx = pd.date_range('2020-01-01', periods=10) + idx = pd.date_range("2020-01-01", periods=10) times = pd.Series(idx, index=idx) feature = pd.Series(np.arange(10), index=idx) return { "n_splits": 5, "n_test_groups": 2, "times": times, - "external_feature": feature + "external_feature": feature, } + def test_factory_kfold(): - cv = CrossValidatorFactory.create_cross_validator('kfold', n_splits=5) + cv = CrossValidatorFactory.create_cross_validator("kfold", n_splits=5) assert isinstance(cv, KFold) assert cv.n_splits == 5 + def test_factory_walkforward(): - cv = CrossValidatorFactory.create_cross_validator('walkforward', n_splits=3, gap=1) + cv = CrossValidatorFactory.create_cross_validator("walkforward", n_splits=3, gap=1) assert isinstance(cv, WalkForward) assert cv.gap == 1 + def test_factory_purgedkfold(dummy_args): cv = CrossValidatorFactory.create_cross_validator( - 'purgedkfold', n_splits=5, times=dummy_args['times'] + "purgedkfold", n_splits=5, times=dummy_args["times"] ) assert isinstance(cv, PurgedKFold) + def test_factory_combinatorialpurged(dummy_args): cv = CrossValidatorFactory.create_cross_validator( - 'combinatorialpurged', **dummy_args + "combinatorialpurged", **dummy_args ) assert isinstance(cv, CombinatorialPurged) + def test_factory_bagged(dummy_args): cv = 
CrossValidatorFactory.create_cross_validator( - 'baggedcombinatorialpurged', **dummy_args + "baggedcombinatorialpurged", **dummy_args ) assert isinstance(cv, BaggedCombinatorialPurged) + def test_factory_adaptive(dummy_args): cv = CrossValidatorFactory.create_cross_validator( - 'adaptivecombinatorialpurged', **dummy_args + "adaptivecombinatorialpurged", **dummy_args ) assert isinstance(cv, AdaptiveCombinatorialPurged) + def test_factory_case_insensitivity(dummy_args): """Test that the factory is case-insensitive.""" cv = CrossValidatorFactory.create_cross_validator( - 'PurgedKFold', n_splits=5, times=dummy_args['times'] + "PurgedKFold", n_splits=5, times=dummy_args["times"] ) assert isinstance(cv, PurgedKFold) + def test_factory_invalid_type(): """Test that an invalid type raises a ValueError.""" with pytest.raises(ValueError) as exc_info: - CrossValidatorFactory.create_cross_validator('invalid_type') - assert "Invalid validator_type: invalid_type" in str(exc_info.value) \ No newline at end of file + CrossValidatorFactory.create_cross_validator("invalid_type") + assert "Invalid validator_type: invalid_type" in str(exc_info.value) diff --git a/test/backtest/validation/test_kfold.py b/test/backtest/validation/test_kfold.py index 9cd5830..bae5807 100644 --- a/test/backtest/validation/test_kfold.py +++ b/test/backtest/validation/test_kfold.py @@ -9,15 +9,19 @@ from RiskLabAI.backtest.validation.kfold import KFold + # Use a fixed sample dataset for all tests @pytest.fixture def sample_data(): """Fixture for sample data.""" n_samples = 100 - X = pd.DataFrame({'feature1': np.arange(n_samples), 'feature2': np.arange(n_samples, 0, -1)}) + X = pd.DataFrame( + {"feature1": np.arange(n_samples), "feature2": np.arange(n_samples, 0, -1)} + ) y = pd.Series(np.random.randint(0, 2, n_samples)) return X, y + def test_kfold_init(): """Test KFold initialization.""" cv = KFold(n_splits=5, shuffle=True, random_seed=42) @@ -26,6 +30,7 @@ def test_kfold_init(): assert cv.random_seed == 42 
assert cv.get_n_splits() == 5 + def test_kfold_split_no_shuffle(sample_data): """Test KFold split without shuffling.""" X, y = sample_data @@ -56,19 +61,20 @@ def test_kfold_split_no_shuffle(sample_data): # Check that all indices are used exactly once as test indices assert len(np.unique(all_test_indices)) == n_samples + def test_kfold_split_shuffle(sample_data): """Test KFold split with shuffling.""" X, y = sample_data n_samples = X.shape[0] n_splits = 5 cv = KFold(n_splits=n_splits, shuffle=True, random_seed=42) - + splits1 = list(cv.split(X, y)) - + # Check for determinism cv_same_seed = KFold(n_splits=n_splits, shuffle=True, random_seed=42) splits2 = list(cv_same_seed.split(X, y)) - + all_test_indices = [] for i in range(n_splits): np.testing.assert_array_equal(splits1[i][0], splits2[i][0]) @@ -82,23 +88,25 @@ def test_kfold_split_shuffle(sample_data): # Check that all indices are covered assert len(np.unique(all_test_indices)) == n_samples + def test_kfold_backtest_paths(sample_data): """Test backtest_paths method.""" X, y = sample_data n_splits = 4 cv = KFold(n_splits=n_splits, shuffle=False) - + paths = cv.backtest_paths(X) - assert 'Path 1' in paths - assert len(paths['Path 1']) == n_splits - - first_fold = paths['Path 1'][0] - assert 'Train' in first_fold - assert 'Test' in first_fold - + assert "Path 1" in paths + assert len(paths["Path 1"]) == n_splits + + first_fold = paths["Path 1"][0] + assert "Train" in first_fold + assert "Test" in first_fold + # Test first fold indices - np.testing.assert_array_equal(first_fold['Test'], np.arange(25)) - np.testing.assert_array_equal(first_fold['Train'], np.arange(25, 100)) + np.testing.assert_array_equal(first_fold["Test"], np.arange(25)) + np.testing.assert_array_equal(first_fold["Train"], np.arange(25, 100)) + def test_kfold_backtest_predictions(sample_data): """Test backtest_predictions method.""" @@ -106,27 +114,28 @@ def test_kfold_backtest_predictions(sample_data): n_splits = 5 cv = KFold(n_splits=n_splits, 
shuffle=False) model = LogisticRegression() - + preds_dict = cv.backtest_predictions(model, X, y, n_jobs=1) - - assert 'Path 1' in preds_dict - preds = preds_dict['Path 1'] + + assert "Path 1" in preds_dict + preds = preds_dict["Path 1"] assert isinstance(preds, np.ndarray) assert len(preds) == X.shape[0] - + # Check that predictions are binary (0 or 1) assert np.isin(preds, [0, 1]).all() + def test_kfold_backtest_predictions_shuffle(sample_data): """Test that shuffled predictions are re-ordered correctly.""" X, y = sample_data n_splits = 5 cv = KFold(n_splits=n_splits, shuffle=True, random_seed=42) model = LogisticRegression() - + preds_dict = cv.backtest_predictions(model, X, y, n_jobs=1) - - assert 'Path 1' in preds_dict - preds = preds_dict['Path 1'] + + assert "Path 1" in preds_dict + preds = preds_dict["Path 1"] assert isinstance(preds, np.ndarray) - assert len(preds) == X.shape[0] \ No newline at end of file + assert len(preds) == X.shape[0] diff --git a/test/backtest/validation/test_purged_kfold.py b/test/backtest/validation/test_purged_kfold.py index 11d0f1f..bde4a69 100644 --- a/test/backtest/validation/test_purged_kfold.py +++ b/test/backtest/validation/test_purged_kfold.py @@ -9,53 +9,50 @@ from RiskLabAI.backtest.validation.purged_kfold import PurgedKFold + @pytest.fixture def sample_data_with_times(): """Fixture for sample data with a 'times' series.""" n_samples = 100 X = pd.DataFrame( - {'feature1': np.arange(n_samples)}, - index=pd.date_range('2020-01-01', periods=n_samples, freq='B') - ) - y = pd.Series( - np.random.randint(0, 2, n_samples), - index=X.index + {"feature1": np.arange(n_samples)}, + index=pd.date_range("2020-01-01", periods=n_samples, freq="B"), ) - + y = pd.Series(np.random.randint(0, 2, n_samples), index=X.index) + # 'times' Series: info starts at index time, ends 5 business days later # 5 business days = 7 calendar days - times = pd.Series( - X.index + pd.DateOffset(days=7), - index=X.index - ) + times = pd.Series(X.index + 
pd.DateOffset(days=7), index=X.index) return X, y, times + def test_purged_kfold_init(sample_data_with_times): """Test PurgedKFold initialization.""" _, _, times = sample_data_with_times cv = PurgedKFold(n_splits=5, times=times, embargo=0.01) - + assert cv.n_splits == 5 assert cv.embargo == 0.01 assert cv.get_n_splits() == 5 assert cv.is_multiple_datasets is False + def test_filtered_training_indices_with_embargo(): """Test the static purging method directly.""" # All data: 100 days, info span is 5 days all_times = pd.Series( - pd.date_range('2020-01-06', periods=100, freq='D'), - index=pd.date_range('2020-01-01', periods=100, freq='D') + pd.date_range("2020-01-06", periods=100, freq="D"), + index=pd.date_range("2020-01-01", periods=100, freq="D"), ) - + # Test set: days 20-30 (iloc 20 to 29) test_times = all_times.iloc[20:30] - + # Test 1: No embargo train_times = PurgedKFold.filtered_training_indices_with_embargo( all_times, test_times, embargo_fraction=0 ) - + # Test set info range: ['2020-01-21', '2020-02-04'] # Purge range: ['2020-01-16' (iloc 15) to '2020-02-04' (iloc 34)] # Purged: 15..34 (inclusive). 34 - 15 + 1 = 20 samples. @@ -78,6 +75,7 @@ def test_filtered_training_indices_with_embargo(): assert all_times.index[36] in train_times_emb.index assert len(train_times_emb) == 79 + def test_purged_kfold_split(sample_data_with_times): """Test the split method.""" X, y, times = sample_data_with_times @@ -86,11 +84,11 @@ def test_purged_kfold_split(sample_data_with_times): splits = list(cv.split(X, y)) assert len(splits) == n_splits - + # --- Test first fold --- train_idx_0, test_idx_0 = splits[0] np.testing.assert_array_equal(test_idx_0, np.arange(0, 20)) - + # Test range: start '2020-01-01', end '2020-02-04' (from iloc 19) # Embargo (1%*100=1 sample): end_iloc = 24. embargoed_iloc = 25. 
# Embargoed end timestamp: times.index[25] = '2020-02-05' @@ -105,7 +103,7 @@ def test_purged_kfold_split(sample_data_with_times): # --- Test last fold --- train_idx_4, test_idx_4 = splits[4] np.testing.assert_array_equal(test_idx_4, np.arange(80, 100)) - + # Test range: start '2020-04-27' (iloc 80), end '2020-05-26' (from iloc 99) # Embargo (1 sample) -> end_val '2020-05-26' is OOB, so embargoed_end is '2020-05-26' # Purge range: ['2020-04-27', '2020-05-26'] @@ -115,23 +113,22 @@ def test_purged_kfold_split(sample_data_with_times): assert 70 in train_idx_4 assert 71 in train_idx_4 # 71 should be in the set assert 74 in train_idx_4 # This is the last valid index - assert 75 not in train_idx_4 # This is the first purged index - assert len(train_idx_4) == 75 # The correct length is 75 - + assert 75 not in train_idx_4 # This is the first purged index + assert len(train_idx_4) == 75 # The correct length is 75 def test_get_train_indices_refactor(sample_data_with_times): """Test the _get_train_indices refactor.""" X, y, times = sample_data_with_times cv = PurgedKFold(n_splits=5, times=times, embargo=0.01) - + test_indices = np.arange(80, 100) train_indices = cv._get_train_indices(test_indices, times, True) - + # Logic is identical to test_purged_kfold_split[4] assert isinstance(train_indices, np.ndarray) assert 70 in train_indices - assert 71 in train_indices # 71 should be in the set - assert 74 in train_indices # This is the last valid index - assert 75 not in train_indices # This is the first purged index - assert len(train_indices) == 75 # The correct length is 75 \ No newline at end of file + assert 71 in train_indices # 71 should be in the set + assert 74 in train_indices # This is the last valid index + assert 75 not in train_indices # This is the first purged index + assert len(train_indices) == 75 # The correct length is 75 diff --git a/test/backtest/validation/test_walk_forward.py b/test/backtest/validation/test_walk_forward.py index 419a2a3..f54e505 100644 --- 
a/test/backtest/validation/test_walk_forward.py +++ b/test/backtest/validation/test_walk_forward.py @@ -9,15 +9,19 @@ from RiskLabAI.backtest.validation.walk_forward import WalkForward + # Re-use the sample data fixture from test_kfold @pytest.fixture def sample_data(): """Fixture for sample data.""" n_samples = 100 - X = pd.DataFrame({'feature1': np.arange(n_samples), 'feature2': np.arange(n_samples, 0, -1)}) + X = pd.DataFrame( + {"feature1": np.arange(n_samples), "feature2": np.arange(n_samples, 0, -1)} + ) y = pd.Series(np.random.randint(0, 2, n_samples)) return X, y + def test_walk_forward_init(): """Test WalkForward initialization.""" cv = WalkForward(n_splits=5, max_train_size=50, gap=2) @@ -26,20 +30,21 @@ def test_walk_forward_init(): assert cv.gap == 2 assert cv.shuffle is False # WalkForward should never shuffle + def test_walk_forward_split_no_gap(sample_data): """Test WalkForward split without gap.""" X, _ = sample_data n_splits = 5 cv = WalkForward(n_splits=n_splits, gap=0) - + splits = list(cv.split(X)) assert len(splits) == n_splits - + # Fold 1 train_idx_0, test_idx_0 = splits[0] np.testing.assert_array_equal(train_idx_0, np.array([], dtype=int)) np.testing.assert_array_equal(test_idx_0, np.arange(0, 20)) - + # Fold 2 train_idx_1, test_idx_1 = splits[1] np.testing.assert_array_equal(train_idx_1, np.arange(0, 20)) @@ -50,21 +55,22 @@ def test_walk_forward_split_no_gap(sample_data): np.testing.assert_array_equal(train_idx_4, np.arange(0, 80)) np.testing.assert_array_equal(test_idx_4, np.arange(80, 100)) + def test_walk_forward_split_with_gap(sample_data): """Test WalkForward split with a gap.""" X, _ = sample_data n_splits = 5 gap = 2 cv = WalkForward(n_splits=n_splits, gap=gap) - + splits = list(cv.split(X)) assert len(splits) == n_splits - + # Fold 1 (test starts at 0, train_end = 0 - 2 = -2) train_idx_0, test_idx_0 = splits[0] np.testing.assert_array_equal(train_idx_0, np.array([], dtype=int)) np.testing.assert_array_equal(test_idx_0, np.arange(0, 
20)) - + # Fold 2 (test starts at 20, train_end = 20 - 2 = 18) train_idx_1, test_idx_1 = splits[1] np.testing.assert_array_equal(train_idx_1, np.arange(0, 18)) @@ -75,24 +81,25 @@ def test_walk_forward_split_with_gap(sample_data): np.testing.assert_array_equal(train_idx_4, np.arange(0, 78)) np.testing.assert_array_equal(test_idx_4, np.arange(80, 100)) + def test_walk_forward_split_with_max_train(sample_data): """Test WalkForward split with max_train_size.""" X, _ = sample_data n_splits = 5 max_train_size = 30 cv = WalkForward(n_splits=n_splits, max_train_size=max_train_size, gap=0) - + splits = list(cv.split(X)) - + # Fold 1 (train_end=0, train_size=0) train_idx_0, _ = splits[0] np.testing.assert_array_equal(train_idx_0, np.array([], dtype=int)) - + # Fold 2 (train_end=20, train_size=20) train_idx_1, _ = splits[1] np.testing.assert_array_equal(train_idx_1, np.arange(0, 20)) assert len(train_idx_1) == 20 - + # Fold 3 (train_end=40, train_size=40, capped at 30) train_idx_2, _ = splits[2] # train_start = 40 - 30 = 10. train_end = 40. 
@@ -105,6 +112,7 @@ def test_walk_forward_split_with_max_train(sample_data): np.testing.assert_array_equal(train_idx_4, np.arange(50, 80)) assert len(train_idx_4) == max_train_size + def test_walk_forward_predictions_with_nan(sample_data): """Test that predictions are np.nan for the first fold with no train data.""" X, y = sample_data @@ -113,11 +121,11 @@ def test_walk_forward_predictions_with_nan(sample_data): model = LogisticRegression() preds_dict = cv.backtest_predictions(model, X, y, n_jobs=1) - preds = preds_dict['Path 1'] - + preds = preds_dict["Path 1"] + assert len(preds) == X.shape[0] - + # First 20 predictions should be NaN assert np.isnan(preds[:20]).all() # Remaining predictions should be valid - assert not np.isnan(preds[20:]).any() \ No newline at end of file + assert not np.isnan(preds[20:]).any() diff --git a/test/cluster/test_clustering.py b/test/cluster/test_clustering.py index bd4d3b9..02fc4c6 100644 --- a/test/cluster/test_clustering.py +++ b/test/cluster/test_clustering.py @@ -12,22 +12,24 @@ cluster_k_means_top, ) + def test_covariance_to_correlation(): """Test correlation matrix derivation.""" cov = np.array([[2.0, 1.0], [1.0, 1.0]]) # std = [sqrt(2), 1] # corr[0,1] = 1 / (sqrt(2) * 1) = 1/sqrt(2) = 0.7071 corr = covariance_to_correlation(cov) - + expected = np.array([[1.0, 1 / np.sqrt(2)], [1 / np.sqrt(2), 1.0]]) assert np.allclose(corr, expected) - + # Test numerical stability cov_unstable = np.array([[1.0, 1.000001], [1.000001, 1.0]]) corr_unstable = covariance_to_correlation(cov_unstable) assert corr_unstable.max() <= 1.0 assert corr_unstable.min() >= -1.0 + @pytest.fixture def block_corr_matrix(): """Generate a known block correlation matrix for testing.""" @@ -37,74 +39,78 @@ def block_corr_matrix(): # With block_size_min=5, this forces two blocks of 5 return corr + def test_random_block_correlation(block_corr_matrix): """Test the block correlation generator.""" corr = block_corr_matrix assert corr.shape == (10, 10) - + # Check that 
intra-block correlation is high intra_block_1 = corr.iloc[0:5, 0:5].to_numpy() intra_block_2 = corr.iloc[5:10, 5:10].to_numpy() - + # Check that inter-block correlation is lower inter_block = corr.iloc[0:5, 5:10].to_numpy() - + # Avg corr inside block (minus diagonal) avg_intra_1 = (intra_block_1.sum() - 5) / (25 - 5) avg_intra_2 = (intra_block_2.sum() - 5) / (25 - 5) - + # Avg corr between blocks avg_inter = inter_block.mean() - + assert avg_intra_1 > avg_inter assert avg_intra_2 > avg_inter + def test_cluster_k_means_base(block_corr_matrix): """Test the base clustering logic.""" corr = block_corr_matrix - + # We know there are 2 blocks, so set max_clusters low corr_sorted, clusters, silhouette = cluster_k_means_base( corr, max_clusters=2, iterations=10, random_state=0 ) - + # Should find 2 clusters assert len(clusters) == 2 - + # Clusters should be the two blocks c0 = clusters[0] c1 = clusters[1] - + block1 = [0, 1, 2, 3, 4] block2 = [5, 6, 7, 8, 9] - + # Convert item names (which are ints) to list c0_names = sorted([int(c) for c in c0]) c1_names = sorted([int(c) for c in c1]) - - assert (c0_names == block1 and c1_names == block2) or \ - (c0_names == block2 and c1_names == block1) - + + assert (c0_names == block1 and c1_names == block2) or ( + c0_names == block2 and c1_names == block1 + ) + assert silhouette.shape == (10,) assert corr_sorted.shape == (10, 10) + def test_cluster_k_means_top(block_corr_matrix): """Test the top-level ONC algorithm.""" corr = block_corr_matrix - + corr_sorted, clusters, silhouette = cluster_k_means_top( corr, max_clusters=10, iterations=10, random_state=0 ) - + # ONC should be stable and find the 2 clusters assert len(clusters) == 2 - - block1 = list(range(5)) - block2 = list(range(5, 10)) + block1 = list(range(5)) + block2 = list(range(5, 10)) c0_names = sorted(clusters[0]) c1_names = sorted(clusters[1]) - assert (c0_names == block1 and c1_names == block2) or \ - (c0_names == block2 and c1_names == block1) \ No newline at end of 
file + assert (c0_names == block1 and c1_names == block2) or ( + c0_names == block2 and c1_names == block1 + ) diff --git a/test/controller/test_bars_initializer.py b/test/controller/test_bars_initializer.py index dfcf051..10f9868 100644 --- a/test/controller/test_bars_initializer.py +++ b/test/controller/test_bars_initializer.py @@ -7,17 +7,20 @@ # Import all bar types to check instance from RiskLabAI.data.structures.imbalance_bars import ( - ExpectedImbalanceBars, FixedImbalanceBars + ExpectedImbalanceBars, + FixedImbalanceBars, ) from RiskLabAI.data.structures.run_bars import ExpectedRunBars, FixedRunBars from RiskLabAI.data.structures.standard_bars import StandardBars from RiskLabAI.data.structures.time_bars import TimeBars + @pytest.fixture def controller(): """Fixture for the BarsInitializerController.""" return BarsInitializerController() + # Get all method names from the controller's map bar_types_to_test = list(BarsInitializerController().method_name_to_method.keys()) expected_classes = { @@ -39,6 +42,7 @@ def controller(): "time_bars": TimeBars, } + @pytest.mark.parametrize("bar_method_name", bar_types_to_test) def test_all_bar_initializers(controller, bar_method_name): """ @@ -47,16 +51,16 @@ def test_all_bar_initializers(controller, bar_method_name): """ # Get the actual initialization method from the controller init_method = controller.method_name_to_method[bar_method_name] - + # Call the method (e.g., controller.initialize_time_bars()) bar_instance = init_method() - + # Check that it's an instance of the correct base class expected_class = expected_classes[bar_method_name] assert isinstance(bar_instance, expected_class) - + # Check specific bar types for imbalance/run bars if bar_method_name.endswith("_imbalance_bars"): assert bar_method_name.split("_")[1] in bar_instance.bar_type elif bar_method_name.endswith("_run_bars"): - assert bar_method_name.split("_")[1] in bar_instance.bar_type \ No newline at end of file + assert 
bar_method_name.split("_")[1] in bar_instance.bar_type diff --git a/test/controller/test_data_structure_controller.py b/test/controller/test_data_structure_controller.py index 64464c5..22e7e87 100644 --- a/test/controller/test_data_structure_controller.py +++ b/test/controller/test_data_structure_controller.py @@ -8,6 +8,7 @@ import io from RiskLabAI.controller.data_structure_controller import Controller + @pytest.fixture def mock_tick_data(): """Fixture for a sample tick DataFrame.""" @@ -15,9 +16,10 @@ def mock_tick_data(): dates = pd.to_datetime(pd.date_range("2020-01-01", periods=n_ticks, freq="ms")) prices = 100 + np.random.randn(n_ticks).cumsum() * 0.1 volumes = np.random.randint(1, 100, n_ticks) - + # The controller expects (datetime, price, volume) - return pd.DataFrame({'datetime': dates, 'price': prices, 'volume': volumes}) + return pd.DataFrame({"datetime": dates, "price": prices, "volume": volumes}) + def test_controller_init(): """Test that the controller and its initializer are created.""" @@ -25,60 +27,58 @@ def test_controller_init(): assert controller.bars_initializer is not None assert "dollar_standard_bars" in controller.bars_initializer.method_name_to_method + def test_controller_read_from_dataframe(mock_tick_data): """Test reading from a DataFrame in batches.""" controller = Controller() batch_size = 20 - + generator = controller.read_batches_from_dataframe(mock_tick_data, batch_size) - + batches = list(generator) - - assert len(batches) == 5 # 100 / 20 = 5 + + assert len(batches) == 5 # 100 / 20 = 5 assert batches[0].shape == (20, 3) pd.testing.assert_frame_equal(batches[0], mock_tick_data.iloc[:20]) + def test_controller_read_from_string(): """Test reading from a CSV string.""" controller = Controller() batch_size = 3 - + # Create a mock CSV file in memory - csv_data = "datetime,price,volume\n" + \ - "2020-01-01T00:00:00,100,10\n" * 7 - + csv_data = "datetime,price,volume\n" + "2020-01-01T00:00:00,100,10\n" * 7 + # Use io.StringIO to simulate 
a file csv_file = io.StringIO(csv_data) - + # We need to mock 'open' to return our string file # This is advanced, so for now we'll test the dataframe method # and assume read_batches_from_string (which uses pd.read_csv) works. - + # A simpler test: create a dummy DataFrame and use read_batches_from_dataframe df = pd.read_csv(io.StringIO(csv_data), parse_dates=[0]) generator = controller.read_batches_from_dataframe(df, batch_size) batches = list(generator) - - assert len(batches) == 3 # 7 rows / 3 = 3 batches (3, 3, 1) + + assert len(batches) == 3 # 7 rows / 3 = 3 batches (3, 3, 1) assert batches[0].shape == (3, 3) assert batches[2].shape == (1, 3) - + def test_controller_handle_input_command(mock_tick_data): """Test the full end-to-end command handling.""" controller = Controller() - + # Use dollar bars with a threshold that will generate a few bars method_name = "dollar_standard_bars" - method_arguments = {"threshold": 10000} # 100 * 50 (avg) = 5000 per tick - + method_arguments = {"threshold": 10000} # 100 * 50 (avg) = 5000 per tick + bars_df = controller.handle_input_command( - method_name, - method_arguments, - input_data=mock_tick_data, - batch_size=20 + method_name, method_arguments, input_data=mock_tick_data, batch_size=20 ) - + assert isinstance(bars_df, pd.DataFrame) assert not bars_df.empty - assert bars_df["Cumulative Dollar Value"].min() >= 10000 \ No newline at end of file + assert bars_df["Cumulative Dollar Value"].min() >= 10000 diff --git a/test/data/denoise/test_denoising.py b/test/data/denoise/test_denoising.py index b31f00c..c307059 100644 --- a/test/data/denoise/test_denoising.py +++ b/test/data/denoise/test_denoising.py @@ -10,9 +10,10 @@ cov_to_corr, corr_to_cov, denoise_cov, - optimal_portfolio + optimal_portfolio, ) + @pytest.fixture def noisy_cov_matrix(): """ @@ -21,80 +22,86 @@ def noisy_cov_matrix(): """ T, N = 100, 50 rng = np.random.default_rng(42) - + # 1. 
Create a true, simple correlation structure (2 factors) factors = rng.normal(size=(T, 2)) loadings = rng.normal(size=(N, 2)) - + # 2. Generate returns with noise true_returns = factors @ loadings.T noise = rng.normal(scale=0.5, size=(T, N)) - + returns = true_returns + noise - + # 3. Calculate the noisy covariance matrix cov = np.cov(returns, rowvar=False) - - return cov, T/N + + return cov, T / N + def test_marcenko_pastur_pdf(): """Test the MP PDF calculation.""" q = 10 variance = 1.0 - + pdf = marcenko_pastur_pdf(variance, q, num_points=100) - + # lambda_min = 1 * (1 - 1/sqrt(10))^2 = 0.467 # lambda_max = 1 * (1 + 1/sqrt(10))^2 = 1.732 - + assert isinstance(pdf, pd.Series) assert np.isclose(pdf.index.min(), 0.4675, atol=1e-3) assert np.isclose(pdf.index.max(), 1.7324, atol=1e-3) assert not pdf.isna().any() + def test_cov_corr_conversion(): """Test that cov_to_corr and corr_to_cov are inverses.""" cov = np.array([[4.0, 1.0], [1.0, 1.0]]) std = np.array([2.0, 1.0]) - + # Test cov -> corr corr = cov_to_corr(cov) expected_corr = np.array([[1.0, 0.5], [0.5, 1.0]]) assert np.allclose(corr, expected_corr) - + # Test corr -> cov cov_new = corr_to_cov(corr, std) assert np.allclose(cov, cov_new) + def test_denoise_cov(noisy_cov_matrix): """Test the end-to-end denoising function.""" cov, q = noisy_cov_matrix - + assert cov.shape == (50, 50) - + # Denoise the matrix cov_denoised = denoise_cov(cov, q, bandwidth=0.01) - + assert cov_denoised.shape == cov.shape - + # Get eigenvalues evals_orig, _ = np.linalg.eigh(cov) evals_denoised, _ = np.linalg.eigh(cov_denoised) - + # The denoising process "clips" the smallest (noise) eigenvalues # and boosts the signal eigenvalues. # The smallest denoised eigenvalue should be larger # than the smallest original one. 
- assert evals_denoised.min() > evals_orig.min() # The largest denoised eigenvalue should be smaller + assert ( + evals_denoised.min() > evals_orig.min() + ) # The largest denoised eigenvalue should be smaller # than the largest original one (as noise variance is removed) assert evals_denoised.max() < evals_orig.max() + def test_optimal_portfolio(noisy_cov_matrix): """Test the optimal portfolio helper.""" cov, q = noisy_cov_matrix - + # GMV portfolio weights = optimal_portfolio(cov, mu=None) - + assert weights.shape == (50,) - assert np.isclose(weights.sum(), 1.0) \ No newline at end of file + assert np.isclose(weights.sum(), 1.0) diff --git a/test/data/differentiation/test_differentiation.py b/test/data/differentiation/test_differentiation.py index ae40ba8..23bdf52 100644 --- a/test/data/differentiation/test_differentiation.py +++ b/test/data/differentiation/test_differentiation.py @@ -15,11 +15,13 @@ fractionally_differentiated_log_price, ) + @pytest.fixture def sample_series(): """A simple linear series.""" return pd.Series(np.arange(1, 21, dtype=float), name="close") + @pytest.fixture def random_walk_series(): """A non-stationary random walk.""" @@ -27,6 +29,7 @@ def random_walk_series(): log_price = np.log(100 + rng.normal(0, 1, 1000).cumsum()) return pd.Series(log_price, name="close") + def test_calculate_weights_std(): """Test standard weights calculation.""" # d=0, w=[1] @@ -38,6 +41,7 @@ def test_calculate_weights_std(): # [w4, w3, w2, w1, w0] = [0, 0, 0, -1, 1] assert np.allclose(w1, [[0], [0], [0], [-1], [1]]) + def test_calculate_weights_ffd(): """Test fixed-width weights calculation.""" # d=0, w=[1] @@ -48,35 +52,40 @@ def test_calculate_weights_ffd(): w1 = calculate_weights_ffd(degree=1, threshold=1e-5) assert np.allclose(w1, [[-1.0], [1.0]]) + def test_fractional_difference_std(sample_series): """Test standard differentiation.""" df = sample_series.to_frame() - + # d=1 should be equivalent to .diff(1) diff_std = fractional_difference_std(df, 
degree=1.0, threshold=0.01) - + # Standard .diff() diff_pd = df.diff(1).dropna() - + # Compare common indices common_idx = diff_std.index.intersection(diff_pd.index) - assert np.allclose(diff_std.loc[common_idx, 'close'], - diff_pd.loc[common_idx, 'close']) + assert np.allclose( + diff_std.loc[common_idx, "close"], diff_pd.loc[common_idx, "close"] + ) + def test_fractional_difference_fixed(sample_series): """Test fixed-width differentiation.""" df = sample_series.to_frame() - + # d=1 should be equivalent to .diff(1) diff_ffd = fractional_difference_fixed(df, degree=1.0, threshold=1e-5) - + # Standard .diff() diff_pd = df.diff(1).dropna() - + # Compare common indices common_idx = diff_ffd.index.intersection(diff_pd.index) - assert np.allclose(diff_ffd.loc[common_idx, 'close'], - diff_pd.loc[common_idx, 'close']) + assert np.allclose( + diff_ffd.loc[common_idx, "close"], diff_pd.loc[common_idx, "close"] + ) + def test_fractional_difference_fixed_single(sample_series): """Test fixed-width differentiation on a single Series.""" @@ -85,27 +94,27 @@ def test_fractional_difference_fixed_single(sample_series): sample_series, degree=1.0, threshold=1e-5 ) diff_pd = sample_series.diff(1).dropna() - + common_idx = diff_ffd.index.intersection(diff_pd.index) - assert np.allclose(diff_ffd.loc[common_idx], - diff_pd.loc[common_idx]) - + assert np.allclose(diff_ffd.loc[common_idx], diff_pd.loc[common_idx]) + # d=0.5 diff_d05 = fractional_difference_fixed_single( - sample_series, degree=0.5, threshold=0.01 + sample_series, degree=0.5, threshold=0.01 ) assert not diff_d05.empty - assert diff_d05.dropna().iloc[0] > 1.0 + assert diff_d05.dropna().iloc[0] > 1.0 + def test_fractionally_differentiated_log_price(random_walk_series): """Test the minimum 'd' finding function.""" # Original series should not be stationary - adf_orig = adfuller(random_walk_series.dropna(), maxlag=1, regression='c') - assert adf_orig[1] > 0.05 # p-value > 0.05 (not stationary) + adf_orig = 
adfuller(random_walk_series.dropna(), maxlag=1, regression="c") + assert adf_orig[1] > 0.05 # p-value > 0.05 (not stationary) # Differentiated series should be stationary differentiated_series = fractionally_differentiated_log_price( random_walk_series, p_value_threshold=0.05 ) - adf_diff = adfuller(differentiated_series.dropna(), maxlag=1, regression='c') - assert adf_diff[1] < 0.05 # p-value < 0.05 (stationary) \ No newline at end of file + adf_diff = adfuller(differentiated_series.dropna(), maxlag=1, regression="c") + assert adf_diff[1] < 0.05 # p-value < 0.05 (stationary) diff --git a/test/data/distance/test_distance_metric.py b/test/data/distance/test_distance_metric.py index fa3b74a..3b4c29c 100644 --- a/test/data/distance/test_distance_metric.py +++ b/test/data/distance/test_distance_metric.py @@ -11,48 +11,52 @@ calculate_distance, ) + @pytest.fixture def sample_arrays(): """Fixture for sample arrays.""" x = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]) y = np.array([1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2]) - z = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]) # Independent + z = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]) # Independent return x, y, z + def test_calculate_number_of_bins(): """Test optimal bin calculation.""" # Univariate bins_uni = calculate_number_of_bins(1000) assert bins_uni > 0 - + # Bivariate bins_bi_high_corr = calculate_number_of_bins(1000, correlation=0.99) bins_bi_low_corr = calculate_number_of_bins(1000, correlation=0.01) - + assert bins_bi_low_corr < bins_bi_high_corr + def test_calculate_variation_of_information(sample_arrays): """Test VI calculation.""" x, y, z = sample_arrays bins = 3 - + # VI(X, X) should be 0 vi_self = calculate_variation_of_information(x, x, bins, norm=True) assert np.isclose(vi_self, 0.0) - + # VI(X, Z) where Z is independent should be H(X) + H(Z) # VI_norm should be 1 vi_indep = calculate_variation_of_information(x, z, bins, norm=True) assert np.isclose(vi_indep, 1.0, atol=0.1) - + # VI(X, Y) should be 
between 0 and 1 vi_partial = calculate_variation_of_information(x, y, bins, norm=True) assert 0 < vi_partial <= 1 + def test_calculate_mutual_information(sample_arrays): """Test MI calculation.""" x, y, z = sample_arrays - + # MI(X, X) = H(X). MI_norm = 1 mi_self = calculate_mutual_information(x, x, norm=True) assert np.isclose(mi_self, 1.0) @@ -61,12 +65,11 @@ def test_calculate_mutual_information(sample_arrays): mi_indep = calculate_mutual_information(x, z, norm=True) assert np.isclose(mi_indep, 0.0, atol=0.1) + def test_calculate_distance(): """Test angular distance calculation.""" - corr = np.array([[1.0, 0.5, 0.0], - [0.5, 1.0, -0.5], - [0.0, -0.5, 1.0]]) - + corr = np.array([[1.0, 0.5, 0.0], [0.5, 1.0, -0.5], [0.0, -0.5, 1.0]]) + # Angular dist_ang = calculate_distance(corr, metric="angular") # d(0,0) = sqrt(0.5 * (1 - 1)) = 0 @@ -75,10 +78,10 @@ def test_calculate_distance(): assert np.isclose(dist_ang[0, 0], 0.0) assert np.isclose(dist_ang[0, 1], 0.5) assert np.isclose(dist_ang[1, 2], np.sqrt(0.75)) - + # Absolute Angular dist_abs_ang = calculate_distance(corr, metric="absolute_angular") # d(1,2) = sqrt(0.5 * (1 - |-0.5|)) = sqrt(0.5 * 0.5) = 0.5 assert np.isclose(dist_abs_ang[0, 0], 0.0) assert np.isclose(dist_abs_ang[0, 1], 0.5) - assert np.isclose(dist_abs_ang[1, 2], 0.5) \ No newline at end of file + assert np.isclose(dist_abs_ang[1, 2], 0.5) diff --git a/test/data/labeling/test_financial_labels.py b/test/data/labeling/test_financial_labels.py index 3492293..6e37de6 100644 --- a/test/data/labeling/test_financial_labels.py +++ b/test/data/labeling/test_financial_labels.py @@ -10,13 +10,14 @@ find_trend_using_trend_scanning, ) + def test_calculate_t_value_linear_regression(): """Test the t-value calculation.""" # Perfect positive trend prices_pos = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) t_val_pos = calculate_t_value_linear_regression(prices_pos) assert t_val_pos > 0 - assert np.isinf(t_val_pos) # OLS stderr is 0 + assert np.isinf(t_val_pos) # OLS stderr is 
0 # Perfect negative trend prices_neg = pd.Series([5.0, 4.0, 3.0, 2.0, 1.0]) @@ -29,43 +30,62 @@ def test_calculate_t_value_linear_regression(): t_val_noisy = calculate_t_value_linear_regression(prices_noisy) assert t_val_noisy > 0 assert np.isfinite(t_val_noisy) - + # No trend prices_flat = pd.Series([2.0, 2.0, 2.0, 2.0, 2.0]) t_val_flat = calculate_t_value_linear_regression(prices_flat) - assert np.isnan(t_val_flat) # StdErr is 0, slope is 0 + assert np.isnan(t_val_flat) # StdErr is 0, slope is 0 + def test_find_trend_using_trend_scanning(): """Test the trend scanning function.""" dates = pd.date_range("2020-01-01", periods=20) prices = pd.Series( [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, # Strong uptrend - 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 # Strong downtrend + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, # Strong uptrend + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, # Strong downtrend ], index=dates, - dtype=float + dtype=float, ) - + # Scan from 2020-01-01. Span [5, 10] # Window [0, 4] (len 5): 1..5 -> t = inf # Window [0, 9] (len 10): 1..10 -> t = inf # It will pick the last one, t_value at '2020-01-10' - + molecule = pd.to_datetime(["2020-01-01", "2020-01-10"]) - span = (5, 11) # span(5, 10) - + span = (5, 11) # span(5, 10) + trends = find_trend_using_trend_scanning(molecule, prices, span) - + # Check event 1 assert trends.loc["2020-01-01", "Trend"] == 1.0 assert trends.loc["2020-01-01", "End Time"] == pd.to_datetime("2020-01-11") assert np.isinf(trends.loc["2020-01-01", "t-Value"]) - + # Check event 2 # Scan from 2020-01-10 (price 10) # Window [9, 13] (len 5): 10, 9, 8, 7, 6 -> t = -inf # Window [9, 18] (len 10): 10..2 -> t = -inf assert trends.loc["2020-01-10", "Trend"] == -1.0 assert trends.loc["2020-01-10", "End Time"] == pd.to_datetime("2020-01-20") - assert np.isinf(trends.loc["2020-01-10", "t-Value"]) \ No newline at end of file + assert np.isinf(trends.loc["2020-01-10", "t-Value"]) diff --git a/test/data/labeling/test_labeling.py 
b/test/data/labeling/test_labeling.py index 970b5e4..ba817a5 100644 --- a/test/data/labeling/test_labeling.py +++ b/test/data/labeling/test_labeling.py @@ -14,20 +14,36 @@ meta_labeling, ) + @pytest.fixture def price_series(): """Fixture for a predictable price series.""" - dates = pd.to_datetime( - pd.date_range("2020-01-01", periods=20, freq="D") - ) + dates = pd.to_datetime(pd.date_range("2020-01-01", periods=20, freq="D")) prices = [ - 10, 11, 12, 13, 14, 15, # Event 1 - 14, 13, 12, 11, 10, # Event 2 - 11, 12, 11, 12, 11, # Noise - 12, 13, 14, 15, # Event 3 + 10, + 11, + 12, + 13, + 14, + 15, # Event 1 + 14, + 13, + 12, + 11, + 10, # Event 2 + 11, + 12, + 11, + 12, + 11, # Noise + 12, + 13, + 14, + 15, # Event 3 ] return pd.Series(prices, index=dates, dtype=float) + def test_symmetric_cusum_filter(price_series): """Test the fixed-threshold CUSUM filter.""" # Threshold of 3 @@ -40,27 +56,27 @@ def test_symmetric_cusum_filter(price_series): pd.testing.assert_index_equal(events, expected_dates) + def test_cusum_filter_dynamic_threshold(price_series): """Test the dynamic-threshold CUSUM filter.""" # Threshold = 2.0 everywhere thresholds = pd.Series(2.0, index=price_series.index) - + # 10->12 (diff=2), 12->13 (diff=1), s_pos=3 > 2. Event at 2020-01-03 # 13->14 (diff=1), 14->15 (diff=1), s_pos=2 # 15->14 (diff=-1), 14->13 (diff=-1), s_neg=-2 # 13->12 (diff=-1), s_neg=-3 < -2. Event at 2020-01-09 - + events = cusum_filter_events_dynamic_threshold(price_series, thresholds) # Note: CUSUM logic is slightly different, it triggers *after* # 10->11 (1), 11->12 (2), 12->13 (3) > 2. Event at 2020-01-04 # 13->14 (1), 14->15 (2) # 15->14 (-1), 14->13 (-2), 13->12 (-3) < -2. 
Event at 2020-01-09 - - expected_dates = pd.to_datetime( - ["2020-01-04", "2020-01-09", "2020-01-18"] - ) + + expected_dates = pd.to_datetime(["2020-01-04", "2020-01-09", "2020-01-18"]) pd.testing.assert_index_equal(events, expected_dates) + def test_daily_volatility(price_series): """Test daily volatility calculation.""" vol = daily_volatility_with_log_returns(price_series, span=5) @@ -68,58 +84,59 @@ def test_daily_volatility(price_series): assert not vol.empty assert vol.name == "std" + def test_vertical_barrier(price_series): """Test the vertical barrier function.""" events = pd.to_datetime(["2020-01-02", "2020-01-10"]) # 2020-01-02 + 5 days = 2020-01-07 # 2020-01-10 + 5 days = 2020-01-15 barriers = vertical_barrier(price_series, events, number_days=5) - + expected_index = pd.to_datetime(["2020-01-02", "2020-01-10"]) expected_values = pd.to_datetime(["2020-01-07", "2020-01-15"]) - + pd.testing.assert_index_equal(barriers.index, expected_index) assert np.all(barriers.values == expected_values) + def test_meta_events_and_labeling(price_series): """Test the triple-barrier and meta-labeling functions.""" time_events = pd.to_datetime(["2020-01-02", "2020-01-08"]) volatility = pd.Series(0.01, index=price_series.index) - ptsl = [2.0, 2.0] # 2 * 0.01 = 0.02 + ptsl = [2.0, 2.0] # 2 * 0.01 = 0.02 return_min = 0.0 num_threads = 1 - + # Event 1: Start 2020-01-02 (price 11) # Price path: 12, 13, 14, 15 # Log returns: log(12/11)=0.087, log(13/11)=0.167, ... # 0.167 > 0.02 (pt). Hit at 2020-01-04. - + # Event 2: Start 2020-01-08 (price 13) # Price path: 12, 11, 10 # Log returns: log(12/13)=-0.08, log(11/13)=-0.167 # -0.167 < -0.02 (sl). Hit at 2020-01-10. 
- + events = meta_events( - price_series, time_events, ptsl, volatility, - return_min, num_threads + price_series, time_events, ptsl, volatility, return_min, num_threads ) - + expected_end_times = pd.to_datetime(["2020-01-03", "2020-01-09"]) assert np.all(events["End Time"] == expected_end_times) - + # Test meta-labeling (long only) labels = meta_labeling(events, price_series) - + # Event 1: 13 / 11 - 1 = 0.18 > 0. Label = 1 # Event 2: 11 / 13 - 1 = -0.15 < 0. Label = -1 assert np.isclose(labels.loc["2020-01-02", "Label"], 1.0) assert np.isclose(labels.loc["2020-01-08", "Label"], -1.0) - + # Test meta-labeling (with side) events["Side"] = pd.Series([1, 1], index=time_events) labels_meta = meta_labeling(events, price_series) - + # Event 1: Return > 0. Label = 1 # Event 2: Return < 0. Label = 0 assert np.isclose(labels_meta.loc["2020-01-02", "Label"], 1.0) - assert np.isclose(labels_meta.loc["2020-01-08", "Label"], 0.0) \ No newline at end of file + assert np.isclose(labels_meta.loc["2020-01-08", "Label"], 0.0) diff --git a/test/data/structures/test_imbalance_bars.py b/test/data/structures/test_imbalance_bars.py index 9e47214..9e340cd 100644 --- a/test/data/structures/test_imbalance_bars.py +++ b/test/data/structures/test_imbalance_bars.py @@ -1,13 +1,16 @@ """ Tests for data/structures/imbalance_bars.py """ + import numpy as np import pandas as pd import pytest from RiskLabAI.data.structures.imbalance_bars import ( - FixedImbalanceBars, ExpectedImbalanceBars + FixedImbalanceBars, + ExpectedImbalanceBars, ) + @pytest.fixture def sample_tick_data_for_imbalance(): """ @@ -18,24 +21,25 @@ def sample_tick_data_for_imbalance(): Cumul. 
Imbalance: 0, 1, 2, 3, 2, 1, 0 """ return [ - (pd.to_datetime('2020-01-01 10:00:00'), 100, 10), - (pd.to_datetime('2020-01-01 10:00:01'), 101, 10), - (pd.to_datetime('2020-01-01 10:00:02'), 102, 10), - (pd.to_datetime('2020-01-01 10:00:03'), 103, 10), # Bar samples here - (pd.to_datetime('2020-01-01 10:00:04'), 102, 10), - (pd.to_datetime('2020-01-01 10:00:05'), 101, 10), - (pd.to_datetime('2020-01-01 10:00:06'), 100, 10), + (pd.to_datetime("2020-01-01 10:00:00"), 100, 10), + (pd.to_datetime("2020-01-01 10:00:01"), 101, 10), + (pd.to_datetime("2020-01-01 10:00:02"), 102, 10), + (pd.to_datetime("2020-01-01 10:00:03"), 103, 10), # Bar samples here + (pd.to_datetime("2020-01-01 10:00:04"), 102, 10), + (pd.to_datetime("2020-01-01 10:00:05"), 101, 10), + (pd.to_datetime("2020-01-01 10:00:06"), 100, 10), ] + def test_fixed_imbalance_bars(sample_tick_data_for_imbalance): """Test FixedImbalanceBars.""" bars = FixedImbalanceBars( - bar_type='tick_imbalance', - initial_estimate_of_expected_n_ticks_in_bar=2, # E[T] = 2 - window_size_for_expected_imbalance_estimation=10, # E[b] window - analyse_thresholds=False + bar_type="tick_imbalance", + initial_estimate_of_expected_n_ticks_in_bar=2, # E[T] = 2 + window_size_for_expected_imbalance_estimation=10, # E[b] window + analyse_thresholds=False, ) - + # Warm-up: E[b] = ewma([0, 1]) approx 0.5 # Threshold = E[T] * |E[b]| = 2 * 0.5 = 1.0 # Tick 0 (price 100): b=0, theta=0. E[b]=nan @@ -45,55 +49,56 @@ def test_fixed_imbalance_bars(sample_tick_data_for_imbalance): # |theta| = 1.0. Condition (>=) is met. # Bar 1 constructed. # Reset: theta=0. - + # Tick 2 (price 102): b=1, theta=1. # E[T] = 2 (fixed). # E[b] = ewma([0, 1, 1]) ~ 0.7 # Threshold = 2 * 0.7 = 1.4 # |theta|=1.0. Condition not met. - + # Tick 3 (price 103): b=1, theta=2. # E[T] = 2 (fixed). # E[b] = ewma([0, 1, 1, 1]) ~ 0.8 # Threshold = 2 * 0.8 = 1.6 # |theta|=2.0. Condition (>=) is met. # Bar 2 constructed. 
- + bar_list = bars.construct_bars_from_data(sample_tick_data_for_imbalance) - + assert len(bar_list) == 2 - assert bar_list[0][9] == 3 # Bar 1 has 3 ticks - assert bar_list[1][9] == 4 # Bar 2 has 4 ticks + assert bar_list[0][9] == 3 # Bar 1 has 3 ticks + assert bar_list[1][9] == 4 # Bar 2 has 4 ticks + def test_expected_imbalance_bars(sample_tick_data_for_imbalance): """Test ExpectedImbalanceBars.""" bars = ExpectedImbalanceBars( - bar_type='tick_imbalance', - initial_estimate_of_expected_n_ticks_in_bar=2, # E[T] = 2 - window_size_for_expected_n_ticks_estimation=10, # E[T] window - window_size_for_expected_imbalance_estimation=10, # E[b] window + bar_type="tick_imbalance", + initial_estimate_of_expected_n_ticks_in_bar=2, # E[T] = 2 + window_size_for_expected_n_ticks_estimation=10, # E[T] window + window_size_for_expected_imbalance_estimation=10, # E[b] window expected_ticks_number_bounds=None, - analyse_thresholds=False + analyse_thresholds=False, ) - + # Tick 1: Bar 1 constructed (same as Fixed test) # E[T] updated: E[T] = ewma([2]) = 2. - + # Tick 2 (price 102): b=1, theta=1. # E[b] = ewma([0, 1, 1]) ~ 0.7 # Threshold = 2 * 0.7 = 1.4 # |theta|=1.0. Condition not met. - + # Tick 3 (price 103): b=1, theta=2. # E[b] = ewma([0, 1, 1, 1]) ~ 0.8 # Threshold = 2 * 0.8 = 1.6 # |theta|=2.0. Condition (>=) is met. # Bar 2 constructed. # E[T] updated: E[T] = ewma([2, 2]) = 2. 
- + bar_list = bars.construct_bars_from_data(sample_tick_data_for_imbalance) - + assert len(bar_list) == 1 - assert bar_list[0][9] == 3 # The one bar has 3 ticks + assert bar_list[0][9] == 3 # The one bar has 3 ticks # You can add the third assertion as well: - # assert bar_list[2][9] == 4 \ No newline at end of file + # assert bar_list[2][9] == 4 diff --git a/test/data/structures/test_run_bars.py b/test/data/structures/test_run_bars.py index f9f73b3..0d1133e 100644 --- a/test/data/structures/test_run_bars.py +++ b/test/data/structures/test_run_bars.py @@ -1,12 +1,12 @@ """ Tests for data/structures/run_bars.py """ + import numpy as np import pandas as pd import pytest -from RiskLabAI.data.structures.run_bars import ( - FixedRunBars, ExpectedRunBars -) +from RiskLabAI.data.structures.run_bars import FixedRunBars, ExpectedRunBars + @pytest.fixture def sample_tick_data_for_run(): @@ -19,29 +19,30 @@ def sample_tick_data_for_run(): Cumul. Sell: 0, 0, 0, 0, 1, 2, 3 """ return [ - (pd.to_datetime('2020-01-01 10:00:00'), 100, 10), - (pd.to_datetime('2020-01-01 10:00:01'), 101, 10), - (pd.to_datetime('2020-01-01 10:00:02'), 102, 10), - (pd.to_datetime('2020-01-01 10:00:03'), 103, 10), # Buy run bar - (pd.to_datetime('2020-01-01 10:00:04'), 102, 10), - (pd.to_datetime('2020-01-01 10:00:05'), 101, 10), - (pd.to_datetime('2020-01-01 10:00:06'), 100, 10), # Sell run bar + (pd.to_datetime("2020-01-01 10:00:00"), 100, 10), + (pd.to_datetime("2020-01-01 10:00:01"), 101, 10), + (pd.to_datetime("2020-01-01 10:00:02"), 102, 10), + (pd.to_datetime("2020-01-01 10:00:03"), 103, 10), # Buy run bar + (pd.to_datetime("2020-01-01 10:00:04"), 102, 10), + (pd.to_datetime("2020-01-01 10:00:05"), 101, 10), + (pd.to_datetime("2020-01-01 10:00:06"), 100, 10), # Sell run bar ] + def test_fixed_run_bars(sample_tick_data_for_run): """Test FixedRunBars.""" bars = FixedRunBars( - bar_type='tick_run', - initial_estimate_of_expected_n_ticks_in_bar=3, # E[T] = 3 - 
window_size_for_expected_imbalance_estimation=10, # E[theta] window - analyse_thresholds=False + bar_type="tick_run", + initial_estimate_of_expected_n_ticks_in_bar=3, # E[T] = 3 + window_size_for_expected_imbalance_estimation=10, # E[theta] window + analyse_thresholds=False, ) - + # Warm-up (Tick 0, 1): # E[T]=3, E[P_buy]=0.5 (initial guess), E[theta_buy]=0.5, E[theta_sell]=nan # Ticks 0, 1, 2: # E[P_buy]=2/3, E[theta_buy]=ewma([1,1])=1, E[theta_sell]=nan - + # Tick 3 (price 103): b=1, buy_theta=3, sell_theta=0 # E[T]=3, E[P_buy]=3/4=0.75, E[theta_buy]=1, E[theta_sell]=nan # buy_thresh = 0.75 * 1 = 0.75 @@ -49,7 +50,7 @@ def test_fixed_run_bars(sample_tick_data_for_run): # threshold = E[T] * E[P_buy] * E[theta_buy] = 3 * 0.75 = 2.25 # max_theta (3) >= 2.25. Bar 1 constructed. # Reset: buy_theta=0, sell_theta=0 - + # Tick 6 (price 100): b=-1, buy_theta=0, sell_theta=3 # E[T]=3 (fixed) # E[P_buy] = ewma([0.75]) = 0.75 @@ -58,8 +59,8 @@ def test_fixed_run_bars(sample_tick_data_for_run): # sell_thresh = (1-0.75) * 1 = 0.25 # threshold = E[T] * max(0.75, 0.25) = 3 * 0.75 = 2.25 # max_theta (3) >= 2.25. Bar 2 constructed. 
- + bar_list = bars.construct_bars_from_data(sample_tick_data_for_run) - + assert len(bar_list) == 1 - assert bar_list[0][9] == 7 # Bar 1 forms at tick 6 (7 total ticks) \ No newline at end of file + assert bar_list[0][9] == 7 # Bar 1 forms at tick 6 (7 total ticks) diff --git a/test/data/structures/test_standard_bars.py b/test/data/structures/test_standard_bars.py index 4ba6a43..7a7f3f1 100644 --- a/test/data/structures/test_standard_bars.py +++ b/test/data/structures/test_standard_bars.py @@ -1,66 +1,69 @@ """ Tests for data/structures/standard_bars.py """ + import numpy as np import pandas as pd import pytest from RiskLabAI.data.structures.standard_bars import StandardBars from RiskLabAI.utils.constants import * + @pytest.fixture def sample_tick_data(): """Fixture for sample tick data.""" # (date_time, price, volume) return [ - (pd.to_datetime('2020-01-01 10:00:00'), 100, 10), # T=1, V=10, D=1000 - (pd.to_datetime('2020-01-01 10:00:01'), 101, 5), # T=2, V=15, D=1505 - (pd.to_datetime('2020-01-01 10:00:02'), 100, 20), # T=3, V=35, D=3505 (Bar 1) - (pd.to_datetime('2020-01-01 10:00:03'), 101, 10), # T=1, V=10, D=1010 - (pd.to_datetime('2020-01-01 10:00:04'), 102, 10), # T=2, V=20, D=2030 - (pd.to_datetime('2020-01-01 10:00:05'), 103, 10), # T=3, V=30, D=3060 - (pd.to_datetime('2020-01-01 10:00:06'), 102, 5), # T=4, V=35, D=3570 (Bar 2) + (pd.to_datetime("2020-01-01 10:00:00"), 100, 10), # T=1, V=10, D=1000 + (pd.to_datetime("2020-01-01 10:00:01"), 101, 5), # T=2, V=15, D=1505 + (pd.to_datetime("2020-01-01 10:00:02"), 100, 20), # T=3, V=35, D=3505 (Bar 1) + (pd.to_datetime("2020-01-01 10:00:03"), 101, 10), # T=1, V=10, D=1010 + (pd.to_datetime("2020-01-01 10:00:04"), 102, 10), # T=2, V=20, D=2030 + (pd.to_datetime("2020-01-01 10:00:05"), 103, 10), # T=3, V=30, D=3060 + (pd.to_datetime("2020-01-01 10:00:06"), 102, 5), # T=4, V=35, D=3570 (Bar 2) ] + def test_tick_bars(sample_tick_data): """Test standard tick bars.""" bars = StandardBars(bar_type=CUMULATIVE_TICKS, 
threshold=3) bar_list = bars.construct_bars_from_data(sample_tick_data) - + assert len(bar_list) == 2 - + # Check Bar 1 # [dt, idx, open, high, low, close, vol, buy_vol, sell_vol, ticks, dollar, thresh] - assert bar_list[0][0] == pd.to_datetime('2020-01-01 10:00:02') # end time - assert bar_list[0][2] == 100 # open - assert bar_list[0][3] == 101 # high - assert bar_list[0][4] == 100 # low - assert bar_list[0][5] == 100 # close - assert bar_list[0][9] == 3 # ticks - - # Check Bar 2 - assert bar_list[1][0] == pd.to_datetime('2020-01-01 10:00:05') - assert bar_list[1][2] == 101 # open - assert bar_list[1][3] == 103 # high - assert bar_list[1][4] == 101 # low - assert bar_list[1][5] == 103 # close - assert bar_list[1][9] == 3 # ticks + assert bar_list[0][0] == pd.to_datetime("2020-01-01 10:00:02") # end time + assert bar_list[0][2] == 100 # open + assert bar_list[0][3] == 101 # high + assert bar_list[0][4] == 100 # low + assert bar_list[0][5] == 100 # close + assert bar_list[0][9] == 3 # ticks + # Check Bar 2 + assert bar_list[1][0] == pd.to_datetime("2020-01-01 10:00:05") + assert bar_list[1][2] == 101 # open + assert bar_list[1][3] == 103 # high + assert bar_list[1][4] == 101 # low + assert bar_list[1][5] == 103 # close + assert bar_list[1][9] == 3 # ticks def test_volume_bars(sample_tick_data): """Test standard volume bars.""" bars = StandardBars(bar_type=CUMULATIVE_VOLUME, threshold=35) bar_list = bars.construct_bars_from_data(sample_tick_data) - + assert len(bar_list) == 2 - assert bar_list[0][6] == 35 # cumulative_volume - assert bar_list[1][6] == 35 # cumulative_volume + assert bar_list[0][6] == 35 # cumulative_volume + assert bar_list[1][6] == 35 # cumulative_volume + def test_dollar_bars(sample_tick_data): """Test standard dollar bars.""" bars = StandardBars(bar_type=CUMULATIVE_DOLLAR, threshold=3500) bar_list = bars.construct_bars_from_data(sample_tick_data) - + assert len(bar_list) == 2 - assert bar_list[0][10] == 3505 # cumulative_dollar - assert 
bar_list[1][10] == 3570 # cumulative_dollar \ No newline at end of file + assert bar_list[0][10] == 3505 # cumulative_dollar + assert bar_list[1][10] == 3570 # cumulative_dollar diff --git a/test/data/structures/test_time_bars.py b/test/data/structures/test_time_bars.py index 059737b..d8d4a8e 100644 --- a/test/data/structures/test_time_bars.py +++ b/test/data/structures/test_time_bars.py @@ -1,49 +1,60 @@ """ Tests for data/structures/time_bars.py """ + import numpy as np import pandas as pd import pytest from RiskLabAI.data.structures.time_bars import TimeBars + @pytest.fixture def sample_tick_data_for_time(): """Fixture for sample tick data with fine timestamps.""" return [ - (pd.to_datetime('2020-01-01 10:00:00.100'), 100, 10), - (pd.to_datetime('2020-01-01 10:00:00.500'), 101, 5), - (pd.to_datetime('2020-01-01 10:00:01.200'), 100, 20), # Bar 1 ends, Bar 2 starts - (pd.to_datetime('2020-01-01 10:00:01.800'), 101, 10), - (pd.to_datetime('2020-01-01 10:00:02.100'), 102, 10), # Bar 2 ends, Bar 3 starts - (pd.to_datetime('2020-01-01 10:00:02.500'), 103, 10), + (pd.to_datetime("2020-01-01 10:00:00.100"), 100, 10), + (pd.to_datetime("2020-01-01 10:00:00.500"), 101, 5), + ( + pd.to_datetime("2020-01-01 10:00:01.200"), + 100, + 20, + ), # Bar 1 ends, Bar 2 starts + (pd.to_datetime("2020-01-01 10:00:01.800"), 101, 10), + ( + pd.to_datetime("2020-01-01 10:00:02.100"), + 102, + 10, + ), # Bar 2 ends, Bar 3 starts + (pd.to_datetime("2020-01-01 10:00:02.500"), 103, 10), ] + def test_time_bars(sample_tick_data_for_time): """Test time bars with 1-second resolution.""" # 1-second bars - bars = TimeBars(resolution_type='S', resolution_units=1) + bars = TimeBars(resolution_type="S", resolution_units=1) bar_list = bars.construct_bars_from_data(sample_tick_data_for_time) - + assert len(bar_list) == 2 - + # Bar 1 (from 10:00:00.000 to 10:00:01.000) # Ticks: 0, 1 # End time: 10:00:01.000 # Open: 100, High: 101, Low: 100, Close: 101 - assert bar_list[0][0] == 
pd.to_datetime('2020-01-01 10:00:01') # end time - assert bar_list[0][2] == 100 # open - assert bar_list[0][3] == 101 # high - assert bar_list[0][4] == 100 # low - assert bar_list[0][5] == 101 # close (from tick 1) - assert bar_list[0][9] == 2 # ticks - + assert bar_list[0][0] == pd.to_datetime("2020-01-01 10:00:01") # end time + assert bar_list[0][2] == 100 # open + assert bar_list[0][3] == 101 # high + assert bar_list[0][4] == 100 # low + assert bar_list[0][5] == 101 # close (from tick 1) + assert bar_list[0][9] == 2 # ticks + # Bar 2 (from 10:00:01.000 to 10:00:02.000) # Ticks: 2, 3 # End time: 10:00:02.000 # Open: 100, High: 101, Low: 100, Close: 101 - assert bar_list[1][0] == pd.to_datetime('2020-01-01 10:00:02') - assert bar_list[1][2] == 100 # open - assert bar_list[1][3] == 101 # high - assert bar_list[1][4] == 100 # low - assert bar_list[1][5] == 101 # close (from tick 3) - assert bar_list[1][9] == 2 # ticks \ No newline at end of file + assert bar_list[1][0] == pd.to_datetime("2020-01-01 10:00:02") + assert bar_list[1][2] == 100 # open + assert bar_list[1][3] == 101 # high + assert bar_list[1][4] == 100 # low + assert bar_list[1][5] == 101 # close (from tick 3) + assert bar_list[1][9] == 2 # ticks diff --git a/test/data/synthetic_data/test_drift_burst_hypothesis.py b/test/data/synthetic_data/test_drift_burst_hypothesis.py index bc172a3..69943a6 100644 --- a/test/data/synthetic_data/test_drift_burst_hypothesis.py +++ b/test/data/synthetic_data/test_drift_burst_hypothesis.py @@ -6,58 +6,70 @@ import pytest from RiskLabAI.data.synthetic_data.drift_burst_hypothesis import drift_volatility_burst + def test_drift_volatility_burst_shape(): """Test the output shape.""" drifts, vols = drift_volatility_burst( bubble_length=100, - a_before=1, a_after=1, - b_before=1, b_after=1, - alpha=0.5, beta=0.5 + a_before=1, + a_after=1, + b_before=1, + b_after=1, + alpha=0.5, + beta=0.5, ) assert drifts.shape == (100,) assert vols.shape == (100,) + def 
test_drift_volatility_burst_midpoint_handling(): """Test that the midpoint explosion is handled.""" # bubble_length=101 creates a perfect midpoint at index 50 drifts, vols = drift_volatility_burst( bubble_length=101, - a_before=1, a_after=1, - b_before=1, b_after=1, - alpha=0.5, beta=0.5, - explosion_filter_width=0.01 # Small width + a_before=1, + a_after=1, + b_before=1, + b_after=1, + alpha=0.5, + beta=0.5, + explosion_filter_width=0.01, # Small width ) - + midpoint_index = 50 - + # Check that steps[50] is 0.5 steps = np.linspace(0, 1, 101) assert np.isclose(steps[midpoint_index], 0.5) - + # Drift at midpoint should be 0 assert np.isclose(drifts[midpoint_index], 0.0) - + # Volatility at midpoint should be copied from [49] assert np.isclose(vols[midpoint_index], vols[midpoint_index - 1]) - + # Check that values just before midpoint use the filter width # step[49] = 0.49 # denominator = abs(0.49 - 0.5) = 0.01 # expected vol = 1 / sqrt(0.01) = 10 assert np.isclose(vols[midpoint_index - 1], 1.0 / np.sqrt(0.01)) + def test_drift_volatility_burst_asymmetry(): """Test that before/after parameters are used correctly.""" drifts, vols = drift_volatility_burst( bubble_length=101, - a_before=1, a_after=2, - b_before=3, b_after=4, - alpha=1.0, beta=1.0, - explosion_filter_width=0.1 + a_before=1, + a_after=2, + b_before=3, + b_after=4, + alpha=1.0, + beta=1.0, + explosion_filter_width=0.1, ) - + midpoint_index = 50 - + # Check 'a' and 'b' values before midpoint # denominator at [49] = 0.1 (due to filter) # drift = a_before / denom = 1 / 0.1 = 10 @@ -70,4 +82,4 @@ def test_drift_volatility_burst_asymmetry(): # drift = a_after / denom = 2 / 0.1 = 20 # vol = b_after / denom = 4 / 0.1 = 40 assert np.isclose(drifts[midpoint_index + 1], 2.0 / 0.1) - assert np.isclose(vols[midpoint_index + 1], 4.0 / 0.1) \ No newline at end of file + assert np.isclose(vols[midpoint_index + 1], 4.0 / 0.1) diff --git a/test/data/synthetic_data/test_synthetic_controlled_environment.py 
b/test/data/synthetic_data/test_synthetic_controlled_environment.py index 9d35533..eb6a249 100644 --- a/test/data/synthetic_data/test_synthetic_controlled_environment.py +++ b/test/data/synthetic_data/test_synthetic_controlled_environment.py @@ -11,68 +11,83 @@ parallel_generate_prices, ) + @pytest.fixture def sample_regimes(): """Fixture for a simple two-regime model.""" regimes = { "calm": { - "mu": 0.05, "kappa": 1.0, "theta": 0.04, "xi": 0.1, - "rho": -0.5, "lam": 0.05, "m": -0.01, "v": 0.02 + "mu": 0.05, + "kappa": 1.0, + "theta": 0.04, + "xi": 0.1, + "rho": -0.5, + "lam": 0.05, + "m": -0.01, + "v": 0.02, }, "crisis": { - "mu": -0.1, "kappa": 0.5, "theta": 0.2, "xi": 0.3, - "rho": -0.8, "lam": 0.2, "m": -0.05, "v": 0.1, + "mu": -0.1, + "kappa": 0.5, + "theta": 0.2, + "xi": 0.3, + "rho": -0.8, + "lam": 0.2, + "m": -0.05, + "v": 0.1, # Test list-based params - "v": [0.1, 0.15] # 2-step regime - } + "v": [0.1, 0.15], # 2-step regime + }, } # P(calm->calm)=0.9, P(crisis->crisis)=0.8 transition_matrix = np.array([[0.9, 0.1], [0.2, 0.8]]) return regimes, transition_matrix + def test_align_params_length(): """Test the parameter alignment helper.""" params = {"mu": 0.1, "v": [0.02, 0.03], "xi": 0.5} - + aligned_params, max_len = align_params_length(params) - + assert max_len == 2 assert aligned_params["mu"] == [0.1, 0.1] assert aligned_params["v"] == [0.02, 0.03] assert aligned_params["xi"] == [0.5, 0.5] + def test_generate_prices_from_regimes(sample_regimes): """Test the single-path price generation.""" regimes, tm = sample_regimes n_steps = 100 - + prices, regime_path = generate_prices_from_regimes( regimes, tm, total_time=1.0, n_steps=n_steps, random_state=42 ) - + assert isinstance(prices, pd.Series) assert prices.shape == (n_steps,) assert isinstance(regime_path, np.ndarray) assert regime_path.shape == (n_steps,) - + assert prices.isna().sum() == 0 assert all(r in regimes for r in regime_path) + def test_parallel_generate_prices(sample_regimes): """Test the 
parallel price generation.""" regimes, tm = sample_regimes n_steps = 50 n_paths = 4 - + prices_df, regimes_df = parallel_generate_prices( - n_paths, regimes, tm, total_time=1.0, - n_steps=n_steps, random_state=42, n_jobs=2 + n_paths, regimes, tm, total_time=1.0, n_steps=n_steps, random_state=42, n_jobs=2 ) - + assert isinstance(prices_df, pd.DataFrame) assert prices_df.shape == (n_steps, n_paths) assert isinstance(regimes_df, pd.DataFrame) assert regimes_df.shape == (n_steps, n_paths) - + # Check that paths are different - assert not prices_df[0].equals(prices_df[1]) \ No newline at end of file + assert not prices_df[0].equals(prices_df[1]) diff --git a/test/data/weights/test_sample_weights.py b/test/data/weights/test_sample_weights.py index ef97732..8233129 100644 --- a/test/data/weights/test_sample_weights.py +++ b/test/data/weights/test_sample_weights.py @@ -12,29 +12,29 @@ calculate_time_decay, ) + @pytest.fixture def sample_events(): """Fixture for sample events and price index.""" close_index = pd.to_datetime(pd.date_range("2020-01-01", periods=10)) - + # Event 1: [0, 4] # Event 2: [2, 6] # Event 3: [8, 9] timestamp = pd.Series( pd.to_datetime(["2020-01-05", "2020-01-07", "2020-01-10"]), - index=pd.to_datetime(["2020-01-01", "2020-01-03", "2020-01-09"]) + index=pd.to_datetime(["2020-01-01", "2020-01-03", "2020-01-09"]), ) molecule = timestamp.index return close_index, timestamp, molecule + def test_expand_label_for_meta_labeling(sample_events): """Test the concurrency calculation.""" close_index, timestamp, molecule = sample_events - - concurrency = expand_label_for_meta_labeling( - close_index, timestamp, molecule - ) - + + concurrency = expand_label_for_meta_labeling(close_index, timestamp, molecule) + # Concurrency: # 01-01: 1 # 01-02: 1 @@ -48,13 +48,12 @@ def test_expand_label_for_meta_labeling(sample_events): # 01-10: 1 expected_values = [1, 1, 2, 2, 2, 1, 1, 0, 1, 1] expected_index = pd.to_datetime(pd.date_range("2020-01-01", periods=10)) - + 
pd.testing.assert_series_equal( - concurrency, - pd.Series(expected_values, index=expected_index), - check_dtype=False + concurrency, pd.Series(expected_values, index=expected_index), check_dtype=False ) + def test_calculate_average_uniqueness(): """Test average uniqueness calculation.""" # T=4, N=3 @@ -63,54 +62,52 @@ def test_calculate_average_uniqueness(): # Event 2: [0, 1] idx_matrix = pd.DataFrame( [ - [1, 0, 1], # t=0, c=2 - [1, 1, 1], # t=1, c=3 - [1, 1, 0], # t=2, c=2 - [0, 1, 0] # t=3, c=1 + [1, 0, 1], # t=0, c=2 + [1, 1, 1], # t=1, c=3 + [1, 1, 0], # t=2, c=2 + [0, 1, 0], # t=3, c=1 ] ) - # Uniqueness = + # Uniqueness = # [1/2, 0, 1/2] # [1/3, 1/3, 1/3] # [1/2, 1/2, 0] # [0, 1, 0] - + # Avg Uniqueness (by column): # E0: (1/2 + 1/3 + 1/2) / 3 = (0.5 + 0.333 + 0.5) / 3 = 1.333 / 3 = 0.444 # E1: (1/3 + 1/2 + 1) / 3 = (0.333 + 0.5 + 1) / 3 = 1.833 / 3 = 0.611 # E2: (1/2 + 1/3) / 2 = (0.5 + 0.333) / 2 = 0.833 / 2 = 0.416 - + avg_u = calculate_average_uniqueness(idx_matrix) - - assert np.isclose(avg_u[0], (0.5 + 1/3 + 0.5) / 3) - assert np.isclose(avg_u[1], (1/3 + 0.5 + 1) / 3) - assert np.isclose(avg_u[2], (0.5 + 1/3) / 2) + + assert np.isclose(avg_u[0], (0.5 + 1 / 3 + 0.5) / 3) + assert np.isclose(avg_u[1], (1 / 3 + 0.5 + 1) / 3) + assert np.isclose(avg_u[2], (0.5 + 1 / 3) / 2) + def test_sample_weight_absolute_return(sample_events): """Test sample weighting by absolute return.""" close_index, timestamp, molecule = sample_events - prices = pd.Series( - [10, 11, 12, 13, 12, 11, 10, 11, 12, 13], index=close_index - ) - - weights = sample_weight_absolute_return_meta_labeling( - timestamp, prices, molecule - ) - + prices = pd.Series([10, 11, 12, 13, 12, 11, 10, 11, 12, 13], index=close_index) + + weights = sample_weight_absolute_return_meta_labeling(timestamp, prices, molecule) + assert weights.shape == (3,) - assert np.isclose(weights.sum(), 3.0) # Normalized to N - assert weights.loc['2020-01-01'] > 0 - assert weights.loc['2020-01-03'] > 0 - assert 
weights.loc['2020-01-09'] > 0 + assert np.isclose(weights.sum(), 3.0) # Normalized to N + assert weights.loc["2020-01-01"] > 0 + assert weights.loc["2020-01-03"] > 0 + assert weights.loc["2020-01-09"] > 0 + def test_calculate_time_decay(): """Test time decay weighting.""" weights = pd.Series(1.0, index=pd.date_range("2020-01-01", periods=10)) - + # Test 1: No decay decayed_1 = calculate_time_decay(weights, clf_last_weight=1.0) assert np.allclose(decayed_1, 1.0) - + # Test 2: Linear decay to 0 decayed_0 = calculate_time_decay(weights, clf_last_weight=0.0) # cumsum = [1, 2, ..., 10] @@ -119,11 +116,11 @@ def test_calculate_time_decay(): # new_weights = 0 + 0.1 * [1, 2, ..., 10] = [0.1, 0.2, ..., 1.0] expected_0 = np.arange(1, 11) * 0.1 assert np.allclose(decayed_0, expected_0) - + # Test 3: Linear decay to 0.5 decayed_05 = calculate_time_decay(weights, clf_last_weight=0.5) # slope = (1-0.5) / 10 = 0.05 # const = 1 - 0.05 * 10 = 0.5 # new_weights = 0.5 + 0.05 * [1, ..., 10] = [0.55, 0.6, ..., 1.0] expected_05 = 0.5 + 0.05 * np.arange(1, 11) - assert np.allclose(decayed_05, expected_05) \ No newline at end of file + assert np.allclose(decayed_05, expected_05) diff --git a/test/ensemble/test_bagging_classifier_accuracy.py b/test/ensemble/test_bagging_classifier_accuracy.py index 963cc79..b309d51 100644 --- a/test/ensemble/test_bagging_classifier_accuracy.py +++ b/test/ensemble/test_bagging_classifier_accuracy.py @@ -6,6 +6,7 @@ import numpy as np from RiskLabAI.ensemble.bagging_classifier_accuracy import bagging_classifier_accuracy + def test_bagging_accuracy(): """ Test the bagging classifier accuracy. @@ -23,10 +24,10 @@ def test_bagging_accuracy(): # 4. If N=1, accuracy should be p assert np.isclose(bagging_classifier_accuracy(N=1, p=0.7), 0.7) - + # 5. 
Test with N=3, p=0.7 # P(X=2) + P(X=3) # P(X=2) = comb(3, 2) * (0.7**2) * (0.3**1) = 3 * 0.49 * 0.3 = 0.441 # P(X=3) = comb(3, 3) * (0.7**3) * (0.3**0) = 1 * 0.343 * 1 = 0.343 # Total = 0.441 + 0.343 = 0.784 - assert np.isclose(bagging_classifier_accuracy(N=3, p=0.7), 0.784) \ No newline at end of file + assert np.isclose(bagging_classifier_accuracy(N=3, p=0.7), 0.784) diff --git a/test/features/entropy_features/test_entropy.py b/test/features/entropy_features/test_entropy.py index 40a606a..18b3468 100644 --- a/test/features/entropy_features/test_entropy.py +++ b/test/features/entropy_features/test_entropy.py @@ -15,6 +15,7 @@ MSG_MED = "ABABABABAB" MSG_HIGH = "ABCDEFGHIJ" + # --- Shannon Tests --- def test_shannon_entropy(): assert np.isclose(shannon_entropy(MSG_LOW), 0.0) @@ -22,6 +23,7 @@ def test_shannon_entropy(): assert np.isclose(shannon_entropy(MSG_HIGH), np.log2(10)) assert np.isclose(shannon_entropy(""), 0.0) + # --- Lempel-Ziv Tests --- def test_lempel_ziv_entropy(): # lib = {"A", "AA", "AAA", "AAAA"} -> len=4 @@ -31,6 +33,7 @@ def test_lempel_ziv_entropy(): # lib = {"A", "B", "C", ..., "J"} -> len=10 assert np.isclose(lempel_ziv_entropy(MSG_HIGH), 10 / 10.0) + # --- PMF Tests --- def test_probability_mass_function(): pmf = probability_mass_function(MSG_LOW, approximate_word_length=1) @@ -42,18 +45,20 @@ def test_probability_mass_function(): assert np.isclose(pmf_2["AB"], 5 / 9.0) assert np.isclose(pmf_2["BA"], 4 / 9.0) + # --- Plug-in Tests --- def test_plug_in_estimator(): # word_len=1 -> same as shannon assert np.isclose(plug_in_entropy_estimator(MSG_LOW, 1), 0.0) assert np.isclose(plug_in_entropy_estimator(MSG_MED, 1), 1.0) - + # word_len=2 # H = -( (5/9)*log2(5/9) + (4/9)*log2(4/9) ) = 0.991 # H_norm = H / 2 - h = -( (5/9)*np.log2(5/9) + (4/9)*np.log2(4/9) ) + h = -((5 / 9) * np.log2(5 / 9) + (4 / 9) * np.log2(4 / 9)) assert np.isclose(plug_in_entropy_estimator(MSG_MED, 2), h / 2.0) + # --- Kontoyiannis Tests --- def test_kontoyiannis_entropy(): # 
Expanding window @@ -65,10 +70,9 @@ def test_kontoyiannis_entropy(): # h = (0.5 + 0.528 + 1.0) / 3 = 2.028 / 3 = 0.676 assert np.isclose(kontoyiannis_entropy("AAAAA"), 0.62055, atol=1e-3) - # Rolling window # window=3. points=range(3, 5) -> [3, 4] # i=3: n=3. L_i(message, 3, 3) -> "AA" in "AAA" -> L=3. sum += log2(3)/3 = 0.528 # i=4: n=3. L_i(message, 4, 3) -> "A" in "AAA" -> L=2. sum += log2(3)/2 = 0.792 # h = (0.528 + 0.792) / 2 = 0.66 - assert np.isclose(kontoyiannis_entropy("AAAAA", window=3), 0.660, atol=1e-3) \ No newline at end of file + assert np.isclose(kontoyiannis_entropy("AAAAA", window=3), 0.660, atol=1e-3) diff --git a/test/features/feature_importance/test_feature_importance.py b/test/features/feature_importance/test_feature_importance.py index 9fb7cc8..559bc61 100644 --- a/test/features/feature_importance/test_feature_importance.py +++ b/test/features/feature_importance/test_feature_importance.py @@ -6,7 +6,10 @@ import pandas as pd import numpy as np from sklearn.ensemble import RandomForestClassifier -from RiskLabAI.features.feature_importance.feature_importance_controller import FeatureImportanceController +from RiskLabAI.features.feature_importance.feature_importance_controller import ( + FeatureImportanceController, +) + @pytest.fixture def mock_data(): @@ -15,61 +18,64 @@ def mock_data(): N = 100 P = 10 - X = pd.DataFrame(rng.normal(0, 1, size=(N, P)), - columns=[f'feat_{i}' for i in range(P)]) + X = pd.DataFrame( + rng.normal(0, 1, size=(N, P)), columns=[f"feat_{i}" for i in range(P)] + ) # Make features 0, 1, 2 correlated - X['feat_1'] = X['feat_0'] + rng.normal(0, 0.1, N) - X['feat_2'] = X['feat_0'] + rng.normal(0, 0.1, N) - + X["feat_1"] = X["feat_0"] + rng.normal(0, 0.1, N) + X["feat_2"] = X["feat_0"] + rng.normal(0, 0.1, N) + # Target depends on feat_0 (and its cluster) and feat_5 - y = pd.Series(np.where(X['feat_0'] + X['feat_5'] > 0, 1, 0)) - + y = pd.Series(np.where(X["feat_0"] + X["feat_5"] > 0, 1, 0)) + clusters = { - 'cluster_0': 
['feat_0', 'feat_1', 'feat_2'], - 'cluster_1': ['feat_3', 'feat_4'], - 'cluster_2': ['feat_5', 'feat_6', 'feat_7'], - 'cluster_3': ['feat_8', 'feat_9'], + "cluster_0": ["feat_0", "feat_1", "feat_2"], + "cluster_1": ["feat_3", "feat_4"], + "cluster_2": ["feat_5", "feat_6", "feat_7"], + "cluster_3": ["feat_8", "feat_9"], } - + classifier = RandomForestClassifier(n_estimators=10, random_state=42) - + return X, y, classifier, clusters + def test_controller_mdi(mock_data): """Test MDI via the controller.""" X, y, classifier, _ = mock_data - + controller = FeatureImportanceController("MDI", classifier=classifier) importance = controller.calculate_importance(X, y) - + assert isinstance(importance, pd.DataFrame) assert importance.shape == (10, 2) - assert 'Mean' in importance.columns - assert np.isclose(importance['Mean'].sum(), 1.0) - assert importance['Mean'].idxmax() in ['feat_0', 'feat_1', 'feat_2', 'feat_5'] + assert "Mean" in importance.columns + assert np.isclose(importance["Mean"].sum(), 1.0) + assert importance["Mean"].idxmax() in ["feat_0", "feat_1", "feat_2", "feat_5"] def test_controller_clustered_mdi(mock_data): """Test Clustered MDI via the controller.""" X, y, classifier, clusters = mock_data - + controller = FeatureImportanceController( "ClusteredMDI", classifier=classifier, clusters=clusters ) importance = controller.calculate_importance(X, y) - + assert isinstance(importance, pd.DataFrame) - assert importance.shape == (4, 2) # 4 clusters - assert 'C_cluster_0' in importance.index - assert np.isclose(importance['Mean'].sum(), 1.0) + assert importance.shape == (4, 2) # 4 clusters + assert "C_cluster_0" in importance.index + assert np.isclose(importance["Mean"].sum(), 1.0) # cluster_0 should have the highest importance - assert importance['Mean'].idxmax() in ['C_cluster_0', 'C_cluster_2'] + assert importance["Mean"].idxmax() in ["C_cluster_0", "C_cluster_2"] + def test_controller_mda(mock_data): """Test MDA via the controller.""" X, y, classifier, _ = 
mock_data - + controller = FeatureImportanceController( "MDA", classifier=classifier, n_splits=3, random_state=42 ) @@ -79,38 +85,42 @@ def test_controller_mda(mock_data): assert isinstance(importance, pd.DataFrame) assert importance.shape == (10, 2) - top_2 = set(importance['Mean'].nlargest(2).index) - assert 'feat_5' in top_2 - + top_2 = set(importance["Mean"].nlargest(2).index) + assert "feat_5" in top_2 + # --- THIS IS THE FIX --- # OLD (INCORRECT) ASSERTION: # assert top_2.intersection({'feat_0', 'feat_1', 'feat_2'}) - + # NEW (CORRECT) ASSERTION: # We assert the redundant features are NOT important in standard MDA. - assert not top_2.intersection({'feat_0', 'feat_1', 'feat_2'}) + assert not top_2.intersection({"feat_0", "feat_1", "feat_2"}) def test_controller_clustered_mda(mock_data): """Test Clustered MDA via the controller.""" X, y, classifier, clusters = mock_data - + controller = FeatureImportanceController( - "ClusteredMDA", classifier=classifier, clusters=clusters, n_splits=3, random_state=42 + "ClusteredMDA", + classifier=classifier, + clusters=clusters, + n_splits=3, + random_state=42, ) importance = controller.calculate_importance(X, y) - + assert isinstance(importance, pd.DataFrame) assert importance.shape == (4, 2) # cluster_0 and cluster_2 should be most important - top_2 = importance['Mean'].nlargest(2).index - assert 'C_cluster_0' in top_2 - assert 'C_cluster_2' in top_2 + top_2 = importance["Mean"].nlargest(2).index + assert "C_cluster_0" in top_2 + assert "C_cluster_2" in top_2 + def test_controller_sfi(mock_data): """Test SFI via the controller.""" X, y, classifier, _ = mock_data - controller = FeatureImportanceController( "SFI", classifier=classifier, n_splits=3, scoring="accuracy" @@ -118,11 +128,11 @@ def test_controller_sfi(mock_data): importance = controller.calculate_importance(X, y) assert isinstance(importance, pd.DataFrame) - assert importance.shape == (10, 2) # <-- This is correct for SFI + assert importance.shape == (10, 2) # <-- 
This is correct for SFI - top_5 = set(importance['Mean'].nlargest(5).index) + top_5 = set(importance["Mean"].nlargest(5).index) - assert 'feat_0' in top_5 - assert 'feat_1' in top_5 - assert 'feat_2' in top_5 - assert 'feat_5' in top_5 \ No newline at end of file + assert "feat_0" in top_5 + assert "feat_1" in top_5 + assert "feat_2" in top_5 + assert "feat_5" in top_5 diff --git a/test/features/feature_importance/test_generate_synthetic_data.py b/test/features/feature_importance/test_generate_synthetic_data.py index f552676..281322d 100644 --- a/test/features/feature_importance/test_generate_synthetic_data.py +++ b/test/features/feature_importance/test_generate_synthetic_data.py @@ -4,7 +4,10 @@ import pytest import pandas as pd -from RiskLabAI.features.feature_importance.generate_synthetic_data import get_test_dataset +from RiskLabAI.features.feature_importance.generate_synthetic_data import ( + get_test_dataset, +) + def test_get_test_dataset(): """Test the synthetic data generation.""" @@ -12,25 +15,28 @@ def test_get_test_dataset(): n_informative = 10 n_redundant = 20 n_samples = 100 - + X, y = get_test_dataset( n_features=n_features, n_informative=n_informative, n_redundant=n_redundant, n_samples=n_samples, random_state=42, - sigma_std=0.1 + sigma_std=0.1, ) # Check shapes assert X.shape == (n_samples, n_features) assert y.shape == (n_samples,) - + # Check column names - assert len([col for col in X.columns if col.startswith('I_')]) == n_informative - assert len([col for col in X.columns if col.startswith('R_')]) == n_redundant - assert len([col for col in X.columns if col.startswith('N_')]) == n_features - n_informative - n_redundant - + assert len([col for col in X.columns if col.startswith("I_")]) == n_informative + assert len([col for col in X.columns if col.startswith("R_")]) == n_redundant + assert ( + len([col for col in X.columns if col.startswith("N_")]) + == n_features - n_informative - n_redundant + ) + # Check for determinism X2, y2 = 
get_test_dataset( n_features=n_features, @@ -38,7 +44,7 @@ def test_get_test_dataset(): n_redundant=n_redundant, n_samples=n_samples, random_state=42, - sigma_std=0.1 + sigma_std=0.1, ) pd.testing.assert_frame_equal(X, X2) - pd.testing.assert_series_equal(y, y2) \ No newline at end of file + pd.testing.assert_series_equal(y, y2) diff --git a/test/features/feature_importance/test_orthogonal_features.py b/test/features/feature_importance/test_orthogonal_features.py index 8df5ad6..8a61b8b 100644 --- a/test/features/feature_importance/test_orthogonal_features.py +++ b/test/features/feature_importance/test_orthogonal_features.py @@ -5,41 +5,44 @@ import pytest import pandas as pd import numpy as np -from RiskLabAI.features.feature_importance.orthogonal_features import orthogonal_features +from RiskLabAI.features.feature_importance.orthogonal_features import ( + orthogonal_features, +) + @pytest.fixture def mock_features(): """Mock correlated features.""" N = 100 - X = pd.DataFrame(np.random.normal(0, 1, size=(N, 3)), - columns=['A', 'B', 'C']) + X = pd.DataFrame(np.random.normal(0, 1, size=(N, 3)), columns=["A", "B", "C"]) # Make B and C highly correlated with A - X['B'] = X['A'] + np.random.normal(0, 0.01, N) - X['C'] = X['A'] + np.random.normal(0, 0.01, N) + X["B"] = X["A"] + np.random.normal(0, 0.01, N) + X["C"] = X["A"] + np.random.normal(0, 0.01, N) # Add an independent feature - X['D'] = np.random.normal(0, 1, size=(N,)) + X["D"] = np.random.normal(0, 1, size=(N,)) return X + def test_orthogonal_features(mock_features): """Test orthogonal feature generation.""" X = mock_features - + ortho_X, eigen_df = orthogonal_features(X, variance_threshold=0.95) - + # Check shapes # With 3 highly correlated features + 1 independent, we expect 2 main PCs assert ortho_X.shape[1] == 2 assert eigen_df.shape[0] == 2 - + # Check column names - assert 'PC_1' in ortho_X.columns - assert 'PC_2' in ortho_X.columns - + assert "PC_1" in ortho_X.columns + assert "PC_2" in ortho_X.columns + 
# Check cumulative variance - assert eigen_df.iloc[0]['CumulativeVariance'] < 0.95 - assert eigen_df.iloc[1]['CumulativeVariance'] >= 0.95 - + assert eigen_df.iloc[0]["CumulativeVariance"] < 0.95 + assert eigen_df.iloc[1]["CumulativeVariance"] >= 0.95 + # Check for orthogonality corr_matrix = ortho_X.corr() # Off-diagonal should be near 0 - assert np.isclose(corr_matrix.loc['PC_1', 'PC_2'], 0.0, atol=1e-10) \ No newline at end of file + assert np.isclose(corr_matrix.loc["PC_1", "PC_2"], 0.0, atol=1e-10) diff --git a/test/features/feature_importance/test_weighted_tau.py b/test/features/feature_importance/test_weighted_tau.py index b3992a0..f6e9a53 100644 --- a/test/features/feature_importance/test_weighted_tau.py +++ b/test/features/feature_importance/test_weighted_tau.py @@ -6,6 +6,7 @@ import numpy as np from RiskLabAI.features.feature_importance.weighted_tau import calculate_weighted_tau + def test_weighted_tau(): """Test the weighted tau calculation.""" # Perfect positive correlation @@ -13,13 +14,13 @@ def test_weighted_tau(): ranks = np.array([1, 2, 3]) tau_pos = calculate_weighted_tau(imp, ranks) assert np.isclose(tau_pos, 1.0) - + # Perfect negative correlation imp_neg = np.array([0.1, 0.3, 0.5]) tau_neg = calculate_weighted_tau(imp_neg, ranks) assert np.isclose(tau_neg, -1.0) - + # Mixed correlation imp_mix = np.array([0.5, 0.1, 0.3]) tau_mix = calculate_weighted_tau(imp_mix, ranks) - assert -1.0 < tau_mix < 1.0 \ No newline at end of file + assert -1.0 < tau_mix < 1.0 diff --git a/test/features/microstructural_features/test_microstructure.py b/test/features/microstructural_features/test_microstructure.py index 182257d..bbdbf7b 100644 --- a/test/features/microstructural_features/test_microstructure.py +++ b/test/features/microstructural_features/test_microstructure.py @@ -16,6 +16,7 @@ bekker_parkinson_volatility_estimates, ) + @pytest.fixture def sample_hl_prices(): """Fixture for high/low price series.""" @@ -24,11 +25,12 @@ def sample_hl_prices(): low = 
pd.Series([9.9, 10.1, 10.0, 10.2, 10.3]) return high, low + def test_corwin_schultz(sample_hl_prices): """Test the Corwin-Schultz estimator end-to-end.""" high, low = sample_hl_prices window = 2 - + # 1. Beta beta = beta_estimates(high, low, window) # log(H/L)^2 @@ -55,18 +57,19 @@ def test_corwin_schultz(sample_hl_prices): # 3. Alpha alpha = alpha_estimates(beta, gamma) - assert alpha.iloc[2] >= 0 # Should be floored at 0 - + assert alpha.iloc[2] >= 0 # Should be floored at 0 + # 4. Spread spread = corwin_schultz_estimator(high, low, window) assert not spread.isna().all() assert spread.iloc[-1] >= 0 + def test_bekker_parkinson(sample_hl_prices): """Test the Bekker-Parkinson estimator.""" high, low = sample_hl_prices window = 2 - + vol = bekker_parkinson_volatility_estimates(high, low, window) assert not vol.isna().all() - assert vol.iloc[-1] >= 0 \ No newline at end of file + assert vol.iloc[-1] >= 0 diff --git a/test/features/structural_breaks/test_structural_breaks.py b/test/features/structural_breaks/test_structural_breaks.py index b1fb69b..ec457a8 100644 --- a/test/features/structural_breaks/test_structural_breaks.py +++ b/test/features/structural_breaks/test_structural_breaks.py @@ -10,36 +10,40 @@ lag_dataframe, prepare_data, compute_beta, - get_bsadf_statistic + get_bsadf_statistic, ) + @pytest.fixture def sample_series(): """A simple series for testing.""" - return pd.DataFrame({'price': [1.0, 1.2, 1.1, 1.3, 1.5, 1.4]}) + return pd.DataFrame({"price": [1.0, 1.2, 1.1, 1.3, 1.5, 1.4]}) + @pytest.fixture def random_walk_series(): """A non-stationary random walk.""" rng = np.random.default_rng(42) log_price = np.log(100 + rng.normal(0, 1, 100).cumsum()) - return pd.DataFrame({'log_price': log_price}) + return pd.DataFrame({"log_price": log_price}) + def test_lag_dataframe(sample_series): """Test the lag_dataframe function.""" lags = 2 df = lag_dataframe(sample_series, lags) - + # Should create columns for lags 0, 1, 2 - assert 'price_0' in df.columns - assert 
'price_1' in df.columns - assert 'price_2' in df.columns - + assert "price_0" in df.columns + assert "price_1" in df.columns + assert "price_2" in df.columns + # Check values - assert np.isclose(df['price_0'].iloc[2], 1.1) - assert np.isclose(df['price_1'].iloc[2], 1.2) - assert np.isclose(df['price_2'].iloc[2], 1.0) - assert pd.isna(df['price_2'].iloc[1]) + assert np.isclose(df["price_0"].iloc[2], 1.1) + assert np.isclose(df["price_1"].iloc[2], 1.2) + assert np.isclose(df["price_2"].iloc[2], 1.0) + assert pd.isna(df["price_2"].iloc[1]) + def test_prepare_data(sample_series): """Test the prepare_data function.""" @@ -61,10 +65,10 @@ def test_prepare_data(sample_series): # [ 1.4, 0.2 ] # y = diff (index 2-5) # [ -0.1, 0.2, 0.2, -0.1 ] - + # FIX 1: Pass the Series, not the DataFrame - y, x = prepare_data(sample_series['price'], constant='c', lags=lags) - + y, x = prepare_data(sample_series["price"], constant="c", lags=lags) + assert y.shape == (4, 1) # The expected x calculation in your test file comments was slightly off. # The lagged level (x_df) should be [nan, 1.0, 1.2, 1.1, 1.3, 1.5] @@ -75,19 +79,17 @@ def test_prepare_data(sample_series): # 0.2 [1.1, -0.1, 1.0] # 0.2 [1.3, 0.2, 1.0] # -0.1 [1.5, 0.2, 1.0] - assert x.shape == (4, 3) # level, lag 1 diff, constant - + assert x.shape == (4, 3) # level, lag 1 diff, constant + expected_y = np.array([[-0.1], [0.2], [0.2], [-0.1]]) - expected_x = np.array([ - [1.2, 0.2, 1.0], - [1.1, -0.1, 1.0], - [1.3, 0.2, 1.0], - [1.5, 0.2, 1.0] - ]) - + expected_x = np.array( + [[1.2, 0.2, 1.0], [1.1, -0.1, 1.0], [1.3, 0.2, 1.0], [1.5, 0.2, 1.0]] + ) + assert np.allclose(y, expected_y) assert np.allclose(x, expected_x) + def test_compute_beta_bugfix(): """ Test compute_beta against statsmodels to verify the bugfix. @@ -95,34 +97,35 @@ def test_compute_beta_bugfix(): # 1. 
Prepare data y_vec = np.array([1, 2, 3, 4, 5], dtype=float) x_vec = np.array([1.1, 1.9, 3.0, 4.1, 4.9], dtype=float) - x_mat = sm.add_constant(x_vec) # [const, x1] + x_mat = sm.add_constant(x_vec) # [const, x1] y_vec = y_vec.reshape(-1, 1) - + # 2. Get correct result from statsmodels model = sm.OLS(y_vec, x_mat).fit() sm_betas = model.params.reshape(-1, 1) sm_vcov = model.cov_params() - + # 3. Get result from our function my_betas, my_vcov = compute_beta(y_vec, x_mat) - + # 4. Compare assert np.allclose(my_betas, sm_betas) assert np.allclose(my_vcov, sm_vcov) - + + def test_adf_function(random_walk_series): """Test the main ADF loop.""" # FIX 2: Pass the Series, not the DataFrame results = get_bsadf_statistic( - log_price=random_walk_series['log_price'], + log_price=random_walk_series["log_price"], min_sample_length=20, - constant='c', - lags=1 + constant="c", + lags=1, ) - - assert 'Time' in results - - assert 'bsadf' in results - assert isinstance(results['Time'], int) - assert isinstance(results['bsadf'], float) - assert np.isfinite(results['bsadf']) \ No newline at end of file + + assert "Time" in results + + assert "bsadf" in results + assert isinstance(results["Time"], int) + assert isinstance(results["bsadf"], float) + assert np.isfinite(results["bsadf"]) diff --git a/test/hpc/test_hpc.py b/test/hpc/test_hpc.py index e0f4a6f..338bcdf 100644 --- a/test/hpc/test_hpc.py +++ b/test/hpc/test_hpc.py @@ -10,33 +10,38 @@ nested_partitions, process_jobs_sequential, process_jobs, - mp_pandas_obj + mp_pandas_obj, ) # --- Test Functions for Parallelism --- + def _test_func_sum(x: int, y: int) -> int: """Simple function for testing.""" return x + y + def _test_func_pandas(molecule: pd.Index, series: pd.Series) -> pd.Series: """Test function for mp_pandas_obj.""" return series.loc[molecule] * 2 + # --- Tests --- + def test_linear_partitions(): """Test linear partitioning.""" # 100 items, 4 threads parts = linear_partitions(num_atoms=100, num_threads=4) expected = 
np.array([0, 25, 50, 75, 100]) np.testing.assert_array_equal(parts, expected) - + # 10 atoms, 12 threads (caps at num_atoms) parts_capped = linear_partitions(num_atoms=10, num_threads=12) - assert len(parts_capped) == 11 # 10 partitions + 1 + assert len(parts_capped) == 11 # 10 partitions + 1 assert parts_capped[-1] == 10 + def test_nested_partitions(): """Test nested partitioning.""" parts = nested_partitions(num_atoms=100, num_threads=4) @@ -46,35 +51,33 @@ def test_nested_partitions(): assert parts[-1] == 100 assert parts[1] - parts[0] != parts[2] - parts[1] + def test_process_jobs_sequential(): """Test the sequential job processor.""" jobs = [ - {'func': _test_func_sum, 'x': 1, 'y': 2}, # 3 - {'func': _test_func_sum, 'x': 5, 'y': 5}, # 10 + {"func": _test_func_sum, "x": 1, "y": 2}, # 3 + {"func": _test_func_sum, "x": 5, "y": 5}, # 10 ] results = process_jobs_sequential(jobs) assert results == [3, 10] + def test_process_jobs_parallel(): """Test the parallel job processor.""" jobs = [ - {'func': _test_func_sum, 'x': 1, 'y': 2}, # 3 - {'func': _test_func_sum, 'x': 5, 'y': 5}, # 10 + {"func": _test_func_sum, "x": 1, "y": 2}, # 3 + {"func": _test_func_sum, "x": 5, "y": 5}, # 10 ] results = process_jobs(jobs, num_threads=2) assert sorted(results) == [3, 10] + def test_mp_pandas_obj(): """Test the mp_pandas_obj wrapper.""" - series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e']) - pd_obj = ('molecule', series.index) - - result = mp_pandas_obj( - _test_func_pandas, - pd_obj, - num_threads=2, - series=series - ) - + series = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) + pd_obj = ("molecule", series.index) + + result = mp_pandas_obj(_test_func_pandas, pd_obj, num_threads=2, series=series) + expected = series * 2 - pd.testing.assert_series_equal(result, expected) \ No newline at end of file + pd.testing.assert_series_equal(result, expected) diff --git a/test/optimization/test_hedging.py b/test/optimization/test_hedging.py index 
fb04162..f5bb6ed 100644 --- a/test/optimization/test_hedging.py +++ b/test/optimization/test_hedging.py @@ -6,6 +6,7 @@ import pytest from RiskLabAI.optimization.hedging import pca_weights + @pytest.fixture def sample_cov_matrix(): """A simple 2x2 covariance matrix.""" @@ -15,11 +16,12 @@ def sample_cov_matrix(): # cov(0,1) = 0.5 * 1 * 2 = 1 return np.array([[1.0, 1.0], [1.0, 4.0]]) + def test_pca_weights_min_variance(sample_cov_matrix): """Test PCA weights for minimum variance (default).""" cov = sample_cov_matrix weights = pca_weights(cov, risk_distribution=None, risk_target=1.0) - + # Eigenvectors of [[1, 1], [1, 4]] # (solve (1-L)(4-L) - 1 = 0 => L^2 - 5L + 3 = 0) # L = (5 +/- sqrt(25-12))/2 = (5 +/- sqrt(13))/2 @@ -28,14 +30,14 @@ def test_pca_weights_min_variance(sample_cov_matrix): # [v1, v2] # v1 (L=4.302): [1, 3.302] -> norm -> [0.293, 0.956] # v2 (L=0.697): [1, -0.302] -> norm -> [0.956, -0.293] - + # Risk distribution = [0, 1] (all on L2) # Loads = 1.0 * [0, 1] / [4.302, 0.697]**0.5 = [0, 1.196] # Weights = V * Loads' # w0 = v1[0]*0 + v2[0]*1.196 = 0.956 * 1.196 = 1.143 # w1 = v1[1]*0 + v2[1]*1.196 = -0.293 * 1.196 = -0.350 # (Note: This is just one solution, sign can be flipped) - + # The important part: the resulting portfolio variance # w = [1.143, -0.350] # Var = w' * C * w = [1.143, -0.350] * [[1,1],[1,4]] * [1.143, -0.350]' @@ -43,13 +45,13 @@ def test_pca_weights_min_variance(sample_cov_matrix): # = [1.143, -0.350] * [0.793, -0.257]' # = 1.143*0.793 + (-0.350)*(-0.257) = 0.906 + 0.09 = 0.996 # This variance (0.996) should be 1.0 (the risk target) - + port_var = weights.T @ cov @ weights assert np.isclose(port_var, 1.0) - + # Check that it's not the max variance risk_dist_max = np.array([1.0, 0.0]) weights_max = pca_weights(cov, risk_dist_max, risk_target=1.0) port_var_max = weights_max.T @ cov @ weights_max assert np.isclose(port_var_max, 1.0) - assert not np.allclose(weights, weights_max) \ No newline at end of file + assert not 
np.allclose(weights, weights_max) diff --git a/test/optimization/test_hrp.py b/test/optimization/test_hrp.py index fb9a4f3..b9bf5f5 100644 --- a/test/optimization/test_hrp.py +++ b/test/optimization/test_hrp.py @@ -9,10 +9,11 @@ inverse_variance_weights, cluster_variance, quasi_diagonal, - hrp + hrp, ) import scipy.cluster.hierarchy as sch + @pytest.fixture def mock_cov_matrix(): """ @@ -21,13 +22,16 @@ def mock_cov_matrix(): Assets ['C', 'D'] are correlated. Blocks are uncorrelated. """ - cov = np.array([ - [1.0, 0.8, 0.0, 0.0], - [0.8, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.5], - [0.0, 0.0, 0.5, 1.0] - ]) - return pd.DataFrame(cov, columns=['A', 'B', 'C', 'D'], index=['A', 'B', 'C', 'D']) + cov = np.array( + [ + [1.0, 0.8, 0.0, 0.0], + [0.8, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.5], + [0.0, 0.0, 0.5, 1.0], + ] + ) + return pd.DataFrame(cov, columns=["A", "B", "C", "D"], index=["A", "B", "C", "D"]) + def test_inverse_variance_weights(mock_cov_matrix): """Test inverse variance weights.""" @@ -38,6 +42,7 @@ def test_inverse_variance_weights(mock_cov_matrix): # Weights = [0.25, 0.25, 0.25, 0.25] assert np.allclose(weights, [0.25, 0.25, 0.25, 0.25]) + def test_cluster_variance(mock_cov_matrix): """Test cluster variance calculation.""" # Cluster 0 = ['A', 'B'] @@ -45,7 +50,7 @@ def test_cluster_variance(mock_cov_matrix): # IVP weights = [0.5, 0.5] # Var = [0.5, 0.5] @ [[1, 0.8], [0.8, 1]] @ [0.5, 0.5]' # = [0.5, 0.5] @ [0.9, 0.9]' = 0.45 + 0.45 = 0.9 - var_0 = cluster_variance(mock_cov_matrix, ['A', 'B']) + var_0 = cluster_variance(mock_cov_matrix, ["A", "B"]) assert np.isclose(var_0, 0.9) # Cluster 1 = ['C', 'D'] @@ -53,17 +58,18 @@ def test_cluster_variance(mock_cov_matrix): # IVP weights = [0.5, 0.5] # Var = [0.5, 0.5] @ [[1, 0.5], [0.5, 1]] @ [0.5, 0.5]' # = [0.5, 0.5] @ [0.75, 0.75]' = 0.375 + 0.375 = 0.75 - var_1 = cluster_variance(mock_cov_matrix, ['C', 'D']) + var_1 = cluster_variance(mock_cov_matrix, ["C", "D"]) assert np.isclose(var_1, 0.75) + def 
test_hrp(mock_cov_matrix): """Test the full HRP algorithm.""" weights = hrp(mock_cov_matrix, mock_cov_matrix) - + assert isinstance(weights, pd.Series) assert weights.shape == (4,) assert np.isclose(weights.sum(), 1.0) - + # Test allocation # Var(A,B) = 0.9, Var(C,D) = 0.75 # Alpha = 1 - 0.9 / (0.9 + 0.75) = 1 - 0.545 = 0.4545 @@ -74,13 +80,12 @@ def test_hrp(mock_cov_matrix): # Bisection of (C,D): Var(C)=1, Var(D)=1. Alpha=0.5 # W(C) = 0.5454 * 0.5 = 0.272 # W(D) = 0.5454 * 0.5 = 0.272 - + # Note: The 'sorted_items' in the test fixture might be ['A', 'B', 'C', 'D'] # The code's `hrp` function will sort them by index name at the end. expected_weights = pd.Series( - [0.22727, 0.22727, 0.27272, 0.27272], - index=['A', 'B', 'C', 'D'] + [0.22727, 0.22727, 0.27272, 0.27272], index=["A", "B", "C", "D"] ) pd.testing.assert_series_equal( weights, expected_weights, atol=1e-5, check_names=False - ) \ No newline at end of file + ) diff --git a/test/optimization/test_hyper_parameter_tuning.py b/test/optimization/test_hyper_parameter_tuning.py index be250d9..b6a3599 100644 --- a/test/optimization/test_hyper_parameter_tuning.py +++ b/test/optimization/test_hyper_parameter_tuning.py @@ -9,88 +9,93 @@ from sklearn.pipeline import Pipeline from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import StandardScaler -from sklearn.ensemble import BaggingClassifier +from sklearn.ensemble import BaggingClassifier from RiskLabAI.optimization.hyper_parameter_tuning import MyPipeline, clf_hyper_fit + @pytest.fixture def mock_data(): """Mock data for tuning.""" - X = pd.DataFrame(np.random.randn(100, 3), columns=['A', 'B', 'C']) + X = pd.DataFrame(np.random.randn(100, 3), columns=["A", "B", "C"]) y = pd.Series(np.random.randint(0, 2, 100)) times = pd.Series( - pd.date_range('2020-01-01', periods=100), - index=pd.date_range('2020-01-01', periods=100) + pd.date_range("2020-01-01", periods=100), + index=pd.date_range("2020-01-01", periods=100), ) return X, y, times + 
def test_my_pipeline_fit_sample_weight(mock_data): """Test that MyPipeline correctly passes sample_weight.""" X, y, _ = mock_data sample_weight = np.random.rand(100) - + # Mock classifier to capture fit_params class MockLR(LogisticRegression): def fit(self, X, y, **kwargs): self.fit_kwargs = kwargs super().fit(X, y) - pipe = MyPipeline([ - ('scaler', StandardScaler()), - ('clf', MockLR()) - ]) - + pipe = MyPipeline([("scaler", StandardScaler()), ("clf", MockLR())]) + pipe.fit(X, y, sample_weight=sample_weight) - - assert 'sample_weight' in pipe.named_steps['clf'].fit_kwargs + + assert "sample_weight" in pipe.named_steps["clf"].fit_kwargs assert np.array_equal( - pipe.named_steps['clf'].fit_kwargs['sample_weight'], - sample_weight + pipe.named_steps["clf"].fit_kwargs["sample_weight"], sample_weight ) + def test_clf_hyper_fit_gridsearch(mock_data): """Test the grid search functionality.""" X, y, times = mock_data - - pipe_clf = MyPipeline([ - ('scaler', StandardScaler()), - ('clf', LogisticRegression()) - ]) - - param_grid = {'clf__C': [0.1, 1.0]} - + + pipe_clf = MyPipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())]) + + param_grid = {"clf__C": [0.1, 1.0]} + # Use 'kfold' for simplicity, as purgedkfold requires 'times' - validator_params = {'n_splits': 3} - + validator_params = {"n_splits": 3} + best_model = clf_hyper_fit( - X, y, times, pipe_clf, param_grid, - validator_type='kfold', # Use standard KFold for this test + X, + y, + times, + pipe_clf, + param_grid, + validator_type="kfold", # Use standard KFold for this test validator_params=validator_params, - bagging=[0, 0, 0] # No bagging + bagging=[0, 0, 0], # No bagging ) - + assert isinstance(best_model, Pipeline) - assert 'clf' in best_model.named_steps - assert best_model.named_steps['clf'].C in [0.1, 1.0] + assert "clf" in best_model.named_steps + assert best_model.named_steps["clf"].C in [0.1, 1.0] + def test_clf_hyper_fit_bagging(mock_data): """Test the bagging functionality.""" X, y, 
times = mock_data - - pipe_clf = MyPipeline([('clf', LogisticRegression())]) - param_grid = {'clf__C': [1.0]} - validator_params = {'n_splits': 3} - + + pipe_clf = MyPipeline([("clf", LogisticRegression())]) + param_grid = {"clf__C": [1.0]} + validator_params = {"n_splits": 3} + # Bagging: 5 estimators, 50% samples, 100% features - bagging_params = [5, 0.5, 1.0] - + bagging_params = [5, 0.5, 1.0] + bagged_model = clf_hyper_fit( - X, y, times, pipe_clf, param_grid, - validator_type='kfold', + X, + y, + times, + pipe_clf, + param_grid, + validator_type="kfold", validator_params=validator_params, - bagging=bagging_params + bagging=bagging_params, ) - + assert isinstance(bagged_model, Pipeline) - assert 'bag' in bagged_model.named_steps - assert isinstance(bagged_model.named_steps['bag'], BaggingClassifier) - assert bagged_model.named_steps['bag'].n_estimators == 5 \ No newline at end of file + assert "bag" in bagged_model.named_steps + assert isinstance(bagged_model.named_steps["bag"], BaggingClassifier) + assert bagged_model.named_steps["bag"].n_estimators == 5 diff --git a/test/optimization/test_nco.py b/test/optimization/test_nco.py index effcf8d..86a3325 100644 --- a/test/optimization/test_nco.py +++ b/test/optimization/test_nco.py @@ -5,7 +5,11 @@ import pytest import numpy as np import pandas as pd -from RiskLabAI.optimization.nco import get_optimal_portfolio_weights, get_optimal_portfolio_weights_nco +from RiskLabAI.optimization.nco import ( + get_optimal_portfolio_weights, + get_optimal_portfolio_weights_nco, +) + @pytest.fixture def mock_cov_matrix(): @@ -15,45 +19,50 @@ def mock_cov_matrix(): Assets [2, 3] are correlated. Blocks are uncorrelated. 
""" - cov = np.array([ - [1.0, 0.8, 0.0, 0.0], - [0.8, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.5], - [0.0, 0.0, 0.5, 1.0] - ]) + cov = np.array( + [ + [1.0, 0.8, 0.0, 0.0], + [0.8, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.5], + [0.0, 0.0, 0.5, 1.0], + ] + ) return cov + def test_get_optimal_portfolio_weights_gmv(mock_cov_matrix): """Test the GMV portfolio calculation.""" weights = get_optimal_portfolio_weights(mock_cov_matrix, mu=None) - + assert weights.shape == (4, 1) assert np.isclose(weights.sum(), 1.0) - + # Check that weights are spread assert all(w > 0 for w in weights) + def test_get_optimal_portfolio_weights_mvo(mock_cov_matrix): """Test the MVO portfolio calculation.""" mu = np.array([0.1, 0.2, 0.05, 0.1]).reshape(-1, 1) weights = get_optimal_portfolio_weights(mock_cov_matrix, mu=mu) - + assert weights.shape == (4, 1) assert np.isclose(weights.sum(), 1.0) - + # MVO should overweight asset 1 (highest return) assert weights[1] > weights[0] assert weights[1] > weights[2] -@pytest.mark.filterwarnings("ignore:KMeans is known to have a memory leak on Windows with MKL:UserWarning") + +@pytest.mark.filterwarnings( + "ignore:KMeans is known to have a memory leak on Windows with MKL:UserWarning" +) def test_get_optimal_portfolio_weights_nco(mock_cov_matrix): """Test the NCO algorithm.""" # This will use the dummy clusterer, but it should still run. # To test properly, the RiskLabAI.cluster module must be available. 
- - weights = get_optimal_portfolio_weights_nco( - mock_cov_matrix, number_clusters=2 - ) - + + weights = get_optimal_portfolio_weights_nco(mock_cov_matrix, number_clusters=2) + assert weights.shape == (4, 1) - assert np.isclose(weights.sum(), 1.0) \ No newline at end of file + assert np.isclose(weights.sum(), 1.0) diff --git a/test/pde/test_pde_solver.py b/test/pde/test_pde_solver.py index 1cda9a0..8f2d443 100644 --- a/test/pde/test_pde_solver.py +++ b/test/pde/test_pde_solver.py @@ -3,6 +3,7 @@ """ import pytest + torch = pytest.importorskip("torch") import torch import numpy as np @@ -11,20 +12,23 @@ from RiskLabAI.pde.equation import HJBLQ from RiskLabAI.pde.solver import FBSDESolver + @pytest.fixture def pde_config(): """Fixture for a simple PDE configuration.""" return { - 'dim': 1, - 'total_time': 1.0, - 'num_time_interval': 10, + "dim": 1, + "total_time": 1.0, + "num_time_interval": 10, } + @pytest.fixture def device(): """Fixture to determine device.""" return torch.device("cuda" if torch.cuda.is_available() else "cpu") + def test_pde_solver_smoke_test(pde_config, device): """ A "smoke test" to ensure the FBSDESolver can be @@ -33,33 +37,31 @@ def test_pde_solver_smoke_test(pde_config, device): """ # 1. Initialize Equation pde = HJBLQ(pde_config) - + # 2. Initialize Solver layer_sizes = [pde.dim + 1] + [32, 32] + [pde.dim] solver = FBSDESolver( pde=pde, layer_sizes=layer_sizes, learning_rate=0.001, - solving_method='DTNN', - device=device + solving_method="DTNN", + device=device, ) - + # 3. Run solver for a few steps num_iterations = 2 batch_size = 16 init_y = 0.5 - + losses, inits = solver.solve( - num_iterations=num_iterations, - batch_size=batch_size, - init_y=init_y + num_iterations=num_iterations, batch_size=batch_size, init_y=init_y ) - + # 4. 
Check results assert isinstance(losses, list) assert len(losses) == num_iterations assert isinstance(losses[0], float) - + assert isinstance(inits, list) assert len(inits) == num_iterations - assert isinstance(inits[0], float) \ No newline at end of file + assert isinstance(inits[0], float) diff --git a/test/test_consolidation.py b/test/test_consolidation.py index c06140e..da53dae 100644 --- a/test/test_consolidation.py +++ b/test/test_consolidation.py @@ -39,8 +39,9 @@ def test_lin_parts_matches_linear_partitions_numerically(): # Same partition boundaries for representative inputs. np.testing.assert_array_equal(lin_parts(100, 4), np.array([0, 25, 50, 75, 100])) - np.testing.assert_array_equal(lin_parts(10, 3), np.ceil( - np.linspace(0, 10, min(3, 10) + 1)).astype(int)) + np.testing.assert_array_equal( + lin_parts(10, 3), np.ceil(np.linspace(0, 10, min(3, 10) + 1)).astype(int) + ) # --------------------------------------------------------------------------- # @@ -70,8 +71,7 @@ def test_clustering_cov_to_corr_delegates_and_matches(): # Delegates to the canonical implementation. np.testing.assert_allclose(out, cov_to_corr(cov), rtol=0, atol=1e-12) # And matches an independent reference. - np.testing.assert_allclose(out, _reference_cov_to_corr(cov), - rtol=0, atol=1e-12) + np.testing.assert_allclose(out, _reference_cov_to_corr(cov), rtol=0, atol=1e-12) # Correlation diagonal is exactly 1. 
np.testing.assert_allclose(np.diag(out), np.ones(n), rtol=0, atol=1e-12) diff --git a/test/test_performance.py b/test/test_performance.py index c6bc5d7..59b2e7e 100644 --- a/test/test_performance.py +++ b/test/test_performance.py @@ -35,7 +35,7 @@ def _frac_diff_std_reference(series, degree, threshold=0.01): arr = s.to_numpy() for iloc in range(skip, arr.shape[0]): result.loc[s.index[iloc], name] = np.dot( - weights[-(iloc + 1):].T, arr[:iloc + 1] + weights[-(iloc + 1) :].T, arr[: iloc + 1] )[0, 0] return result.dropna(how="all") @@ -92,9 +92,7 @@ def test_mp_avg_active_signals_matches_reference(): ) signals = signals[~signals.index.duplicated(keep="first")] - time_points = sorted( - set(signals["t1"].dropna().values).union(signals.index.values) - ) + time_points = sorted(set(signals["t1"].dropna().values).union(signals.index.values)) fast = mpAvgActiveSignals(signals, time_points) reference = _avg_active_reference(signals, time_points) @@ -117,8 +115,14 @@ def _triple_barrier_reference(close, events, ptsl, molecule): ef = events.loc[molecule] output = pd.DataFrame(index=ef.index) output["End Time"] = ef["End Time"] - pt = ptsl[0] * ef["Base Width"] if ptsl[0] > 0 else pd.Series(np.inf, index=ef.index) - sl = -ptsl[1] * ef["Base Width"] if ptsl[1] > 0 else pd.Series(-np.inf, index=ef.index) + pt = ( + ptsl[0] * ef["Base Width"] if ptsl[0] > 0 else pd.Series(np.inf, index=ef.index) + ) + sl = ( + -ptsl[1] * ef["Base Width"] + if ptsl[1] > 0 + else pd.Series(-np.inf, index=ef.index) + ) side = ef.get("Side", pd.Series(1.0, index=ef.index)) for loc, vbt in ef["End Time"].fillna(close.index[-1]).items(): path = close.loc[loc:vbt] @@ -134,7 +138,9 @@ def test_triple_barrier_matches_reference_randomized(): for _ in range(25): n = int(rng.integers(60, 300)) idx = pd.date_range("2020-01-01", periods=n, freq="min") - close = pd.Series(np.cumprod(1 + rng.standard_normal(n) * 0.01) * 100, index=idx) + close = pd.Series( + np.cumprod(1 + rng.standard_normal(n) * 0.01) * 100, 
index=idx + ) positions = np.sort( rng.choice(n - 2, size=int(rng.integers(3, 25)), replace=False) diff --git a/test/utils/test_ewma.py b/test/utils/test_ewma.py index 494db2f..1172760 100644 --- a/test/utils/test_ewma.py +++ b/test/utils/test_ewma.py @@ -7,6 +7,7 @@ import pandas as pd from RiskLabAI.utils.ewma import ewma + def test_ewma_vs_pandas(): """ Test that the Numba ewma function matches pandas @@ -14,28 +15,25 @@ def test_ewma_vs_pandas(): """ series_np = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0]) window = 3 - + # Our Numba implementation ewma_ours = ewma(series_np, window=window) - + # Pandas implementation - ewma_pandas = ( - pd.Series(series_np) - .ewm(span=window, adjust=True) - .mean() - .values - ) - + ewma_pandas = pd.Series(series_np).ewm(span=window, adjust=True).mean().values + assert np.allclose(ewma_ours, ewma_pandas) + def test_ewma_single_value(): """Test ewma with a single value.""" series_np = np.array([5.0]) ewma_ours = ewma(series_np, window=3) assert np.allclose(ewma_ours, [5.0]) + def test_ewma_empty_value(): """Test ewma with an empty array.""" series_np = np.array([]) ewma_ours = ewma(series_np, window=3) - assert ewma_ours.shape == (0,) \ No newline at end of file + assert ewma_ours.shape == (0,) diff --git a/test/utils/test_momentum_mean_reverting_strategy_sides.py b/test/utils/test_momentum_mean_reverting_strategy_sides.py index 8181cba..f54c69d 100644 --- a/test/utils/test_momentum_mean_reverting_strategy_sides.py +++ b/test/utils/test_momentum_mean_reverting_strategy_sides.py @@ -5,13 +5,17 @@ import pytest import pandas as pd import numpy as np -from RiskLabAI.utils.momentum_mean_reverting_strategy_sides import determine_strategy_side +from RiskLabAI.utils.momentum_mean_reverting_strategy_sides import ( + determine_strategy_side, +) + @pytest.fixture def price_series(): """A simple price series.""" return pd.Series([100, 101, 102, 103, 104, 105, 104, 103, 102, 101]) + def test_momentum_strategy(price_series): """Test 
momentum (mean_reversion=False).""" # fast=2, slow=5 @@ -23,13 +27,14 @@ def test_momentum_strategy(price_series): # slow = [100, 100.5, 101, 101.5, 102, 103, 103.6, 103.8, 103.6, 103] # fast >= slow: [T, T, T, T, T, T, T, F, F, F] # signal: [1, 1, 1, 1, 1, 1, 1, -1, -1, -1] - + sides = determine_strategy_side( price_series, fast_window=2, slow_window=5, mean_reversion=False ) expected = pd.Series([1, 1, 1, 1, 1, 1, 1, -1, -1, -1]) pd.testing.assert_series_equal(sides, expected) + def test_mean_reversion_strategy(price_series): """Test mean reversion (mean_reversion=True).""" # Should be the inverse of the momentum test @@ -39,6 +44,7 @@ def test_mean_reversion_strategy(price_series): expected = pd.Series([-1, -1, -1, -1, -1, -1, -1, 1, 1, 1]) pd.testing.assert_series_equal(sides, expected) + def test_exponential_ma(price_series): """Test that exponential=True runs.""" sides = determine_strategy_side( @@ -48,10 +54,11 @@ def test_exponential_ma(price_series): assert sides.shape == (10,) assert sides.isin([1, -1]).all() + def test_window_error(price_series): """Test that fast_window >= slow_window raises an error.""" with pytest.raises(ValueError, match="fast_window must be smaller"): determine_strategy_side(price_series, fast_window=5, slow_window=2) - + with pytest.raises(ValueError, match="fast_window must be smaller"): - determine_strategy_side(price_series, fast_window=5, slow_window=5) \ No newline at end of file + determine_strategy_side(price_series, fast_window=5, slow_window=5) diff --git a/test/utils/test_progress.py b/test/utils/test_progress.py index 78c6aa1..c293780 100644 --- a/test/utils/test_progress.py +++ b/test/utils/test_progress.py @@ -6,25 +6,28 @@ import time from RiskLabAI.utils.progress import progress_bar + def test_progress_bar_start(capsys): """Test the progress bar at the start (0%).""" progress_bar(0, 100, time.time()) captured = capsys.readouterr() assert "Completed: [ ] 0% - Calculating..." 
in captured.out + def test_progress_bar_mid(capsys): """Test the progress bar in the middle (50%).""" - start_time = time.time() - 10 # Pretend 10s have passed + start_time = time.time() - 10 # Pretend 10s have passed progress_bar(50, 100, start_time) captured = capsys.readouterr() - + assert "Completed: [--------->" in captured.out assert "] 50% - " in captured.out assert "minutes remaining" in captured.out + def test_progress_bar_end(capsys): """Test the progress bar at the end (100%).""" progress_bar(100, 100, time.time()) captured = capsys.readouterr() assert "Completed: [--------------------] 100% - Task completed!" in captured.out - assert captured.out.endswith('\n') # Should print a newline \ No newline at end of file + assert captured.out.endswith("\n") # Should print a newline