diff --git a/experiments/cn_vs_noisybn/config.yaml b/experiments/cn_vs_noisybn/config.yaml index 73a3cbe..3c4800b 100644 --- a/experiments/cn_vs_noisybn/config.yaml +++ b/experiments/cn_vs_noisybn/config.yaml @@ -36,4 +36,4 @@ n_infer: 1000 # Number of inferences to pe # Other seed: 42 # Global seed -num_cores: 'multiprocessing.cpu_count() - 1' # Number of threads to use for parallelization +num_cores: 'multiprocessing.cpu_count() - 1' # Number of threads to use for parallelization \ No newline at end of file diff --git a/src/data.py b/src/data.py index fad3a34..167b2ac 100644 --- a/src/data.py +++ b/src/data.py @@ -1,8 +1,8 @@ from itertools import product from pprint import pformat -from numpy.random import randint import pyagrum as gum +from numpy.random import randint from src.config import create_clean_dir, get_base_path, set_global_seed from src.utils import compact_dict @@ -26,7 +26,10 @@ def generate_naivebayes(config): # Set BN (naive Bayes) structure n_modmax = config["n_modmax"] - bn_str_gen = (f'{config["target_var"]}->X{i}[{randint(2, n_modmax+1)}]' for i in range(config["n_nodes"] - 1)) + bn_str_gen = ( + f'{config["target_var"]}->X{i}[{randint(2, n_modmax+1)}]' + for i in range(config["n_nodes"] - 1) + ) bn_str = "; ".join(bn_str_gen) # For each model ... diff --git a/src/inference.py b/src/inference.py index d74b67f..1a59c05 100644 --- a/src/inference.py +++ b/src/inference.py @@ -6,7 +6,7 @@ from more_itertools import random_product from src.config import get_base_path, set_global_seed -from src.utils import (add_counts_to_bn, get_min_max_bns, noisy_bn, safe_assert) +from src.utils import add_counts_to_bn, get_min_max_bns, noisy_bn, safe_assert def run_inferences(exp, ess, eps, config): @@ -128,9 +128,7 @@ def cpt_value( # Get a naive Bayes log-joint -def nb_log_joint( - bn: gum.BayesNet, target: str, t: float, children: dict -) -> float: +def nb_log_joint(bn: gum.BayesNet, target: str, t: float, children: dict) -> float: """ Get log[P(target=t, children)] by exploiting the BN factorization. The DAG is a naive Bayes with `target` a binary target variable. diff --git a/src/utils.py b/src/utils.py index 4b0d48d..8dfad4a 100644 --- a/src/utils.py +++ b/src/utils.py @@ -32,7 +32,7 @@ def get_llr(x: dict, theta, theta_hat): # Check BNs sampled from a CN -def are_all_bns_different(bn_vec) -> None: +def are_all_bns_different(bn_vec) -> bool: signatures = set() for bn in bn_vec: @@ -46,6 +46,8 @@ def are_all_bns_different(bn_vec) -> None: print(f"({len(signatures)}/{len(bn_vec)} different BNs.)") + return len(signatures) == len(bn_vec) + # Add counts of events to a BN def add_counts_to_bn(bn, data): @@ -133,11 +135,12 @@ def get_min_max_bns(cn, exp: str): # Sample from a credal set K(x | pi_x), i.e., a constrained polytope. -def sample_from_cset(vec_min, vec_max): +def sample_from_cset(vec_min, vec_max, n_samples) -> list: """ - A credal set is a polytope in a space of #X parameters, defined by a: + We assume a credal set is a polytope in a space of #X parameters, defined by a: - Multi-dimensional rectangle, i.e., inequality constraint Ax <= b, and - Hyperplane (provided all the variables sum up to 1), i.e., equality constraint A_eq x = b_eq. + In the case of local IDM, this is ensured. """ # Define the rectangle @@ -158,70 +161,79 @@ def sample_from_cset(vec_min, vec_max): # Sample from the polytope mc = hopsy.MarkovChain(constrained_rectangle) rng = hopsy.RandomNumberGenerator(42) - _, constrained_samples = hopsy.sample(mc, rng, n_samples=1, thinning=10) - constrained_samples = constrained_samples.flatten() + _, constrained_samples = hopsy.sample(mc, rng, n_samples, thinning=10) + constrained_samples = constrained_samples[0] # Debug safe_assert(np.all(vec_min <= vec_max)) safe_assert(n_par == len(vec_max)) safe_assert(n_par == A.shape[1]) safe_assert(n_par == A_eq.shape[1]) - safe_assert(n_par == len(constrained_samples)) + safe_assert(len(constrained_samples) == n_samples) + for i in constrained_samples: + safe_assert(len(i) == n_par) return constrained_samples -# Sample from two esxtreme CPTs -def sample_from_cpts(cpt_min, cpt_max) -> np.array: +# Sample from two extreme CPTs +def sample_from_cpts(cpt_min, cpt_max, n_samples) -> list: # Transform CPTs into pandas dataframes cpt_min = np.atleast_2d(cpt_min.topandas()) cpt_max = np.atleast_2d(cpt_max.topandas()) - # Sample conditional distributions - cpt_sample = [] + # For each row in the CPT ... + credal_dict = {} for row in range(cpt_min.shape[0]): - vec_min = cpt_min[row, :] - vec_max = cpt_max[row, :] + # ... sample `n_samples` points from the credal set + credal_dict[row] = sample_from_cset(cpt_min[row, :], cpt_max[row, :], n_samples) + + # For each sample ... + cpt_samples = [] + for i in range(n_samples): - # Sample from polytope - vec_sample = sample_from_cset(vec_min, vec_max) - cpt_sample.append(vec_sample) + # ... build the CPT + cpt = [] + for row in range(cpt_min.shape[0]): + cpt.append(credal_dict[row][i]) - cpt_sample = np.array(cpt_sample).flatten() + cpt = np.array(cpt).flatten() + cpt_samples.append(cpt) # Debug safe_assert(cpt_min.shape == cpt_max.shape) - safe_assert(cpt_min.shape == cpt_max.shape) - safe_assert(prod(cpt_min.shape) == prod(cpt_max.shape)) - safe_assert(len(cpt_sample) == prod(cpt_min.shape)) + safe_assert(len(credal_dict) == prod(cpt_min.shape)) + safe_assert(len(cpt_samples) == n_samples) - return cpt_sample + return cpt_samples # BNs sampler from a CN -def sample_from_cn(cn, exp: str, n: int): +def sample_from_cn(cn, exp: str, n_samples: int) -> list: # Get the DAG and extreme BNs dag = gum.BayesNet(cn.current_bn()) bn_min, bn_max = get_min_max_bns(cn, exp) - # Draw n random BNs + # For each variable ... + cpts_dict = {} + for var in dag.names(): + + # ... sample `n_samples` CPTs from the CN + cpts_dict[var] = sample_from_cpts(bn_min.cpt(var), bn_max.cpt(var), n_samples) + + # For each sample ... bns = [] - for _ in range(n): + for i in range(n_samples): - # Init an empty BN + # ... init an empty BN ... bn = gum.BayesNet(dag) - # For each variable ... + # ... and fill its CPTs for var in dag.names(): - - # ... sample from the CN CPT, ... - cpt_sample = sample_from_cpts(bn_min.cpt(var), bn_max.cpt(var)) - - # ... and fill the BN's CPT - bn.cpt(var).fillWith(cpt_sample) + bn.cpt(var).fillWith(cpts_dict[var][i]) bns.append(bn) @@ -229,7 +241,8 @@ def sample_from_cn(cn, exp: str, n: int): safe_assert(check_consistency(bn, bn_min, bn_max)) # Debug - safe_assert(n == len(bns)) + safe_assert(len(cpts_dict) == len(dag.names())) + safe_assert(len(bns) == n_samples) return bns @@ -239,27 +252,27 @@ def check_consistency(bn, bn_min, bn_max) -> bool: for var in bn.names(): bn_cpt = np.atleast_2d(bn.cpt(var).topandas()) - bn_min_cpt = np.array(bn_min.cpt(var).topandas()) - bn_max_cpt = np.array(bn_max.cpt(var).topandas()) + bn_min_cpt = np.atleast_2d(bn_min.cpt(var).topandas()) + bn_max_cpt = np.atleast_2d(bn_max.cpt(var).topandas()) # Check if probabilities sum to 1 sum_vec = np.sum(bn_cpt, axis=1) - probability_integrity = np.all(np.abs(sum_vec - 1) < 1e-5) + probability_consistency = np.all(np.abs(sum_vec - 1) < 1e-5) # Check if the BN CPT is >= min CPT - min_integrity = np.all(bn_cpt >= bn_min_cpt) + min_consistency = np.all(bn_cpt >= bn_min_cpt) # Check if the BN CPT is <= max CPT - max_integrity = np.all(bn_cpt <= bn_max_cpt) + max_consistency = np.all(bn_cpt <= bn_max_cpt) - integrity = probability_integrity and min_integrity and max_integrity + consistency = probability_consistency and min_consistency and max_consistency - if integrity: + if consistency: continue else: - print("probability_integrity: ", probability_integrity) - print("min_integrity: ", min_integrity) - print("max_integrity: ", max_integrity) + print("probability_consistency: ", probability_consistency) + print("min_consistency: ", min_consistency) + print("max_consistency: ", max_consistency) return False return True