diff --git a/.gitignore b/.gitignore
index 0dc1ec51..91d307a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@ devenv.local.nix
 pom.xml
 pom.xml.asc
 sum-product-dsl/
+/roc.png
diff --git a/data/.gitignore b/data/.gitignore
index 0eb04f4a..51fccf0d 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -39,3 +39,4 @@
 /predictions.csv
 /synthetic-data-iql.csv
 /db.edn
+/ml-metrics.csv
diff --git a/dvc.yaml b/dvc.yaml
index f4f29fc6..465ada1f 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -547,3 +547,21 @@ stages:
       - data/xcat
     outs:
       - data/db.edn
+
+  roc-curve:
+    cmd: >
+      python scripts/evaluate-classifier-roc.py
+    deps:
+      - data/predictions.csv
+    params:
+      - synthetic_data_evaluation.positive_label
+    outs:
+      - roc.png
+
+  ml-metrics:
+    cmd: >
+      python scripts/evaluate-classifier-statistics.py
+    deps:
+      - data/predictions.csv
+    outs:
+      - data/ml-metrics.csv
diff --git a/params.yaml b/params.yaml
index 12efa20e..4cc96b22 100644
--- a/params.yaml
+++ b/params.yaml
@@ -66,7 +66,8 @@ mi:
   # health_status: ["c) Average", "b) Below average"]
 synthetic_data_evaluation:
   # If target is not specified, a random target is chosen for prediction.
-  #target: Apogee_km
+  target: CC22_320a
+  positive_label: "yes" # XXX
   predictor: Random_forest # One of "Random_forest" or "GLM"
   #N: 10000 # Subsample held-out dataframe with 1000 samples
 database:
diff --git a/scripts/evaluate-classifier-roc.py b/scripts/evaluate-classifier-roc.py
new file mode 100755
index 00000000..0a119f08
--- /dev/null
+++ b/scripts/evaluate-classifier-roc.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_curve, auc
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
+import yaml
+
+# Generate a generic ROC curve for the results found in 'data/predictions.csv'
+# (relative to the current directory) and save the plot as 'roc.png'.
+
+df = pd.read_csv("data/predictions.csv", header=0)
+yv = df["prediction"]
+yp = df["predictive-probability"]
+tv = df["true_value"]
+
+with open("params.yaml", "r") as f:
+    params = yaml.safe_load(f.read())
+# Label of the class that roc_curve treats as the positive outcome.
+pos_label = params["synthetic_data_evaluation"]["positive_label"]
+
+# Compute ROC curve and ROC area
+fpr, tpr, thresholds = roc_curve(tv, yp, pos_label=pos_label)
+roc_auc = auc(fpr, tpr)
+
+# Plot ROC curve
+plt.figure()
+plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
+plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
+plt.xlim([0.0, 1.0])
+plt.ylim([0.0, 1.05])
+plt.xlabel("False Positive Rate")
+plt.ylabel("True Positive Rate")
+plt.title("Receiver Operating Characteristic")
+plt.legend(loc="lower right")
+plt.savefig("roc.png")
diff --git a/scripts/evaluate-classifier-statistics.py b/scripts/evaluate-classifier-statistics.py
new file mode 100755
index 00000000..aa69cb56
--- /dev/null
+++ b/scripts/evaluate-classifier-statistics.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python3
+
+import numpy as np
+import pandas as pd
+
+df = pd.read_csv("data/predictions.csv", header=0)
+
+# Show some generic metrics for the results found in 'data/predictions.csv'.
+# The sklearn metric functions take their arguments as (y_true, y_pred).
+
+y_true = df["true_value"]
+y_pred = df["prediction"]
+
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+print("Accuracy...: %f" % accuracy_score(y_true, y_pred))
+print("Precision..: %f" % precision_score(y_true, y_pred, average="macro"))
+print("Recall.....: %f" % recall_score(y_true, y_pred, average="macro"))
+print("F1.........: %f" % f1_score(y_true, y_pred, average="macro"))
+
+# Also save to disk. Helpful to track the result with DVC.
+result = pd.DataFrame(
+    {
+        "metric": ["Accuracy", "Precision", "Recall", "F1"],
+        "score": [
+            accuracy_score(y_true, y_pred),
+            precision_score(y_true, y_pred, average="macro"),
+            recall_score(y_true, y_pred, average="macro"),
+            f1_score(y_true, y_pred, average="macro"),
+        ],
+    }
+)
+result.to_csv("data/ml-metrics.csv", index=False)
diff --git a/scripts/predict.py b/scripts/predict.py
index 6aab64da..72992c9b 100644
--- a/scripts/predict.py
+++ b/scripts/predict.py
@@ -174,6 +174,7 @@ def main():
         "training_data": [],
         "test_data": [],
         "prediction": [],
+        "predictive-probability": [],
         "true_value": [],
     }
     for train_dataset_path in args.training:
@@ -193,8 +194,18 @@ def main():
         # Need to call NP.array.flatten() here because CatBoost decides to
         # wrap prediction into a separate list.
         results["prediction"].extend((ml_model.predict(X_test).flatten().tolist()))
-        results["true_value"].extend(y_test.tolist())
 
+        # Add a new column with the predicted probability of the positive label.
+        # Although it looks evil, use of 'classes_' is documented here:
+        # https://scikit-learn.org/stable/modules/generated/sklearn.
+        # linear_model.LogisticRegression.html#sklearn.linear_model.
+        # LogisticRegression.predict_proba
+        probabilities = ml_model.predict_proba(X_test)
+        pos_label = config["positive_label"]
+        j = list(ml_model.classes_).index(pos_label)
+        results["predictive-probability"].extend(row[j] for row in probabilities)
+
+        results["true_value"].extend(y_test.tolist())
         n_test_datapoints = y_test.shape[0]
         results["target"].extend([target] * n_test_datapoints)
         results["training_data"].extend([train_dataset_path] * n_test_datapoints)