Source code for Orange.classification.calibration

import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import _SigmoidCalibration

from Orange.classification import Model, Learner
from Orange.evaluation import TestOnTrainingData
from Orange.evaluation.performance_curves import Curves

__all__ = ["ThresholdClassifier", "ThresholdLearner",
           "CalibratedLearner", "CalibratedClassifier"]


class ThresholdClassifier(Model):
    """
    A model that wraps a binary model and sets a different threshold.

    The target class is the class with index 1. A data instance is classified
    to class 1 if the probability of this class equals or exceeds the
    threshold.

    Attributes:
        base_model (Orange.classification.Model): base model
        threshold (float): decision threshold
    """
    def __init__(self, base_model, threshold):
        if not base_model.domain.class_var.is_discrete \
                or len(base_model.domain.class_var.values) != 2:
            raise ValueError("ThresholdClassifier requires a binary class")

        super().__init__(base_model.domain, base_model.original_domain)
        self.name = f"{base_model.name}, thresh={threshold:.2f}"
        self.base_model = base_model
        self.threshold = threshold

    def __call__(self, data, ret=Model.Value):
        probs = self.base_model(data, ret=Model.Probs)
        if ret == Model.Probs:
            return probs
        class_probs = probs[:, 1].ravel()
        with np.errstate(invalid="ignore"):  # we fix the nans below
            vals = (class_probs >= self.threshold).astype(float)
        vals[np.isnan(class_probs)] = np.nan
        if ret == Model.Value:
            return vals
        else:
            return vals, probs
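

# Editorial usage sketch, not part of the original module: wrapping an already
# trained binary model with a custom threshold. The "heart_disease" data set
# and the 0.7 threshold are illustrative choices.
def _example_threshold_classifier():
    from Orange.data import Table
    from Orange.classification import LogisticRegressionLearner

    data = Table("heart_disease")
    base_model = LogisticRegressionLearner()(data)
    clf = ThresholdClassifier(base_model, threshold=0.7)
    return clf(data)  # predicts class 1 only where P(class 1) >= 0.7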


class ThresholdLearner(Learner):
    """
    A learner that runs another learner and then finds the optimal threshold
    for CA or F1 on the training data.

    Attributes:
        base_learner (Learner): base learner
        threshold_criterion (int):
            `ThresholdLearner.OptimizeCA` or `ThresholdLearner.OptimizeF1`
    """
    __returns__ = ThresholdClassifier

    OptimizeCA, OptimizeF1 = range(2)

    def __init__(self, base_learner, threshold_criterion=OptimizeCA):
        super().__init__()
        self.base_learner = base_learner
        self.threshold_criterion = threshold_criterion

    def fit_storage(self, data):
        """
        Induce a model using the provided `base_learner`, compute the
        probabilities on the training data and find the optimal decision
        threshold. In case of ties, select the threshold that is closest
        to 0.5.
        """
        if not data.domain.class_var.is_discrete \
                or len(data.domain.class_var.values) != 2:
            raise ValueError("ThresholdLearner requires a binary class")

        res = TestOnTrainingData(data, [self.base_learner], store_models=True)
        model = res.models[0, 0]
        curves = Curves.from_results(res)
        curve = [curves.ca, curves.f1][self.threshold_criterion]()
        # In case of ties, pick the optimal threshold that is closest to 0.5
        best_threshs = curves.probs[curve == np.max(curve)]
        threshold = best_threshs[
            min(np.searchsorted(best_threshs, 0.5), len(best_threshs) - 1)]
        return ThresholdClassifier(model, threshold)
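

# Editorial usage sketch, not part of the original module: ThresholdLearner
# fits the base learner and selects the threshold that maximizes the chosen
# score on the training data. The data set and criterion are illustrative.
def _example_threshold_learner():
    from Orange.data import Table
    from Orange.classification import LogisticRegressionLearner

    data = Table("heart_disease")
    learner = ThresholdLearner(
        LogisticRegressionLearner(),
        threshold_criterion=ThresholdLearner.OptimizeF1)
    model = learner(data)  # returns a ThresholdClassifier
    return model.threshold  # the F1-optimal decision threshold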


class CalibratedClassifier(Model):
    """
    A model that wraps another model and recalibrates probabilities.

    Attributes:
        base_model (Model): base model
        calibrators (list of callable):
            list of functions that get a vector of probabilities and return
            calibrated probabilities
    """
    def __init__(self, base_model, calibrators):
        if not base_model.domain.class_var.is_discrete:
            raise ValueError(
                "CalibratedClassifier requires a discrete target")

        super().__init__(base_model.domain, base_model.original_domain)
        self.base_model = base_model
        self.calibrators = calibrators
        self.name = f"{base_model.name}, calibrated"

    def __call__(self, data, ret=Model.Value):
        probs = self.base_model(data, Model.Probs)
        cal_probs = self.calibrated_probs(probs)
        if ret == Model.Probs:
            return cal_probs
        vals = np.argmax(cal_probs, axis=1)
        if ret == Model.Value:
            return vals
        else:
            return vals, cal_probs

    def calibrated_probs(self, probs):
        if self.calibrators:
            ps = np.hstack(
                tuple(
                    calibr.predict(cls_probs).reshape(-1, 1)
                    for calibr, cls_probs in zip(self.calibrators, probs.T)))
        else:
            ps = probs.copy()
        sums = np.sum(ps, axis=1)
        zero_sums = sums == 0
        with np.errstate(invalid="ignore"):  # zero sums are handled below
            ps /= sums[:, None]
        if zero_sums.any():
            ps[zero_sums] = 1 / ps.shape[1]
        return ps
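

# Editorial sketch, not part of the original module: the renormalization in
# `calibrated_probs` divides each row by its sum and replaces all-zero rows
# with the uniform distribution. The toy array below is illustrative.
def _example_probability_renormalization():
    ps = np.array([[0.2, 0.6], [0.0, 0.0]])
    sums = np.sum(ps, axis=1)
    zero_sums = sums == 0
    with np.errstate(invalid="ignore"):  # 0/0 produces nan, fixed below
        ps /= sums[:, None]
    ps[zero_sums] = 1 / ps.shape[1]
    return ps  # [[0.25, 0.75], [0.5, 0.5]]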


class CalibratedLearner(Learner):
    """
    Probability calibration for learning algorithms.

    This learner wraps another learner, so that after training, it predicts
    the probabilities on the training data and calibrates them using sigmoid
    or isotonic calibration. It then returns a :obj:`CalibratedClassifier`.

    Attributes:
        base_learner (Learner): base learner
        calibration_method (int):
            `CalibratedLearner.Sigmoid` or `CalibratedLearner.Isotonic`
    """
    __returns__ = CalibratedClassifier

    Sigmoid, Isotonic = range(2)

    def __init__(self, base_learner, calibration_method=Sigmoid):
        super().__init__()
        self.base_learner = base_learner
        self.calibration_method = calibration_method

    def fit_storage(self, data):
        """
        Induce a model using the provided `base_learner`, compute the
        probabilities on the training data and use sklearn's
        `_SigmoidCalibration` or `IsotonicRegression` to prepare calibrators.
        """
        res = TestOnTrainingData(data, [self.base_learner], store_models=True)
        model = res.models[0, 0]
        probabilities = res.probabilities[0]
        return self.get_model(model, res.actual, probabilities)

    def get_model(self, model, ytrue, probabilities):
        if self.calibration_method == CalibratedLearner.Sigmoid:
            fitter_class, fitter_args = _SigmoidCalibration, {}
        else:
            fitter_class, fitter_args = (
                IsotonicRegression, {"out_of_bounds": "clip"})
        probabilities[np.isinf(probabilities)] = 1
        # Fit a one-vs-rest calibrator per class; a fresh fitter is
        # constructed for each class because `fit` returns the estimator
        # itself, so reusing one instance would alias all calibrators
        calibrators = [
            fitter_class(**fitter_args).fit(cls_probs, ytrue == cls_idx)
            for cls_idx, cls_probs in enumerate(probabilities.T)]
        return CalibratedClassifier(model, calibrators)
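

# Editorial usage sketch, not part of the original module: calibrating naive
# Bayes probabilities with isotonic regression. The "heart_disease" data set
# and the choice of learner are illustrative.
def _example_calibrated_learner():
    from Orange.data import Table
    from Orange.classification import NaiveBayesLearner

    data = Table("heart_disease")
    learner = CalibratedLearner(
        NaiveBayesLearner(),
        calibration_method=CalibratedLearner.Isotonic)
    model = learner(data)  # returns a CalibratedClassifier
    return model(data, ret=Model.Probs)  # calibrated class probabilities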