Source code for Orange.preprocess.discretize

import re
from collections import namedtuple

import numpy as np
import scipy.sparse as sp

from Orange.data import DiscreteVariable, Domain
from Orange.data.sql.table import SqlTable
from Orange.preprocess.util import _RefuseDataInConstructor
from Orange.statistics import distribution, contingency, util as ut
from Orange.statistics.basic_stats import BasicStats
from Orange.util import Reprable
from .transformation import Transformation
from . import _discretize

__all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer",
           "decimal_binnings", "get_bins"]


class Discretizer(Transformation):
    """Value transformer that returns an index of the bin for the given value.
    """
    def __init__(self, variable, points):
        super().__init__(variable)
        self.points = points

    @staticmethod
    def digitize(x, bins):
        if sp.issparse(x):
            if len(bins):
                x.data = np.digitize(x.data, bins)
            else:
                x = sp.csr_matrix(x.shape)
            return x
        else:
            return np.digitize(x, bins) if len(bins) else [0]*len(x)

    def transform(self, c):
        if sp.issparse(c):
            return self.digitize(c, self.points)
        elif c.size:
            return np.where(np.isnan(c), np.NaN, self.digitize(c, self.points))
        else:
            return np.array([], dtype=int)

    @staticmethod
    def _fmt_interval(low, high, formatter):
        assert low is not None or high is not None
        assert low is None or high is None or low < high
        if low is None or np.isinf(low):
            return f"< {formatter(high)}"
        if high is None or np.isinf(high):
            return f"≥ {formatter(low)}"
        return f"{formatter(low)} - {formatter(high)}"

    @classmethod
    def create_discretized_var(cls, var, points):
        def fmt(val):
            sval = var.str_val(val)
            # For decimal numbers, remove trailing 0's and . if no decimals left
            if re.match(r"^\d+\.\d+", sval):
                return sval.rstrip("0").rstrip(".")
            return sval

        lpoints = list(points)
        if lpoints:
            values = [
                cls._fmt_interval(low, high, fmt)
                for low, high in zip([-np.inf] + lpoints, lpoints + [np.inf])]
            to_sql = BinSql(var, lpoints)
        else:
            values = ["single_value"]
            to_sql = SingleValueSql(values[0])

        dvar = DiscreteVariable(name=var.name, values=values,
                                compute_value=cls(var, points),
                                sparse=var.sparse)
        dvar.source_variable = var
        dvar.to_sql = to_sql
        return dvar


class BinSql:
    def __init__(self, var, points):
        self.var = var
        self.points = points

    def __call__(self):
        return 'width_bucket(%s, ARRAY%s::double precision[])' % (
            self.var.to_sql(), str(self.points))


class SingleValueSql:
    def __init__(self, value):
        self.value = value

    def __call__(self):
        return "'%s'" % self.value


[docs]class Discretization(Reprable): """Abstract base class for discretization classes.""" def __call__(self, data, variable): """ Compute discretization of the given variable on the given data. Return a new variable with the appropriate domain (:obj:`Orange.data.DiscreteVariable.values`) and transformer (:obj:`Orange.data.Variable.compute_value`). """ raise NotImplementedError( "Subclasses of 'Discretization' need to implement " "the call operator")
[docs]class EqualFreq(Discretization): """Discretization into bins with approximately equal number of data instances. .. attribute:: n Number of bins (default: 4). The actual number may be lower if the variable has less than n distinct values. """ def __init__(self, n=4): self.n = n # noinspection PyProtectedMember def __call__(self, data, attribute): if type(data) == SqlTable: att = attribute.to_sql() quantiles = [(i + 1) / self.n for i in range(self.n - 1)] query = data._sql_query( ['quantile(%s, ARRAY%s)' % (att, str(quantiles))], use_time_sample=1000) with data._execute_sql_query(query) as cur: points = sorted(set(cur.fetchone()[0])) else: d = distribution.get_distribution(data, attribute) points = _discretize.split_eq_freq(d, self.n) return Discretizer.create_discretized_var( data.domain[attribute], points)
[docs]class EqualWidth(Discretization): """Discretization into a fixed number of bins with equal widths. .. attribute:: n Number of bins (default: 4). """ def __init__(self, n=4): self.n = n # noinspection PyProtectedMember def __call__(self, data, attribute, fixed=None): if fixed: min, max = fixed[attribute.name] points = self._split_eq_width(min, max) else: if type(data) == SqlTable: stats = BasicStats(data, attribute) points = self._split_eq_width(stats.min, stats.max) else: values = data[:, attribute] values = values.X if values.X.size else values.Y min, max = ut.nanmin(values), ut.nanmax(values) points = self._split_eq_width(min, max) return Discretizer.create_discretized_var( data.domain[attribute], points) def _split_eq_width(self, min, max): if np.isnan(min) or np.isnan(max) or min == max: return [] dif = (max - min) / self.n return [min + (i + 1) * dif for i in range(self.n - 1)]
BinDefinition = namedtuple("BinDefinition", ("start", "nbins", "width")) def decimal_binnings( data, *, min_width=0, min_bins=2, max_bins=50, min_unique=5, add_unique=None, factors=(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01), return_defs=False): """ Find a set of nice splits of data into bins The function first computes the scaling factor that is, the power of 10 that brings the interval of values within [0, 1]. For instances, if the numbers come from interaval 10004001 and 10007005, the width of the interval is 3004, so the scaling factor is 1000. The function next considers bin widths that are products of scaling and different factors from 20 to 0.01 that make sense in decimal scale (see default value for argument `factors`). For each width, it rounds the minimal value down to this width and the maximal value up, and it computes the number of bins of that width that fit between these two values. If the number of bins is between `min_bins` and `max_bins`, and the width is at least `min_width`, this is a valid interval. If the data has no more than `min_unique` unique values, the function will add a set of bins that put each value into its own bin. If the data has no more than `add_unique` values, that last bins will put each value into its own bin. Args: data (np.ndarray): vector of data points; values may repeat, and nans and infs are filtered out. min_width (float): minimal bin width min_bins (int): minimal number of bins max_bins (int): maximal number of bins; the number of bins will never exceed the number of unique values min_unique (int): if the number of unique values are less or equal to `min_unique`, the function returns a single binning that matches that values in the data add_unique (int): similar to `min_unique` except that such bins are added to the list factors (list of float): The factors with which the scaling is multiplied. Default is `(20, 10, 5, 2, 1, 0.5, 0.25, 0.2, 0.1, 0.05, 0.025, 0.02, 0.01)`, so if scaling is 1000, considered bin widths are 20000, 10000, 5000, 2000, 1000, 500, 250, 200, 100, 50, 25, 20 and 10. return_defs (bool): If set to `True`, the function returns a list of instances of `BinDefinition`, otherwise a list of bin boundaries. Returns: bin_boundaries (list of np.ndarray): a list of bin boundaries, including the top boundary of the last interval, hence the list size equals the number bins + 1. These array match the `bin` argument of `numpy.histogram`. This is returned if `return_defs` is left `True`. bin_definition (list of BinDefinition): `BinDefinition` is a named tuple containing the beginning of the first bin (`start`), number of bins (`nbins`) and their widths (`width`). The last value can also be a `nd.array` with `nbins + 1` elements, which describes bins of unequal width and is used for binnings that match the unique values in the data (see `min_unique` and `add_unique`). This is returned if `return_defs` is `False`. """ def unique_bins(): if len(unique) >= 2: # make the last bin the same width as the one before last_boundary = 2 * unique[-1] - unique[-2] else: last_boundary = unique[0] + 1 return BinDefinition( unique[0], len(unique), np.hstack((unique, [last_boundary]))) unique = np.unique(data) unique = unique[np.isfinite(unique)] if not unique.size: raise ValueError("no valid (non-nan) data") mn, mx = unique[0], unique[-1] if mn == mx or len(unique) <= min_unique: bins = unique_bins() if not return_defs: bins = get_bins(bins) return [bins] diff = mx - mn f10 = 10 ** -np.floor(np.log10(diff)) bins = [] max_bins = min(max_bins, len(unique)) for f in factors: width = f / f10 if width < min_width: continue mn_ = np.floor(mn / width) * width mx_ = np.ceil(mx / width) * width nbins = np.round((mx_ - mn_) / width) if min_bins <= nbins <= max_bins \ and (not bins or bins[-1].nbins != nbins): bins.append(BinDefinition(mn_, nbins, width)) if add_unique is not None and len(unique) <= add_unique: if bins and bins[-1].nbins == len(unique): del bins[-1] bins.append(unique_bins()) if len(unique) < min_unique: del bins[:-1] if not return_defs: bins = [get_bins(bin) for bin in bins] return bins def get_bins(bin_def: BinDefinition): """ Return a `np.ndarray` corresponding to interval Args: bin_def (BinDefinition): definition of bins. a named tuple containing the beginning of the first bin (`start`), number of bins (`nbins`) and their widths (`width`). The last value can also be a `nd.array` with `nbins + 1` elements, in which case the function returns this as a result. Returns: bin boundaries (np.ndarray): bin boundaries including the top boundary of the last interval, hence the list size equals `bin_def.nbins + 1`. This array matches the `bin` argument of `numpy.histogram`. """ if isinstance(bin_def.width, np.ndarray): return bin_def.width return bin_def.start + bin_def.width * np.arange(bin_def.nbins + 1) # noinspection PyPep8Naming
[docs]class EntropyMDL(Discretization): """ Discretization into bins inferred by recursively splitting the values to minimize the class-entropy. The procedure stops when further splits would decrease the entropy for less than the corresponding increase of minimal description length (MDL). [FayyadIrani93]. If there are no suitable cut-off points, the procedure returns a single bin, which means that the new feature is constant and can be removed. .. attribute:: force Induce at least one cut-off point, even when its information gain is lower than MDL (default: False). """ def __init__(self, force=False): self.force = force def __call__(self, data, attribute): cont = contingency.get_contingency(data, attribute) values, I = cont.values, cont.counts.T cut_ind = np.array(self._entropy_discretize_sorted(I, self.force)) if len(cut_ind) > 0: # "the midpoint between each successive pair of examples" (FI p.1) points = (values[cut_ind] + values[cut_ind - 1]) / 2. else: points = [] return Discretizer.create_discretized_var( data.domain[attribute], points) @classmethod def _normalize(cls, X, axis=None, out=None): """ Normalize `X` array so it sums to 1.0 over the `axis`. Parameters ---------- X : array Array to normalize. axis : optional int Axis over which the resulting array sums to 1. out : optional array Output array of the same shape as X. """ X = np.asarray(X, dtype=float) scale = np.sum(X, axis=axis, keepdims=True) if out is None: return X / scale else: if out is not X: assert out.shape == X.shape out[:] = X out /= scale return out @classmethod def _entropy_normalized(cls, D, axis=None): """ Compute the entropy of distribution array `D`. `D` must be a distribution (i.e. sum to 1.0 over `axis`) Parameters ---------- D : array Distribution. axis : optional int Axis of `D` along which to compute the entropy. """ # req: (np.sum(D, axis=axis) >= 0).all() # req: (np.sum(D, axis=axis) <= 1).all() # req: np.all(np.abs(np.sum(D, axis=axis) - 1) < 1e-9) D = np.asarray(D) Dc = np.clip(D, np.finfo(D.dtype).eps, 1.0) return - np.sum(D * np.log2(Dc), axis=axis) @classmethod def _entropy(cls, D, axis=None): """ Compute the entropy of distribution `D`. Parameters ---------- D : array Distribution. axis : optional int Axis of `D` along which to compute the entropy. """ D = cls._normalize(D, axis=axis) return cls._entropy_normalized(D, axis=axis) @classmethod def _entropy1(cls, D): """ Compute the entropy of distributions in `D` (one per each row). """ D = cls._normalize(D) return _discretize.entropy_normalized1(D) @classmethod def _entropy2(cls, D): """ Compute the entropy of distributions in `D` (one per each row). """ D = cls._normalize(D, axis=1) return _discretize.entropy_normalized2(D) @classmethod def _entropy_cuts_sorted(cls, CS): """ Return the class information entropy induced by partitioning the `CS` distribution at all N-1 candidate cut points. Parameters ---------- CS : (N, K) array of class distributions. """ CS = np.asarray(CS) # |--|-------|--------| # S1 ^ S2 # S1 contains all points which are <= to cut point # Cumulative distributions for S1 and S2 (left right set) # i.e. a cut at index i separates the CS into S1Dist[i] and S2Dist[i] S1Dist = np.cumsum(CS, axis=0)[:-1] S2Dist = np.cumsum(CS[::-1], axis=0)[-2::-1] # Entropy of S1[i] and S2[i] sets ES1 = cls._entropy2(S1Dist) ES2 = cls._entropy2(S2Dist) # Number of cases in S1[i] and S2[i] sets S1_count = np.sum(S1Dist, axis=1) S2_count = np.sum(S2Dist, axis=1) # Number of all cases S_count = np.sum(CS) ES1w = ES1 * S1_count / S_count ES2w = ES2 * S2_count / S_count # E(A, T; S) Class information entropy of the partition S E = ES1w + ES2w return E, ES1, ES2 @classmethod def _entropy_discretize_sorted(cls, C, force=False): """ Entropy discretization on a sorted C. :param C: (N, K) array of class distributions. """ E, ES1, ES2 = cls._entropy_cuts_sorted(C) # TODO: Also get the left right distribution counts from # entropy_cuts_sorted, # Note the + 1 if len(E) == 0: return [] cut_index = np.argmin(E) + 1 # Distribution of classed in S1, S2 and S S1_c = np.sum(C[:cut_index], axis=0) S2_c = np.sum(C[cut_index:], axis=0) S_c = S1_c + S2_c ES = cls._entropy1(np.sum(C, axis=0)) ES1, ES2 = ES1[cut_index - 1], ES2[cut_index - 1] # Information gain of the best split Gain = ES - E[cut_index - 1] # Number of different classes in S, S1 and S2 k = float(np.sum(S_c > 0)) k1 = float(np.sum(S1_c > 0)) k2 = float(np.sum(S2_c > 0)) assert k > 0 delta = np.log2(3 ** k - 2) - (k * ES - k1 * ES1 - k2 * ES2) N = float(np.sum(S_c)) if Gain > np.log2(N - 1) / N + delta / N: # Accept the cut point and recursively split the subsets. left, right = [], [] if k1 > 1 and cut_index > 1: left = cls._entropy_discretize_sorted(C[:cut_index, :]) if k2 > 1 and cut_index < len(C) - 1: right = cls._entropy_discretize_sorted(C[cut_index:, :]) return left + [cut_index] + [i + cut_index for i in right] elif force: return [cut_index] else: return []
class DomainDiscretizer(_RefuseDataInConstructor, Reprable): """Discretizes all continuous features in the data. .. attribute:: method Feature discretization method (instance of :obj:`Orange.preprocess.Discretization`). If `None` (default), :class:`Orange.preprocess.EqualFreq` with 4 intervals is used. .. attribute:: clean If `True`, features discretized into a single interval constant are removed. This is useful for discretization methods that infer the number of intervals from the data, such as :class:`Orange.preprocess.EntropyMDL` (default: `True`). .. attribute:: discretize_class Determines whether a target is also discretized if it is continuous. (default: `False`) """ def __init__(self, discretize_class=False, method=None, clean=True, fixed=None): self.discretize_class = discretize_class self.method = method self.clean = clean self.fixed = fixed def __call__(self, data, fixed=None): """ Compute and return discretized domain. :param data: Data to discretize. """ def transform_list(s, fixed=None): new_vars = [] for var in s: if var.is_continuous: if fixed and var.name in fixed.keys(): nv = method(data, var, fixed) else: nv = method(data, var) if not self.clean or len(nv.values) > 1: new_vars.append(nv) else: new_vars.append(var) return new_vars if self.method is None: method = EqualFreq(n=4) else: method = self.method domain = data.domain new_attrs = transform_list(domain.attributes, fixed or self.fixed) if self.discretize_class: new_classes = transform_list(domain.class_vars) else: new_classes = domain.class_vars return Domain(new_attrs, new_classes)