Source code for Orange.preprocess.fss
import random
from itertools import takewhile
from operator import itemgetter
import numpy as np
import Orange
from Orange.util import Reprable
from Orange.preprocess.score import ANOVA, GainRatio, \
UnivariateLinearRegression
__all__ = ["SelectBestFeatures", "SelectRandomFeatures"]
[docs]
class SelectBestFeatures(Reprable):
"""
A feature selector that builds a new dataset consisting of either the top
`k` features (if `k` is an `int`) or a proportion (if `k` is a `float`
between 0.0 and 1.0), or all those that exceed a given `threshold`. Features
are scored using the provided feature scoring `method`. By default it is
assumed that feature importance decreases with decreasing scores.
If both `k` and `threshold` are set, only features satisfying both
conditions will be selected.
If `method` is not set, it is automatically selected when presented with
the dataset. Datasets with both continuous and discrete features are
scored using a method suitable for the majority of features.
Parameters
----------
method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
Univariate feature scoring method.
k : int or float
The number or propotion of top features to select.
threshold : float
A threshold that a feature should meet according to the provided method.
decreasing : boolean
The order of feature importance when sorted from the most to the least
important feature.
"""
def __init__(self, method=None, k=None, threshold=None, decreasing=True):
self.method = method
self.k = k
self.threshold = threshold
self.decreasing = decreasing
def __call__(self, data):
n_attrs = len(data.domain.attributes)
if isinstance(self.k, float):
effective_k = np.round(self.k * n_attrs).astype(int) or 1
else:
effective_k = self.k
method = self.method
# select default method according to the provided data
if method is None:
autoMethod = True
discr_ratio = (sum(a.is_discrete
for a in data.domain.attributes)
/ len(data.domain.attributes))
if data.domain.has_discrete_class:
if discr_ratio >= 0.5:
method = GainRatio()
else:
method = ANOVA()
else:
method = UnivariateLinearRegression()
features = data.domain.attributes
try:
scores = method(data)
except ValueError:
scores = self.score_only_nice_features(data, method)
best = sorted(zip(scores, features), key=itemgetter(0),
reverse=self.decreasing)
if self.k:
best = best[:effective_k]
if self.threshold:
pred = ((lambda x: x[0] >= self.threshold) if self.decreasing else
(lambda x: x[0] <= self.threshold))
best = takewhile(pred, best)
domain = Orange.data.Domain([f for s, f in best],
data.domain.class_vars, data.domain.metas)
return data.transform(domain)
def score_only_nice_features(self, data, method):
# dtype must be defined because array can be empty
mask = np.array([isinstance(a, method.feature_type)
for a in data.domain.attributes], dtype=bool)
features = [f for f in data.domain.attributes
if isinstance(f, method.feature_type)]
scores = [method(data, f) for f in features]
bad = float('-inf') if self.decreasing else float('inf')
all_scores = np.array([bad] * len(data.domain.attributes))
all_scores[mask] = scores
return all_scores
class SelectRandomFeatures(Reprable):
"""
A feature selector that selects random `k` features from an input
dataset and returns a dataset with selected features. Parameter
`k` is either an integer (number of feature) or float (from 0.0 to
1.0, proportion of retained features).
Parameters
----------
k : int or float (default = 0.1)
The number or proportion of features to retain.
"""
def __init__(self, k=0.1):
self.k = k
def __call__(self, data):
if isinstance(self.k, float):
effective_k = int(len(data.domain.attributes) * self.k)
else:
effective_k = self.k
domain = Orange.data.Domain(
random.sample(data.domain.attributes,
min(effective_k, len(data.domain.attributes))),
data.domain.class_vars, data.domain.metas)
return data.transform(domain)