Source code for stree.Strees

"""
Oblique decision tree classifier based on SVM nodes
"""

import numbers
import random
from typing import Optional
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import (
    check_X_y,
    check_array,
    check_is_fitted,
    _check_sample_weight,
)
from .Splitter import Splitter, Snode, Siterator
from ._version import __version__


class Stree(BaseEstimator, ClassifierMixin):
    """Estimator based on a binary tree of SVM nodes.

    It can deal with sample_weight in predict, so it can be used in boosting
    sklearn methods. Inheriting from BaseEstimator provides the get_params
    and set_params methods; inheriting from ClassifierMixin provides the
    _estimator_type attribute with "classifier" as its value.

    Parameters
    ----------
    C : float, optional
        Regularization parameter. The strength of the regularization is
        inversely proportional to C. Must be strictly positive, by default
        1.0
    kernel : str, optional
        Specifies the kernel type to be used in the algorithm. It must be
        one of 'liblinear', 'linear', 'poly', 'rbf' or 'sigmoid'. liblinear
        uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/)
        library and the rest use the
        [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through
        scikit-learn, by default "linear"
    max_iter : int, optional
        Hard limit on iterations within solver, or -1 for no limit, by
        default 1e5
    random_state : int, optional
        Controls the pseudo random number generation for shuffling the data
        for probability estimates. Ignored when probability is False. Pass
        an int for reproducible output across multiple function calls, by
        default None
    max_depth : int, optional
        Specifies the maximum depth of the tree, by default None
    tol : float, optional
        Tolerance for stopping, by default 1e-4
    degree : int, optional
        Degree of the polynomial kernel function ('poly'). Ignored by all
        other kernels, by default 3
    gamma : str, optional
        Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. If gamma='scale'
        (default) is passed then it uses 1 / (n_features * X.var()) as the
        value of gamma; if 'auto', it uses 1 / n_features, by default
        "scale"
    split_criteria : str, optional
        Decides (only in a multiclass classification) which column (class)
        to use to split the dataset in a node. max_samples is incompatible
        with the 'ovo' multiclass_strategy, by default "impurity"
    criterion : str, optional
        The function to measure the quality of a split (only used if
        max_features != num_features). Supported criteria are "gini" for the
        Gini impurity and "entropy" for the information gain, by default
        "entropy"
    min_samples_split : int, optional
        The minimum number of samples required to split an internal node.
        0 (default) for any, by default 0
    max_features : optional
        The number of features to consider when looking for the split:
        If int, then consider max_features features at each split.
        If float, then max_features is a fraction and
        int(max_features * n_features) features are considered at each
        split.
        If "auto", then max_features=sqrt(n_features).
        If "sqrt", then max_features=sqrt(n_features).
        If "log2", then max_features=log2(n_features).
        If None, then max_features=n_features, by default None
    splitter : str, optional
        The strategy used to choose the feature set at each node (only used
        if max_features < num_features). Supported strategies are:
        "best": sklearn SelectKBest algorithm is used in every node to
        choose the max_features best features.
        "random": The algorithm generates 5 candidates and chooses the best
        (max. info. gain) of them.
        "trandom": The algorithm generates only one random combination.
        "mutual": Chooses the best features w.r.t. their mutual info with
        the label.
        "cfs": Apply Correlation-based Feature Selection.
        "fcbf": Apply Fast Correlation-Based Filter, by default "random"
    multiclass_strategy : str, optional
        Strategy to use with multiclass datasets: "ovo" one versus one,
        "ovr" one versus rest, by default "ovo"
    normalize : bool, optional
        If standardization of features should be applied on each node with
        the samples that reach it, by default False

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The classes labels.
    n_classes_ : int
        The number of classes
    n_iter_ : int
        Max number of iterations in classifier
    depth_ : int
        Max depth of the tree
    n_features_ : int
        The number of features when ``fit`` is performed.
    n_features_in_ : int
        Number of features seen during :term:`fit`.
    max_features_ : int
        Number of features to use in hyperplane computation
    tree_ : Node
        root of the tree
    X_ : ndarray
        points to the input dataset
    y_ : ndarray
        points to the input labels

    References
    ----------
    R. Montañana, J. A. Gámez, J. M. Puerta, "STree: a single multi-class
    oblique decision tree based on support vector machines.", 2021 LNAI
    12882
    """

    def __init__(
        self,
        C: float = 1.0,
        kernel: str = "linear",
        max_iter: int = int(1e5),
        random_state: int = None,
        max_depth: int = None,
        tol: float = 1e-4,
        degree: int = 3,
        gamma="scale",
        split_criteria: str = "impurity",
        criterion: str = "entropy",
        min_samples_split: int = 0,
        max_features=None,
        splitter: str = "random",
        multiclass_strategy: str = "ovo",
        normalize: bool = False,
    ):
        self.max_iter = max_iter
        self.C = C
        self.kernel = kernel
        self.random_state = random_state
        self.max_depth = max_depth
        self.tol = tol
        self.gamma = gamma
        self.degree = degree
        self.min_samples_split = min_samples_split
        self.split_criteria = split_criteria
        self.max_features = max_features
        self.criterion = criterion
        self.splitter = splitter
        self.normalize = normalize
        self.multiclass_strategy = multiclass_strategy
    @staticmethod
    def version() -> str:
        """Return the version of the package."""
        return __version__
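    # A minimal usage sketch of the estimator (the load_wine dataset is
    # illustrative, not part of this module):
    #
    #   from sklearn.datasets import load_wine
    #   X, y = load_wine(return_X_y=True)
    #   clf = Stree(C=1.0, kernel="linear", random_state=0)
    #   clf.fit(X, y)
    #   print(clf.score(X, y))  # accuracy, from ClassifierMixin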
    def _more_tags(self) -> dict:
        """Required by sklearn to supply features of the classifier; makes
        the labels array mandatory

        :return: the tag required
        :rtype: dict
        """
        return {"requires_y": True}
    def fit(
        self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
    ) -> "Stree":
        """Build the tree based on the dataset of samples and its labels

        Returns
        -------
        Stree
            itself to be able to chain actions: fit().predict() ...

        Raises
        ------
        ValueError
            if C < 0
        ValueError
            if max_depth < 1
        ValueError
            if all samples have 0 or negative weights
        """
        # Check parameters are Ok.
        if self.C < 0:
            raise ValueError(
                f"Penalty term must be positive... got (C={self.C:f})"
            )
        self.__max_depth = (
            np.iinfo(np.int32).max
            if self.max_depth is None
            else self.max_depth
        )
        if self.__max_depth < 1:
            raise ValueError(
                "Maximum depth has to be greater than 0... got "
                f"(max_depth={self.max_depth})"
            )
        if self.multiclass_strategy not in ["ovr", "ovo"]:
            raise ValueError(
                "multiclass_strategy has to be either ovr or ovo"
                f" but got {self.multiclass_strategy}"
            )
        if self.multiclass_strategy == "ovo":
            if self.kernel == "liblinear":
                raise ValueError(
                    "The kernel liblinear is incompatible with ovo "
                    "multiclass_strategy"
                )
            if self.split_criteria == "max_samples":
                raise ValueError(
                    "The multiclass_strategy 'ovo' is incompatible with "
                    "split_criteria 'max_samples'"
                )
        kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
        if self.kernel not in kernels:
            raise ValueError(f"Kernel {self.kernel} not in {kernels}")
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        sample_weight = _check_sample_weight(
            sample_weight, X, dtype=np.float64
        )
        if not any(sample_weight):
            raise ValueError(
                "Invalid input - all samples have zero or negative weights."
            )
        # Initialize computed parameters
        self.splitter_ = Splitter(
            clf=self._build_clf(),
            criterion=self.criterion,
            feature_select=self.splitter,
            criteria=self.split_criteria,
            random_state=self.random_state,
            min_samples_split=self.min_samples_split,
            normalize=self.normalize,
        )
        if self.random_state is not None:
            random.seed(self.random_state)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = self.classes_.shape[0]
        self.n_iter_ = self.max_iter
        self.depth_ = 0
        self.n_features_ = X.shape[1]
        self.n_features_in_ = X.shape[1]
        self.max_features_ = self._initialize_max_features()
        self.tree_ = self._train(X, y, sample_weight, 1, "root")
        self.X_ = X
        self.y_ = y
        return self
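    # A minimal sketch of fitting with per-sample weights, as used when the
    # estimator is plugged into a boosting meta-estimator (the dataset and
    # weights below are illustrative):
    #
    #   import numpy as np
    #   X = np.array([[0.0], [1.0], [2.0], [3.0]])
    #   y = np.array([0, 0, 1, 1])
    #   weights = np.array([1.0, 1.0, 2.0, 0.0])  # zero-weight rows are
    #   Stree(max_depth=2).fit(X, y, sample_weight=weights)  # masked out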
    def _train(
        self,
        X: np.ndarray,
        y: np.ndarray,
        sample_weight: np.ndarray,
        depth: int,
        title: str,
    ) -> Optional[Snode]:
        """Recursive function to split the original dataset into predictor
        nodes (leaves)

        Parameters
        ----------
        X : np.ndarray
            samples dataset
        y : np.ndarray
            samples labels
        sample_weight : np.ndarray
            weight of samples. Rescale C per sample.
        depth : int
            actual depth in the tree
        title : str
            description of the node

        Returns
        -------
        Optional[Snode]
            binary tree
        """
        if depth > self.__max_depth:
            return None
        # Mask samples with 0 weight
        if any(sample_weight == 0):
            indices_zero = sample_weight == 0
            X = X[~indices_zero, :]
            y = y[~indices_zero]
            sample_weight = sample_weight[~indices_zero]
        self.depth_ = max(depth, self.depth_)
        scaler = StandardScaler()
        node = Snode(
            None, X, y, X.shape[1], 0.0, title, sample_weight, scaler
        )
        if np.unique(y).shape[0] == 1:
            # only 1 class => pure dataset
            node.set_title(title + ", <pure>")
            node.make_predictor(self.n_classes_)
            return node
        # Train the model
        clf = self._build_clf()
        Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
        if self.normalize:
            scaler.fit(Xs)
            Xs = scaler.transform(Xs)
        clf.fit(Xs, y, sample_weight=sample_weight)
        node.set_impurity(self.splitter_.partition_impurity(y))
        node.set_classifier(clf)
        node.set_features(features)
        self.splitter_.partition(X, node, True)
        X_U, X_D = self.splitter_.part(X)
        y_u, y_d = self.splitter_.part(y)
        sw_u, sw_d = self.splitter_.part(sample_weight)
        if X_U is None or X_D is None:
            # didn't part anything
            node.set_title(title + ", <cgaf>")
            node.make_predictor(self.n_classes_)
            return node
        node.set_up(
            self._train(
                X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})"
            )
        )
        node.set_down(
            self._train(
                X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
            )
        )
        return node
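    # Node titles produced by this recursion look like (illustrative):
    #
    #   "root", "root - Up(2)", "root - Down(2), <pure>", ...
    #
    # where ", <pure>" marks a single-class leaf and ", <cgaf>" marks a leaf
    # whose classifier could not partition the samples any further.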
    def _build_clf(self):
        """Build the right classifier for the node"""
        return (
            LinearSVC(
                max_iter=self.max_iter,
                random_state=self.random_state,
                C=self.C,
                tol=self.tol,
            )
            if self.kernel == "liblinear"
            else SVC(
                kernel=self.kernel,
                max_iter=self.max_iter,
                tol=self.tol,
                C=self.C,
                gamma=self.gamma,
                degree=self.degree,
                random_state=self.random_state,
                decision_function_shape=self.multiclass_strategy,
            )
        )
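    # A minimal sketch of the kernel-to-node-classifier mapping (_build_clf
    # is internal; it is called directly here only to illustrate the
    # dispatch):
    #
    #   Stree(kernel="liblinear")._build_clf()  # -> sklearn LinearSVC
    #   Stree(kernel="rbf")._build_clf()        # -> sklearn SVC(kernel="rbf")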
    def __predict_class(self, X: np.array) -> np.array:
        """Compute the predicted class for the samples in X. Returns the
        number of samples of each class in the corresponding leaf node.

        Parameters
        ----------
        X : np.array
            Array of samples

        Returns
        -------
        np.array
            Array of shape (n_samples, n_classes) with the number of samples
            of each class in the corresponding leaf node
        """

        def compute_prediction(xp, indices, node):
            if xp is None:
                return
            if node.is_leaf():
                # set a class for indices
                result[indices] = node._proba
                return
            self.splitter_.partition(xp, node, train=False)
            x_u, x_d = self.splitter_.part(xp)
            i_u, i_d = self.splitter_.part(indices)
            compute_prediction(x_u, i_u, node.get_up())
            compute_prediction(x_d, i_d, node.get_down())

        # setup prediction & make it happen
        result = np.zeros((X.shape[0], self.n_classes_))
        indices = np.arange(X.shape[0])
        compute_prediction(X, indices, self.tree_)
        return result
    def check_predict(self, X) -> np.array:
        """Checks predict and predict_proba preconditions. If the input X is
        not an np.array, convert it to one.

        Parameters
        ----------
        X : np.ndarray
            Array of samples

        Returns
        -------
        np.array
            Array of samples

        Raises
        ------
        ValueError
            If the number of features of X is different from the number of
            features in the training data
        """
        check_is_fitted(self, ["tree_"])
        # Input validation
        X = check_array(X)
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f"Expected {self.n_features_} features but got "
                f"({X.shape[1]})"
            )
        return X
    def predict_proba(self, X: np.array) -> np.array:
        """Predict class probabilities of the input samples X.

        The predicted class probability is the fraction of samples of the
        same class in a leaf.

        Parameters
        ----------
        X : dataset of samples.

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            The class probabilities of the input samples.

        Raises
        ------
        ValueError
            if dataset with inconsistent number of features
        NotFittedError
            if model is not fitted
        """
        X = self.check_predict(X)
        # return # of samples of each class in leaf node
        values = self.__predict_class(X)
        normalizer = values.sum(axis=1)[:, np.newaxis]
        normalizer[normalizer == 0.0] = 1.0
        return values / normalizer
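    # A minimal sketch of how leaf counts become probabilities (the counts
    # are illustrative): a leaf holding 3 samples of class 0 and 1 sample of
    # class 1 yields [0.75, 0.25] for every sample routed to it.
    #
    #   import numpy as np
    #   values = np.array([[3.0, 1.0]])               # per-class leaf counts
    #   proba = values / values.sum(axis=1)[:, None]  # -> [[0.75, 0.25]]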
    def predict(self, X: np.array) -> np.array:
        """Predict the label for each sample in the dataset passed

        Parameters
        ----------
        X : np.array
            dataset of samples

        Returns
        -------
        np.array
            array of labels

        Raises
        ------
        ValueError
            if dataset with inconsistent number of features
        NotFittedError
            if model is not fitted
        """
        X = self.check_predict(X)
        return self.classes_[np.argmax(self.__predict_class(X), axis=1)]
    def nodes_leaves(self) -> tuple:
        """Compute the number of nodes and leaves in the built tree

        Returns
        -------
        [tuple]
            tuple with the number of nodes and the number of leaves
        """
        nodes = 0
        leaves = 0
        for node in self:
            nodes += 1
            if node.is_leaf():
                leaves += 1
        return nodes, leaves
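    # Usage sketch (the tree shape depends on the data; the numbers are
    # illustrative):
    #
    #   nodes, leaves = clf.nodes_leaves()  # e.g. (7, 4) for a full
    #                                       # binary tree of depth 3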
    def __iter__(self) -> Siterator:
        """Create an iterator to visit the nodes of the tree in preorder;
        can be used to build a list with all the nodes in preorder

        Returns
        -------
        Siterator
            an iterator, supports for i in ... and list(...)
        """
        try:
            tree = self.tree_
        except AttributeError:
            tree = None
        return Siterator(tree)
    def graph(self, title="") -> str:
        """Graphviz code representing the tree

        Returns
        -------
        str
            graphviz code
        """
        output = (
            "digraph STree {\nlabel=<STree "
            f"{title}>\nfontsize=30\nfontcolor=blue\nlabelloc=t\n"
        )
        for node in self:
            output += node.graph()
        output += "}\n"
        return output
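    # A minimal rendering sketch (assumes the optional `graphviz` Python
    # package is installed; it is not a dependency of this module):
    #
    #   import graphviz
    #   dot = clf.graph("Wine dataset")
    #   graphviz.Source(dot).render("stree", format="png", cleanup=True)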
    def __str__(self) -> str:
        """String representation of the tree

        Returns
        -------
        str
            description of nodes in the tree in preorder
        """
        output = ""
        for i in self:
            output += str(i) + "\n"
        return output
    def _initialize_max_features(self) -> int:
        if isinstance(self.max_features, str):
            if self.max_features in ("auto", "sqrt"):
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    "Invalid value for max_features. "
                    "Allowed string values are 'auto', "
                    "'sqrt' or 'log2'."
                )
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, numbers.Integral):
            if self.max_features > self.n_features_:
                raise ValueError(
                    "Invalid value for max_features. "
                    "It can not be greater than number of features "
                    f"({self.n_features_})"
                )
            max_features = self.max_features
        else:  # float
            if 0.0 < self.max_features <= 1.0:
                max_features = max(
                    1, int(self.max_features * self.n_features_)
                )
            else:
                raise ValueError(
                    "Invalid value for max_features. "
                    "Allowed float must be in range (0, 1] "
                    f"got ({self.max_features})"
                )
        return max_features
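    # A minimal sketch of how max_features resolves (n_features_ = 16 is
    # illustrative):
    #
    #   Stree(max_features="sqrt")  # -> 4 features per split (sqrt(16))
    #   Stree(max_features=0.25)    # -> 4 features per split (int(0.25 * 16))
    #   Stree(max_features=None)    # -> all 16 features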