Source code for arfs.feature_selection.variable_importance

"""Supervised Feature Selection

This module provides selectors using supervised statistics and a threshold, using SHAP, permutation importance or impurity (Gini) importance.

Module Structure:
-----------------
- ``VariableImportance`` main class for identifying non-important features
"""

from __future__ import print_function
from tqdm.auto import trange

# pandas
import pandas as pd

# numpy
import numpy as np

# matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# sklearn
from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin

# ARFS
from ..utils import reset_plot
from ..gbm import GradientBoosting
from ..preprocessing import OrdinalEncoderPandas


[docs]class VariableImportance(SelectorMixin, BaseEstimator): """Feature selector that removes predictors with zero or low variable importance. Identify the features with zero/low importance according to SHAP values of a lightgbm. The gbm can be trained with early stopping using a utils set to prevent overfitting. The feature importances are averaged over `n_iterations` to reduce the variance. The predictors are then ranked from the most important to the least important and the cumulative variable importance is computed. All the predictors not contributing (VI=0) or contributing to less than the threshold to the cumulative importance are removed. Parameters ---------- task : string The machine learning task, either 'classification' or 'regression' or 'multiclass', be sure to use a consistent objective function encode : boolean, default = True Whether or not to encode the predictors n_iterations : int, default = 10 Number of iterations, the more iterations, the smaller the variance threshold : float, default = .99 The selector computes the cumulative feature importance and ranks the predictors from the most important to the least important. All the predictors contributing to less than this value are rejected. lgb_kwargs : dictionary of keyword arguments dictionary of lightgbm estimators parameters with at least the objective function {'objective':'rmse'} encoder_kwargs : dictionary of keyword arguments, optional dictionary of the :class:`OrdinalEncoderPandas` parameters Returns ------- selected_features: list of str List of selected features. Attributes ---------- n_features_in_ : int number of input predictors assoc_matrix_ : pd.DataFrame the square association matrix collinearity_summary_ : pd.DataFrame the pairs of collinear features and the association values support_ : list of bool the list of the selected X-columns selected_features_ : list of str the list of names of selected features not_selected_features_ : list of str the list of names of rejected features fastshap : boolean enable or not the fasttreeshap implementation verbose : int, default = -1 controls the progress bar, > 1 print out progress Example ------- >>> from sklearn.datasets import make_classification, make_regression >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False) # , n_redundant = 5 >>> X = pd.DataFrame(X) >>> y = pd.Series(y) >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])] >>> X.columns = pred_name >>> selector = VariableImportance(threshold=0.75) >>> selector.fit_transform(X, y) """ def __init__( self, task="regression", encode=True, n_iterations=10, threshold=0.99, lgb_kwargs={"objective": "rmse", "zero_as_missing": False}, encoder_kwargs=None, fastshap=False, verbose=-1, ): self.task = task self.encode = encode self.n_iterations = n_iterations self.threshold = threshold self.lgb_kwargs = lgb_kwargs self.encoder_kwargs = encoder_kwargs self.verbose = verbose self.fastshap = fastshap if (self.threshold > 1.0) or (self.threshold < 0.0): raise ValueError("``threshold`` should be larger than 0 and smaller than 1")
[docs] def fit(self, X, y, sample_weight=None): """Learn variable importance from X and y, supervised learning. Parameters ---------- X : pd.DataFrame, shape (n_samples, n_features) Data from which to compute variances, where `n_samples` is the number of samples and `n_features` is the number of features. y : any, default=None Ignored. This parameter exists only for compatibility with sklearn.pipeline.Pipeline. sample_weight : pd.Series, optional, shape (n_samples,) weights for computing the statistics (e.g. weighted average) Returns ------- self : object Returns the instance itself. """ if isinstance(X, pd.DataFrame): self.feature_names_in_ = X.columns.to_numpy() else: raise TypeError("X is not a dataframe") feature_importances = _compute_varimp_lgb( X=X, y=y, sample_weight=sample_weight, encode=self.encode, task=self.task, n_iterations=self.n_iterations, verbose=self.verbose, encoder_kwargs=self.encoder_kwargs, lgb_kwargs=self.lgb_kwargs, fastshap=self.fastshap, ) self.feature_importances_summary_ = feature_importances support_ordered = ( self.feature_importances_summary_["cumulative_importance"] >= self.threshold ) to_drop = list( self.feature_importances_summary_.loc[support_ordered, "feature"] ) self.support_ = np.asarray( [False if c in to_drop else True for c in self.feature_names_in_] ) self.selected_features_ = self.feature_names_in_[self.support_] self.not_selected_features_ = self.feature_names_in_[~self.support_] return self
def _get_support_mask(self): check_is_fitted(self) return self.support_
[docs] def transform(self, X): """ Transform the data, returns a transformed version of `X`. Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples. Returns ------- X : ndarray array of shape (n_samples, n_features_new) Transformed array. Raises ------ TypeError if the input is not a pd.DataFrame """ if not isinstance(X, pd.DataFrame): raise TypeError("X is not a dataframe") return X[self.selected_features_]
[docs] def fit_transform(self, X, y=None, sample_weight=None): """ Fit to data, then transform it. Fits transformer to `X` and `y` with optional parameters `fit_params` and returns a transformed version of `X`. Parameters ---------- X : array-like of shape (n_samples, n_features) Input samples. y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ default=None Target values (None for unsupervised transformations). **fit_params : dict Additional fit parameters. Returns ------- X_new : ndarray array of shape (n_samples, n_features_new) Transformed array. """ return self.fit(X=X, y=y, sample_weight=sample_weight).transform(X)
def _more_tags(self): return {"allow_nan": True}
[docs] def plot_importance( self, figsize=None, plot_n=50, n_feat_per_inch=3, log=True, style=None ): """Plots `plot_n` most important features and the cumulative importance of features. If `threshold` is provided, prints the number of features needed to reach `threshold` cumulative importance. Parameters ---------- plot_n : int, default = 50 Number of most important features to plot. Defaults to 15 or the maximum number of features whichever is smaller n_feat_per_inch : int number of features per inch, the larger the less space between labels figsize : tuple of float, optional The rendered size as a percentage size log : bool, default=True Whether or not render variable importance on a log scale style : bool, default=False set arfs style or not Returns ------- hv.plot the feature importances holoviews object """ if style: plt.style.use(style) else: reset_plot() if plot_n > self.feature_importances_summary_.shape[0]: plot_n = self.feature_importances_summary_.shape[0] - 1 df = self.feature_importances_summary_ importance_index = np.min( np.where(df["cumulative_importance"] > self.threshold) ) non_cum_threshold = df.iloc[importance_index, 2] max_norm_importance = 0.99 * df.normalized_importance.max() if plot_n > df.shape[0]: plot_n = df.shape[0] - 1 if figsize is None: figsize = (8, plot_n / n_feat_per_inch) fig = plt.figure(tight_layout=True, figsize=figsize) gs = gridspec.GridSpec(3, 3) ax1 = fig.add_subplot(gs[:, 0]) ax1.scatter(df.normalized_importance, df.feature) # ax.set_ylabel('YLabel0') ax1.set_xlabel("normalized importance") ax1.xaxis.set_label_position("top") ax1.invert_yaxis() ax1.axvline(x=non_cum_threshold, linestyle="dashed", color="r") if log: ax1.set_xscale("log") ax1.grid() ax1.set(frame_on=False) ax2 = fig.add_subplot(gs[:, 1:]) ax2.scatter(df.feature, df.cumulative_importance) # ax.set_ylabel('YLabel0') ax2.set_ylabel("cumulative importance") ax2.tick_params(axis="x", labelrotation=90) importance_min_value_on_axis = max_norm_importance if log else 0 x_vert, y_vert = [importance_index, importance_index], [ importance_min_value_on_axis, self.threshold, ] x_horiz, y_horiz = [importance_min_value_on_axis, importance_index], [ self.threshold, self.threshold, ] ax2.plot(x_vert, y_vert, linestyle="dashed", color="r") ax2.plot(x_horiz, y_horiz, linestyle="dashed", color="r") ax2.set_ylim(max_norm_importance, 1.0) if log: ax2.set_xscale("log") ax2.grid() ax2.set(frame_on=False) fig.align_labels() return fig
def _compute_varimp_lgb( X, y, sample_weight=None, encode=False, task="regression", n_iterations=10, verbose=-1, fastshap=True, encoder_kwargs=None, lgb_kwargs={"objective": "rmse", "zero_as_missing": False}, ): if task not in ["regression", "classification", "multiclass"]: raise ValueError('Task must be either "classification" or "regression"') if y is None: raise ValueError("No training labels provided.") if encode: encoder = ( OrdinalEncoderPandas(**encoder_kwargs) if encoder_kwargs is not None else OrdinalEncoderPandas() ) X = encoder.fit(X).transform(X) del encoder # Extract feature names feature_names = list(X.columns) # Empty array for feature importances feature_importance_values = np.zeros(len(feature_names)) progress_bar = trange(n_iterations) if verbose > 1 else range(n_iterations) # Iterate through each fold for _ in progress_bar: if verbose > 1: progress_bar.set_description("Iteration nb: {0:<3}".format(_)) # lgb_kwargs['verbose'] = -1 gbm_model = GradientBoosting( cat_feat="auto", stratified=False, params=lgb_kwargs, show_learning_curve=False, return_valid_features=True, verbose_eval=0, ) gbm_model.fit(X=X, y=y, sample_weight=sample_weight) # pimp cool but too slow # perm_imp = permutation_importance( # model, valid_features, valid_labels, n_repeats=10, random_state=42, n_jobs=-1 # ) # perm_imp = perm_imp.importances_mean if fastshap: try: from fasttreeshap import TreeExplainer as FastTreeExplainer except ImportError: ImportError("fasttreeshap is not installed") explainer = FastTreeExplainer( gbm_model.model, algorithm="auto", shortcut=False, feature_perturbation="tree_path_dependent", ) shap_matrix = explainer.shap_values(gbm_model.valid_features) if isinstance(shap_matrix, list): # For LightGBM classifier, RF, in sklearn API, SHAP returns a list of arrays # https://github.com/slundberg/shap/issues/526 shap_imp = np.mean([np.abs(sv).mean(0) for sv in shap_matrix], axis=0) else: shap_imp = np.abs(shap_matrix).mean(0) else: shap_matrix = gbm_model.model.predict( gbm_model.valid_features, pred_contrib=True ) # the dim changed in lightGBM >= 3.0.0 if task == "multiclass": # X_SHAP_values (array-like of shape = [n_samples, n_features + 1] # or shape = [n_samples, (n_features + 1) * n_classes]) # index starts from 0 n_features_plus_bias = gbm_model.valid_features.shape[1] + 1 n_samples = gbm_model.valid_features.shape[0] y_freq_table = pd.Series(y.fillna(0)).value_counts(normalize=True) n_classes = y_freq_table.size # Reshape the array to [n_samples, n_features + 1, n_classes] reshaped_values = shap_matrix.reshape(n_samples, n_classes, n_features_plus_bias) # Since we need (n_samples, n_features + 1, n_classes), transpose the second and third dimensions reshaped_values = reshaped_values.transpose(0, 2, 1) reshaped_values = reshaped_values[:, :-1, :] reshaped_values.shape # Sum the contributions for each class ignoring the bias term # average on all the samples shap_imp = np.abs(reshaped_values).sum(axis=-1).mean(axis=0) else: # for binary, only one class is returned, for regression a single column added as well shap_imp = np.mean(np.abs(shap_matrix[:, :-1]), axis=0) # Record the feature importances feature_importance_values += ( shap_imp / n_iterations ) # model.feature_importances_ / n_iterations feature_importances = pd.DataFrame( {"feature": feature_names, "importance": feature_importance_values} ) # Sort features according to importance feature_importances = feature_importances.sort_values( "importance", ascending=False ).reset_index(drop=True) # Normalize the feature importances to add up to one feature_importances["normalized_importance"] = ( feature_importances["importance"] / feature_importances["importance"].sum() ) feature_importances["cumulative_importance"] = np.cumsum( feature_importances["normalized_importance"] ) # Extract the features with zero importance # record_zero_importance = feature_importances[ # feature_importances["importance"] == 0.0 # ] return feature_importances