Source code for arfs.feature_selection.allrelevant

"""This module provides 3 different methods to perform 'all relevant feature selection'


Reference:
----------
NILSSON, Roland, PEÑA, José M., BJÖRKEGREN, Johan, et al.
Consistent feature selection for pattern recognition in polynomial time.
Journal of Machine Learning Research, 2007, vol. 8, no Mar, p. 589-612.

KURSA, Miron B., RUDNICKI, Witold R., et al.
Feature selection with the Boruta package.
J Stat Softw, 2010, vol. 36, no 11, p. 1-13.

https://github.com/chasedehan/BoostARoota

The module structure
--------------------
- The ``Leshy`` class, a heavy re-work of ``BorutaPy`` class
  itself a modified version of Boruta, the pull request I submitted and still pending:
  https://github.com/scikit-learn-contrib/boruta_py/pull/100

- The ``BoostAGroota`` class, a modified version of BoostARoota, PR still to be submitted
  https://github.com/chasedehan/BoostARoota

- The ``GrootCV`` class for a new method for all relevant feature selection using a lightgGBM model,
  cross-validated SHAP importances and shadowing.

Original BorutaPy version
-------------------------
Author: Daniel Homola <dani.homola@gmail.com>

Original code and method by: Miron B Kursa, https://m2.icm.edu.pl/boruta/
Modified by Thomas Bury, pull request:
https://github.com/scikit-learn-contrib/boruta_py/pull/100
Waiting for merging

https://github.com/scikit-learn-contrib/boruta_py/pull/100
is a new PR based on #77 making all the changes optional. Waiting for merge

Leshy is a re-work of the PR I submitted.

License: BSD 3 clause

"""

from __future__ import print_function, division
import operator
import warnings
import time
import shap
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy as sp

from typing import Tuple, Optional, Union, Iterable
from tqdm.auto import tqdm
from sklearn.utils import check_random_state, check_X_y
from sklearn.base import BaseEstimator, is_regressor, is_classifier, clone

from sklearn.utils.validation import check_is_fitted
from sklearn.feature_selection._base import SelectorMixin
from sklearn.model_selection import RepeatedKFold, train_test_split
from sklearn.inspection import permutation_importance
from sklearn.utils.validation import _check_sample_weight
from matplotlib.lines import Line2D
from lightgbm import early_stopping


from ..utils import (
    check_if_tree_based,
    is_lightgbm,
    is_catboost,
    create_dtype_dict,
    get_pandas_cat_codes,
    validate_sample_weight,
)

########################################################################################
#
# Main Classes and Methods
# Provide a fit, transform and fit_transform method
#
########################################################################################
# !/usr/bin/env python
# -*- coding: utf-8 -*-

NO_FEATURE_SELECTED_WARNINGS = "No feature selected - No data to plot"
ARFS_COLOR_LIST = [
    "#000000",
    "#7F3C8D",
    "#11A579",
    "#3969AC",
    "#F2B701",
    "#E73F74",
    "#80BA5A",
    "#E68310",
    "#008695",
    "#CF1C90",
    "#F97B72",
]
BLUE = "#2590fa"
YELLOW = "#f0be00"
RED = "#b51204"
BCKGRD_COLOR = "#f5f5f5"
PLT_PARAMS = {
    "axes.prop_cycle": plt.cycler(color=ARFS_COLOR_LIST),
    "axes.facecolor": BCKGRD_COLOR,
    "patch.edgecolor": BCKGRD_COLOR,
    "figure.facecolor": BCKGRD_COLOR,
    "axes.edgecolor": BCKGRD_COLOR,
    "savefig.edgecolor": BCKGRD_COLOR,
    "savefig.facecolor": BCKGRD_COLOR,
    "grid.color": "#d2d2d2",
    "lines.linewidth": 1.5,
}
PLOT_KWARGS = dict(
    kind="box",
    boxprops=dict(linestyle="-", linewidth=1.5, color="gray", facecolor="gray"),
    flierprops=dict(linestyle="-", linewidth=1.5, color="gray"),
    medianprops=dict(linestyle="-", linewidth=1.5, color="#000000"),
    whiskerprops=dict(linestyle="-", linewidth=1.5, color="gray"),
    capprops=dict(linestyle="-", linewidth=1.5, color="gray"),
    showfliers=False,
    grid=True,
    rot=0,
    vert=False,
    patch_artist=True,
    fontsize=9,
)


[docs]class Leshy(SelectorMixin, BaseEstimator):
    """This is an improved version of BorutaPy which itself is an
    improved Python implementation of the Boruta R package.
    Boruta is an all relevant feature selection method, while most other are
    minimal optimal; this means it tries to find all features carrying
    information usable for prediction, rather than finding a possibly compact
    subset of features on which some estimator has a minimal error.
    Why bother with all relevant feature selection?
    When you try to understand the phenomenon that made your data, you should
    care about all factors that contribute to it, not just the bluntest signs
    of it in context of your methodology (minimal optimal set of features
    by definition depends on your estimator choice).

    Parameters
    ----------
    estimator : object
        A supervised learning estimator, with a 'fit' method that returns the
        ``feature_importances_`` attribute. Important features must correspond to
        high absolute values in the ``feature_importances_``
    n_estimators : int or string, default = 1000
        If int sets the number of estimators in the chosen ensemble method.
        If 'auto' this is determined automatically based on the size of the
        dataset. The other parameters of the used estimators need to be set
        with initialisation.
    perc : int, default = 100
        Instead of the max we use the percentile defined by the user, to pick
        our threshold for comparison between shadow and real features. The max
        tend to be too stringent. This provides a finer control over this. The
        lower perc is the more false positives will be picked as relevant but
        also the less relevant features will be left out. The usual trade-off.
        The default is essentially the vanilla Boruta corresponding to the max.
    alpha : float, default = 0.05
        Level at which the corrected p-values will get rejected in both
        correction steps.
    importance : str, default = 'shap'
        The kind of variable importance used to compare and discriminate original
        vs shadow predictors. Note that the builtin tree importance (gini/impurity based
        importance) is biased towards numerical and large cardinality predictors, even
        if they are random. Shapley values and permutation imp. are robust w.r.t those predictors.
        Possible values: 'shap' (Shapley values), 'fastshap' (FastTreeShap implementation),
        'pimp' (permutation importance) and 'native' (Gini/impurity)
    two_step : Boolean, default = True
        If you want to use the original implementation of Boruta with Bonferroni
        correction only set this to False.
    max_iter : int, default = 100
        The number of maximum iterations to perform.
    random_state : int, RandomState instance or None; default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    verbose : int, default=0
        Controls verbosity of output. 0: no output, 1: displays iteration number,
        2: which features have been selected already


    Attributes
    ----------
    n_features_ : int
        The number of selected features.
    support_ : array of shape [n_features]
        The mask of selected features - only confirmed ones are True.
    support_weak_ : array of shape [n_features]
        The mask of selected tentative features, which haven't gained enough
        support during the max_iter number of iterations.
    selected_features_ : list of str
        the list of columns to keep
    ranking_ : array of shape [n_features]
        The feature ranking, such that ``ranking_[i]`` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank one and tentative features are assigned
        rank 2.
    ranking_absolutes_ : array of shape [n_features]
        The absolute feature ranking as ordered by selection process. It does not guarantee
        that this order is correct for all models. For a model agnostic ranking, see the
        the attribute ``ranking``
    cat_name : list of str
        the name of the categorical columns
    cat_idx : list of int
        the index of the categorical columns
    imp_real_hist : array
        array of the historical feature importance of the real predictors
    sha_max : float
        the maximum feature importance of the shadow predictors
    col_names : list of str
        the names of the real predictors


    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from boruta import BorutaPy
    >>>
    >>> # load X and y
    >>> # NOTE BorutaPy accepts numpy arrays only, hence the .values attribute
    >>> X = pd.read_csv('examples/test_X.csv', index_col=0).values
    >>> y = pd.read_csv('examples/test_y.csv', header=None, index_col=0).values
    >>> y = y.ravel()
    >>>
    >>> # define random forest classifier, with utilising all cores and
    >>> # sampling in proportion to y labels
    >>> rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    >>>
    >>> # define Boruta feature selection method
    >>> feat_selector = Leshy(rf, n_estimators='auto', verbose=2, random_state=1)
    >>>
    >>> # find all relevant features - 5 features should be selected
    >>> feat_selector.fit(X, y)
    >>>
    >>> # check selected features - first 5 features are selected
    >>> feat_selector.selected_features_
    >>>
    >>> # check ranking of features
    >>> feat_selector.ranking_
    >>>
    >>> # call transform() on X to filter it down to selected features
    >>> X_filtered = feat_selector.transform(X)

    References
    ----------
    See the original paper [1]_ for more details.

    ..[1] Kursa M., Rudnicki W., "Feature Selection with the Boruta Package"
        Journal of Statistical Software, Vol. 36, Issue 11, Sep 2010

    """

    def __init__(
        self,
        estimator,
        n_estimators=1000,
        perc=90,
        alpha=0.05,
        importance="shap",
        two_step=True,
        max_iter=100,
        random_state=None,
        verbose=0,
        keep_weak=False,
    ):
        
        if max_iter < 2:
            raise ValueError("max_iter must be > 1 for Leshy to work properly")
        
        self.estimator = estimator
        self.n_estimators = n_estimators
        self.perc = perc
        self.alpha = alpha
        self.two_step = two_step
        self.max_iter = max_iter
        self.random_state = random_state
        self.random_state_instance = None
        self.verbose = verbose
        self.keep_weak = keep_weak
        self.importance = importance
        self.cat_name = None
        self.cat_idx = None
        # Catboost doesn't allow to change random seed after fitting
        self.is_cat = is_catboost(estimator)
        self.is_lgb = is_lightgbm(estimator)
        # plotting
        self.imp_real_hist = None
        self.sha_max = None
        self.n_features_ = 0
        self.support_ = None
        self.support_weak_ = None

[docs]    def fit(self, X, y, sample_weight=None):
        """Fits the Boruta feature selection with the provided estimator.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples]
            The target values.
        sample_weight : array-like, shape = [n_samples], default=None
            Individual weights for each sample

        Returns
        -------
        self : object
            Nothing but attributes

        """
        if self.importance == "fastshap":
            try:
                from fasttreeshap import TreeExplainer as FastTreeExplainer
            except ImportError:
                warnings.warn("fasttreeshap is not installed. Fallback to shap.")
                self.importance = "shap"

        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_numpy()
        else:
            raise TypeError("X is not a dataframe")

        self.imp_real_hist = np.empty((0, X.shape[1]), float)
        self._fit(X, y, sample_weight=sample_weight)
        self.selected_features_ = self.feature_names_in_[self.support_]
        self.not_selected_features_ = self.feature_names_in_[~self.support_]

        return self

[docs]    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe")
        return X[self.selected_features_]

[docs]    @mpl.rc_context(PLT_PARAMS)
    def plot_importance(self, n_feat_per_inch=5):
        """Boxplot of the variable importance, ordered by magnitude
        The max shadow variable importance illustrated by the dashed line.
        Requires to apply the fit method first.

        Parameters
        ----------
        n_feat_per_inch : int, default=5
            number of features to plot per inch (for scaling the figure)

        Returns
        -------
        fig : plt.figure
            the matplotlib figure object containing the boxplot
        """

        if self.imp_real_hist is None:
            raise ValueError("Use the fit method first to compute the var.imp")

        vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.feature_names_in_)
        vimp_df = vimp_df.reindex(
            vimp_df.mean().sort_values(ascending=True).index, axis=1
        )

        if vimp_df.dropna().empty:
            warnings.warn(NO_FEATURE_SELECTED_WARNINGS)
            return None
        else:
            fig, ax = plt.subplots(figsize=(16, vimp_df.shape[1] / n_feat_per_inch))
            bp = vimp_df.plot(**PLOT_KWARGS, ax=ax)

            n_strong = sum(self.support_)
            n_weak = np.sum(self.support_weak_)
            n_discarded = np.sum(~(self.support_ | self.support_weak_))
            box_face_col = (
                [BLUE] * n_strong + [YELLOW] * n_weak + ["gray"] * n_discarded
            )
            for c in range(len(box_face_col)):
                bp.findobj(mpl.patches.Patch)[len(self.support_) - c - 1].set_facecolor(
                    box_face_col[c]
                )
                bp.findobj(mpl.patches.Patch)[len(self.support_) - c - 1].set_color(
                    box_face_col[c]
                )

            xrange = vimp_df.max(skipna=True).max(skipna=True) - vimp_df.min(
                skipna=True
            ).min(skipna=True)
            bp.set_xlim(left=vimp_df.min(skipna=True).min(skipna=True) - 0.10 * xrange)

            custom_lines = [
                Line2D([0], [0], color=BLUE, lw=5),
                Line2D([0], [0], color=YELLOW, lw=5),
                Line2D([0], [0], color="gray", lw=5),
                Line2D([0], [0], linestyle="--", color=RED, lw=2),
            ]
            bp.legend(
                custom_lines,
                ["confirmed", "tentative", "rejected", "sha. max"],
                loc="lower right",
            )
            ax.axvline(x=self.sha_max, linestyle="--", color=RED)
            ax.set_title("Leshy importance and selected predictors")
            return fig

    def _get_support_mask(self):
        check_is_fitted(self)

        return self.support_

    def _more_tags(self):
        return {"allow_nan": True, "requires_y": True}

[docs]    def _fit(self, X_raw, y, sample_weight=None):
        """Private method. See the methods overview in the documentation
        for explanation of the process

        Parameters
        ----------
        X_raw : array-like, shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples]
            The target values.
        sample_weight : array-like, shape = [n_samples], default=None
            Individual weights for each sample

        Returns
        -------
        self : object
            Nothing but attributes
        """

        start_time = time.time()
        # the basic cat features encoding
        # is performed when getting importances
        # because the columns are dynamically created/rejected
        X = X_raw

        # only sklearn requires to fillna data
        # modern GBM implementations can handle this
        # X = X.fillna(0)
        y = pd.Series(y).fillna(0) if not isinstance(y, pd.Series) else y.fillna(0)

        # check input params
        self._check_params(X, y)
        sample_weight = validate_sample_weight(sample_weight)
        self.random_state = check_random_state(self.random_state)

        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0  - default state = tentative in original code
        # 1  - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != "auto":
            self.estimator.set_params(n_estimators=self.n_estimators)

        dec_reg, sha_max_history, imp_history, imp_sha_max = self.select_features(
            X=X, y=y, sample_weight=sample_weight
        )
        confirmed, tentative = _get_confirmed_and_tentative(dec_reg)
        tentative = _select_tentative(tentative, imp_history, sha_max_history)
        self._calculate_support(confirmed, tentative, n_feat)

        # for plotting
        self.imp_real_hist = imp_history
        self.sha_max = imp_sha_max

        # absolute and relative ranking
        self._calculate_absolute_ranking()
        self._calculate_relative_ranking(
            n_feat=n_feat,
            tentative=tentative,
            confirmed=confirmed,
            imp_history=imp_history,
        )
        self._print_result(dec_reg, _iter, start_time)
        return self

[docs]    def _get_tree_num(self, n_feat):
        """private method, get a good estimated for the number of trees
           given the number of features

        Parameters
        ----------
        n_feat : int
            The number of features

        Returns
        -------
        n_estimators : int
            the number of trees
        """
        depth = (
            self.estimator.get_params()["max_depth"]
            if not self.is_cat
            else self.estimator.get_param("max_depth")
        )
        if depth is None:
            depth = 10
        # how many times a feature should be considered on average
        f_repr = 100
        # n_feat * 2 because the training matrix is extended with n shadow features
        multi = (n_feat * 2) / (np.sqrt(n_feat * 2) * depth)
        n_estimators = int(multi * f_repr)
        return n_estimators

[docs]    def _add_shadows_get_imps(self, X, y, sample_weight, dec_reg):
        """Add a shuffled copy of the columns (shadows) and get the feature
        importance of the augmented data set

        Parameters
        ----------
        X: pd.DataFrame of shape [n_samples, n_features]
            predictor matrix
        y: pd.series of shape [n_samples]
            target
        sample_weight: array-like, shape = [n_samples], default=None
            Individual weights for each sample
        dec_reg: array
            holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected)
        Returns
        -------
         imp_real: array
            feature importance of the real predictors
         imp_sha: array
            feature importance of the shadow predictors
        """
        # find features that are tentative still
        x_cur_ind = np.where(dec_reg >= 0)[0]
        x_cur = X.iloc[:, x_cur_ind].copy()
        x_cur_w = x_cur.shape[1]
        # deep copy the matrix for the shadow matrix
        x_sha = x_cur.copy()
        # make sure there's at least 5 columns in the shadow matrix for
        while x_sha.shape[1] < 5:
            x_sha = pd.concat([x_sha, x_sha], axis=1)
        # shuffle xSha
        x_sha = x_sha.apply(self.random_state.permutation, axis=0)
        x_sha.columns = [f"Shadow_{i}" for i in range(x_sha.shape[1])]
        # get importance of the merged matrix
        if self.importance == "shap":
            imp = _get_shap_imp(
                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
            )
        elif self.importance == "fastshap":
            imp = _get_shap_imp_fast(
                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
            )
        elif self.importance == "pimp":
            imp = _get_perm_imp(
                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
            )
        else:
            imp = _get_imp(
                self.estimator, pd.concat([x_cur, x_sha], axis=1), y, sample_weight
            )

        # separate importances of real and shadow features
        imp_sha = imp[x_cur_w:]
        imp_real = np.zeros(X.shape[1])
        imp_real[:] = np.nan
        imp_real[x_cur_ind] = imp[:x_cur_w]

        return imp_real, imp_sha

[docs]    @staticmethod
    def _assign_hits(hit_reg, cur_imp, imp_sha_max):
        """count how many times a given feature was more important than
        the best of the shadow features

        Parameters
        ----------
        hit_reg: array
            count how many times a given feature was more important than the
            best of the shadow features
        cur_imp: array
            current importance
        imp_sha_max: array
            importance of the best shadow predictor
        Returns
        -------
        hit_reg : array
            the how many times a given feature was more important than the
            best of the shadow features
        """
        # register hits for features that did better than the best of shadows
        cur_imp_no_nan = cur_imp[0]
        cur_imp_no_nan[np.isnan(cur_imp_no_nan)] = 0
        hits = np.where(cur_imp_no_nan > imp_sha_max)[0]
        hit_reg[hits] += 1
        return hit_reg

[docs]    def _do_tests(self, dec_reg, hit_reg, _iter):
        """Private method, Perform the rest if the feature should be tagget as relevant (confirmed), not relevant (rejected)
        or undecided. The test is performed by considering the binomial tentatives over several attempts.
        I.e. count how many times a given feature was more important than the best of the shadow features
        and test if the associated probability to the z-score is below, between or above the rejection or
        acceptance threshold.

        Parameters
        ----------
        dec_reg : array
            holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected)
        hit_reg : array
            counts how many times a given feature was more important than the best of the shadow features
        _iter : int
            iteration number
        Returns
        -------
        dec_reg : array
            holds the decision about each feature 1, 0, -1 (accepted, undecided, rejected)

        """
        active_features = np.where(dec_reg >= 0)[0]
        hits = hit_reg[active_features]
        # get uncorrected p values based on hit_reg
        to_accept_ps = sp.stats.binom.sf(hits - 1, _iter, 0.5).flatten()
        to_reject_ps = sp.stats.binom.cdf(hits, _iter, 0.5).flatten()

        if self.two_step:
            # two step multicor process
            # first we correct for testing several features in each round using FDR
            to_accept = self._fdrcorrection(to_accept_ps, alpha=self.alpha)[0]
            to_reject = self._fdrcorrection(to_reject_ps, alpha=self.alpha)[0]

            # second we correct for testing the same feature over and over again
            # using bonferroni
            to_accept2 = to_accept_ps <= self.alpha / float(_iter)
            to_reject2 = to_reject_ps <= self.alpha / float(_iter)

            # combine the two multi corrections, and get indexes
            to_accept *= to_accept2
            to_reject *= to_reject2
        else:
            # as in th original Boruta, we simply do bonferroni correction
            # with the total n_feat in each iteration
            to_accept = to_accept_ps <= self.alpha / float(len(dec_reg))
            to_reject = to_reject_ps <= self.alpha / float(len(dec_reg))

        # find features which are 0 and have been rejected or accepted
        to_accept = np.where((dec_reg[active_features] == 0) * to_accept)[0]
        to_reject = np.where((dec_reg[active_features] == 0) * to_reject)[0]

        # updating dec_reg
        dec_reg[active_features[to_accept]] = 1
        dec_reg[active_features[to_reject]] = -1
        return dec_reg

[docs]    @staticmethod
    def _fdrcorrection(pvals, alpha=0.05):
        """Benjamini/Hochberg p-value correction for false discovery rate, from
        statsmodels package. Included here for decoupling dependency on statsmodels.

        Parameters
        ----------
        pvals : array_like
            set of p-values of the individual tests.
        alpha : float
            error rate
        Returns
        -------
        rejected : array, bool
            True if a hypothesis is rejected, False if not
        pvalue-corrected : array
            pvalues adjusted for multiple hypothesis testing to limit FDR
        """
        pvals = np.asarray(pvals)
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
        nobs = len(pvals_sorted)
        ecdffactor = np.arange(1, nobs + 1) / float(nobs)

        reject = pvals_sorted <= ecdffactor * alpha
        if reject.any():
            rejectmax = max(np.nonzero(reject)[0])
            reject[:rejectmax] = True

        pvals_corrected_raw = pvals_sorted / ecdffactor
        pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
        pvals_corrected[pvals_corrected > 1] = 1
        # reorder p-values and rejection mask to original order of pvals
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_

[docs]    @staticmethod
    def _nanrankdata(X, axis=1):
        """Replaces bottleneck's nanrankdata with scipy and numpy alternative.

        Parameters
        ----------
        X : array or pd.DataFrame
            the data array
        axis : int, optional
            row-wise (0) or column-wise (1), by default 1

        Returns
        -------
        ranks : array
            the ranked array
        """
        ranks = sp.stats.mstats.rankdata(X, axis=axis)
        ranks[np.isnan(X)] = np.nan
        return ranks

[docs]    def _check_params(self, X, y):
        """Private method, Check hyperparameters as well as X and y before proceeding with fit.

        Parameters
        ----------
        X : pd.DataFrame
            predictor matrix
        y : pd.series
            target series

        Raises
        ------
        ValueError
            [description]
        ValueError
            [description]
        """
        # check X and y are consistent len, X is Array and y is column
        X, y = check_X_y(X, y, dtype=None, ensure_all_finite=False)
        if self.perc <= 0 or self.perc > 100:
            raise ValueError("The percentile should be between 0 and 100.")

        if self.alpha <= 0 or self.alpha > 1:
            raise ValueError("Alpha should be between 0 and 1.")

[docs]    def _print_results(self, dec_reg, _iter, flag):
        """Private method, printing the result

        Parameters
        ----------
        dec_reg: array
            if the feature as been tagged as relevant (confirmed),
            not relevant (rejected) or undecided
        _iter: int
            the iteration number
        flag: int
            is still in the feature selection process or not
        Returns
        -------
         output: str
            the output to be printed out
        """
        n_iter = str(_iter) + " / " + str(self.max_iter)
        n_confirmed = np.where(dec_reg == 1)[0].shape[0]
        n_rejected = np.where(dec_reg == -1)[0].shape[0]
        cols = ["Iteration: ", "Confirmed: ", "Tentative: ", "Rejected: "]

        # still in feature selection
        if flag == 0:
            n_tentative = np.where(dec_reg == 0)[0].shape[0]
            content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
            if self.verbose == 1:
                output = cols[0] + n_iter
            elif self.verbose > 1:
                output = "\n".join([x[0] + "\t" + x[1] for x in zip(cols, content)])

        # Boruta finished running and tentatives have been filtered
        else:
            n_tentative = np.sum(self.support_weak_)
            n_rejected = np.sum(~(self.support_ | self.support_weak_))
            content = map(str, [n_iter, n_confirmed, n_tentative, n_rejected])
            result = "\n".join([x[0] + "\t" + x[1] for x in zip(cols, content)])
            if self.importance in ["shap", "pimp"]:
                vimp = str(self.importance)
            else:
                vimp = "native"
            output = (
                "\n\nLeshy finished running using " + vimp + " var. imp.\n\n" + result
            )
        print(output)

[docs]    def _update_estimator(self):
        """
        Update the estimator with a new random state, if applicable.

        If the dataset is not categorical, the estimator's `random_state` parameter is updated
        with a new random state generated by the `random_state` attribute of the Leshy object.
        If the estimator is a LightGBM model, the random state value is generated between 0 and 10000.

        Parameters
        ----------
        None

        Returns
        -------
        None
        """
        if self.is_cat is False:
            if self.is_lgb:
                self.estimator.set_params(
                    random_state=self.random_state.randint(0, 10000)
                )
            else:
                self.estimator.set_params(random_state=self.random_state)

[docs]    def _update_tree_num(self, dec_reg):
        """Update the number of trees in the estimator based on the number of selected features.

        Parameters
        ----------
        dec_reg : array-like of shape (n_features,)
            The decision rule for each feature, where negative values indicate that the feature should be rejected
            and non-negative values indicate that the feature should be selected.

        Returns
        -------
        None

        Notes
        -----
        This function updates the `n_estimators` parameter of the estimator if it is set to "auto". The number of trees is
        determined based on the number of selected features. Specifically, the number of trees is set to the value returned
        by the `_get_tree_num` method, which takes as input the number of selected features that are not rejected.

        If `n_estimators` is not set to "auto", this function does nothing.
        """
        if self.n_estimators == "auto":
            # number of features that aren't rejected
            not_rejected = np.where(dec_reg >= 0)[0].shape[0]
            n_tree = self._get_tree_num(not_rejected)
            self.estimator.set_params(n_estimators=n_tree)

[docs]    def _run_iteration(
        self, X, y, sample_weight, dec_reg, sha_max_history, imp_history, hit_reg, _iter
    ):
        """
        Run an iteration of the Gradient Boosting algorithm.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        y : array-like of shape (n_samples,)
            The target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        dec_reg : array-like of shape (n_samples,)
            Decision function of the estimator.

        sha_max_history : list of floats
            List of the maximum shadow importance value at each iteration.

        imp_history : array-like of shape (n_iterations, n_features)
            Matrix of feature importances at each iteration.

        hit_reg : array-like of shape (n_samples,)
            Array of hit counts for each sample.

        _iter : int
            The current iteration number.

        Returns
        -------
        dec_reg : array-like of shape (n_samples,)
            Updated decision function of the estimator.

        sha_max_history : list of floats
            List of the maximum shadow importance value at each iteration.

        imp_history : array-like of shape (n_iterations, n_features)
            Matrix of feature importances at each iteration.

        hit_reg : array-like of shape (n_samples,)
            Array of hit counts for each sample.

        imp_sha_max : float
            The maximum shadow importance value for this iteration.
        """
        cur_imp = self._add_shadows_get_imps(X, y, sample_weight, dec_reg)
        imp_sha_max = np.percentile(cur_imp[1], self.perc)
        sha_max_history.append(imp_sha_max)
        imp_history = np.vstack((imp_history, cur_imp[0]))
        hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)
        dec_reg = self._do_tests(dec_reg, hit_reg, _iter)
        return dec_reg, sha_max_history, imp_history, hit_reg, imp_sha_max

[docs]    def select_features(self, X, y, sample_weight=None):
        """
        Select features using the Leshy algorithm.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data.
        y : array-like of shape (n_samples,)
            The target values.
        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

        Returns
        -------
        dec_reg : ndarray of shape (n_features,)
            The decision rule. 1 means the feature is selected, 0 means the feature is not selected.
        sha_max_history : list
            List of the maximum shadow importances per iteration.
        imp_history : ndarray of shape (n_iterations, n_features)
            Array containing the feature importances per iteration.
        imp_sha_max : float
            Maximum shadow importance value.
        """
        pbar = tqdm(total=self.max_iter, desc="Leshy iteration")
        dec_reg = np.zeros(X.shape[1])
        hit_reg = np.zeros(X.shape[1])
        sha_max_history = []
        imp_history = np.empty((0, X.shape[1]))

        for _iter in range(1, self.max_iter + 1):
            if not np.any(dec_reg == 0):
                break
            self._update_tree_num(dec_reg)
            self._update_estimator()
            (
                dec_reg,
                sha_max_history,
                imp_history,
                hit_reg,
                imp_sha_max,
            ) = self._run_iteration(
                X,
                y,
                sample_weight,
                dec_reg,
                sha_max_history,
                imp_history,
                hit_reg,
                _iter,
            )
            _iter += 1
            pbar.update(1)

        pbar.close()
        return dec_reg, sha_max_history, imp_history, imp_sha_max

[docs]    def _calculate_support(self, confirmed, tentative, n_feat):
        """
        Calculate the feature support arrays.

        Parameters
        ----------
        confirmed : array-like of shape (n_confirmed,)
            Indices of confirmed features.
        tentative : array-like of shape (n_tentative,)
            Indices of tentative features.
        n_feat : int
            Total number of features.

        Returns
        -------
        None
            The function populates the following class attributes:
            - n_features_ : int
                Number of selected features.
            - support_ : ndarray of shape (n_feat,)
                Boolean array indicating the selected features.
            - support_weak_ : ndarray of shape (n_feat,)
                Boolean array indicating the tentatively selected features.
        """
        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=bool)
        self.support_weak_ = np.zeros(n_feat, dtype=bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=bool)
        self.support_weak_[tentative] = 1
        if self.keep_weak:
            self.support_[tentative] = 1

[docs]    def _calculate_absolute_ranking(self):
        """
        Compute feature importance scores using SHAP values.

        Parameters
        ----------
        new_x_tr : numpy.ndarray
            The training dataset after being processed.
        shap_matrix : numpy.ndarray
            The matrix containing SHAP values computed by a LightGBM model.
        param : dict
            A dictionary containing the parameters for a LightGBM model.
        objective : str
            The objective function of the LightGBM model.

        Returns
        -------
        list
            A list of tuples containing feature names and their corresponding importance scores.
        """
        vimp_df = pd.DataFrame(self.imp_real_hist, columns=self.feature_names_in_)
        self.ranking_absolutes_ = list(
            vimp_df.mean().sort_values(ascending=False).index
        )

[docs]    def _calculate_relative_ranking(self, n_feat, tentative, confirmed, imp_history):
        """
        Calculates the relative ranking of features based on their importance history.

        Parameters
        ----------
        n_feat : int
            The total number of features.
        tentative : ndarray of shape (n_tentative_features,)
            An array containing the indices of tentative features.
        confirmed : ndarray of shape (n_confirmed_features,)
            An array containing the indices of confirmed features.
        imp_history : ndarray of shape (n_iterations + 1, n_features)
            An array containing the feature importances for each iteration.

        Returns
        -------
        None

        """
        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
            # calculate ranks in each iteration, then median of ranks across feats
            iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
            rank_medians = np.nanmedian(iter_ranks, axis=0)
            ranks = self._nanrankdata(rank_medians, axis=0)

            # set smallest rank to 3 if there are tentative feats
            if tentative.shape[0] > 0:
                ranks = ranks - np.min(ranks) + 3
            else:
                # and 2 otherwise
                ranks = ranks - np.min(ranks) + 2
            self.ranking_[not_selected] = ranks
        else:
            # all are selected, thus we set feature supports to True
            self.support_ = np.ones(n_feat, dtype=bool)

[docs]    def _print_result(self, dec_reg, _iter, start_time):
        """
        Print the results of feature selection.

        Parameters
        ----------
        dec_reg : bool
            Decision on whether to proceed with another round of feature selection.
        _iter : int
            Current iteration number.
        start_time : float
            Time when the feature selection process started.

        Returns
        -------
        None
            The function prints the relevant results and running time.
        """
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        self.running_time = time.time() - start_time
        hours, rem = divmod(self.running_time, 3600)
        minutes, seconds = divmod(rem, 60)
        print(
            "All relevant predictors selected in {:0>2}:{:0>2}:{:05.2f}".format(
                int(hours), int(minutes), seconds
            )
        )


[docs]def _get_confirmed_and_tentative(dec_reg):
    """Extracts the confirmed and tentative features from dec_reg."""
    confirmed = np.where(dec_reg == 1)[0]
    tentative = np.where(dec_reg == 0)[0]
    return confirmed, tentative


[docs]def _select_tentative(tentative, imp_history, sha_max_history):
    """
    Select tentative features based on median importance values.

    Parameters
    ----------
    tentative: array-like of shape (n_tentative,)
        Array of indices representing tentative features.
    imp_history: array-like of shape (n_iterations + 1, n_features)
        Importance values for each feature in each iteration.
    sha_max_history: array-like of shape (n_iterations + 1,)
        The history of the highest stability scores.

    Returns
    -------
    tentative: array-like of shape (n_tentative_confirmed,)
        The confirmed tentative features based on their median importance values.
    """
    # ignore the first row of zeros
    tentative_median = np.median(imp_history[1:, tentative], axis=0)
    # which tentative to keep
    tentative_confirmed = np.where(tentative_median > np.median(sha_max_history))[0]
    tentative = tentative[tentative_confirmed]
    return tentative


[docs]def _split_fit_estimator(estimator, X, y, sample_weight=None, cat_feature=None):
    """Private function, split the train, test and fit the model

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.
    X : pd.DataFrame of shape [n_samples, n_features]
        predictor matrix
    y : pd.series of shape [n_samples]
        target
    sample_weight : array-like, shape = [n_samples], default=None
        Individual weights for each sample
    cat_feature : list of int or None
        the list of integers, cols loc, of the categrocial predictors. Avoids to detect and encode
        each iteration if the exact same columns are passed to the selection methods.
    Returns
    -------
     model :
        fitted model
     X_tt : array [n_samples, n_features]
        the test split, predictors
     y_tt : array [n_samples]
        the test split, target
    """
    if cat_feature is None:
        # detect, store and encode categorical predictors
        X, _, cat_idx = get_pandas_cat_codes(X)
    else:
        cat_idx = cat_feature

    if sample_weight is not None:
        w = sample_weight
        if is_regressor(estimator):
            X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(
                X, y, w, random_state=42
            )
        else:
            X_tr, X_tt, y_tr, y_tt, w_tr, w_tt = train_test_split(
                X, y, w, stratify=y, random_state=42
            )
    else:
        if is_regressor(estimator):
            X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, random_state=42)
        else:
            X_tr, X_tt, y_tr, y_tt = train_test_split(X, y, stratify=y, random_state=42)
        w_tr, w_tt = None, None
        
    if check_if_tree_based(estimator):
        try:
            # Handle LightGBM categorical features at initialization
            if is_lightgbm(estimator):
                model = estimator.fit(X_tr, y_tr, sample_weight=w_tr, categorical_feature=cat_idx)
            elif is_catboost(estimator) or ("cat_feature" in estimator.fit.__code__.co_varnames):
                model = estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=cat_idx)
            else:
                model = estimator.fit(X_tr, y_tr, sample_weight=w_tr)
                
        except Exception as e:
            raise ValueError(
                "Please check your X and y variable. The provided "
                "estimator cannot be fitted to your data.\n" + str(e)
            )
    else:
        raise ValueError("Not a tree based model")

    return model, X_tt, y_tt, w_tt



[docs]def _get_shap_imp(estimator, X, y, sample_weight=None, cat_feature=None):
    """Get the SHAP feature importance (compatible with all SHAP versions)
    
    Parameters
    ----------
    estimator : estimator object
        An estimator object implementing `fit` and `predict` methods.
    X : pd.DataFrame of shape [n_samples, n_features]
        Predictor matrix.
    y : pd.Series of shape [n_samples]
        Target variable.
    sample_weight : array-like, shape = [n_samples], default=None
        Individual weights for each sample.
    cat_feature : list of int or None, default=None
        The list of integers, columns loc, of the categorical predictors.

    Returns
    -------
    shap_imp : array
        The SHAP importance array.
    """
    estimator = clone(estimator)
    model, X_tt, y_tt, w_tt = _split_fit_estimator(
        estimator, X, y, sample_weight=sample_weight, cat_feature=cat_feature
    )

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_tt)
    
    if isinstance(shap_values, list):
        # Pre-v0.45.0 format
        if is_classifier(estimator):
            if len(np.unique(y)) > 2:  # Multiclass
                # Reconstruct the concatenated matrix like in _compute_importance
                shap_concat = np.concatenate(shap_values, axis=1)
                n_feat = X_tt.shape[1]
                shap_values = np.delete(
                    shap_concat,
                    list(range(n_feat, (n_feat + 1) * len(shap_values), n_feat + 1)),
                    axis=1
                )
                shap_imp = np.abs(shap_values[:, :-1]).mean(axis=0)
            else:
                # Binary - use positive class only (consistent with _compute_importance)
                shap_imp = np.abs(shap_values[1][:, :-1]).mean(axis=0)
        else:
            # Regression
            shap_imp = np.abs(shap_values[0][:, :-1]).mean(axis=0)
            
    elif isinstance(shap_values, np.ndarray):
        # v0.45.0+ format
        if is_classifier(estimator):
            if shap_values.ndim == 3:
                # Multiclass - sum across classes
                shap_imp = np.abs(shap_values).sum(axis=2).mean(axis=0)
            else:
                # bias has been move to a separate attributes, no more a column
                shap_imp = np.abs(shap_values).mean(axis=0)
        else:
            # Regression
            shap_imp = np.abs(shap_values).mean(axis=0)
    else:
        raise ValueError(f"Unsupported SHAP values format: {type(shap_values)}")

    return shap_imp

[docs]def _get_shap_imp_fast(estimator, X, y, sample_weight=None, cat_feature=None):
    """Get the SHAP feature importance using the fasttreeshap implementation

    Parameters
    ----------
    estimator : estimator object
        An estimator object implementing `fit` and `predict` methods.
    X : pd.DataFrame of shape [n_samples, n_features]
        Predictor matrix.
    y : pd.Series of shape [n_samples]
        Target variable.
    sample_weight : array-like, shape = [n_samples], default=None
        Individual weights for each sample.
    cat_feature : list of int or None, default=None
        The list of integers, columns loc, of the categorical predictors. Avoids detecting and encoding
        each iteration if the exact same columns are passed to the selection methods.

    Returns
    -------
    shap_imp : array
        The SHAP importance array.
    """
    try:
        from fasttreeshap import TreeExplainer as FastTreeExplainer
    except ImportError:
        ImportError("fasttreeshap is not installed")

    # Clone the estimator to avoid modifying the original one
    estimator = clone(estimator)

    # Split the data into train and test sets and fit the model
    model, X_tt, y_tt, w_tt = _split_fit_estimator(
        estimator, X, y, sample_weight=sample_weight, cat_feature=cat_feature
    )
    explainer = FastTreeExplainer(
        model,
        algorithm="auto",
        shortcut=False,
        feature_perturbation="tree_path_dependent",
    )
    shap_matrix = explainer.shap_values(X_tt)
    # multiclass returns a list
    # for binary and for some models, shap is still returning a list
    if is_classifier(estimator):
        if isinstance(shap_matrix, list):
            # For LightGBM classifier, RF, in sklearn API, SHAP returns a list of arrays
            # https://github.com/slundberg/shap/issues/526
            shap_imp = np.mean([np.abs(sv).mean(0) for sv in shap_matrix], axis=0)
        else:
            shap_imp = np.abs(shap_matrix).mean(0)
    else:
        shap_imp = np.abs(shap_matrix).mean(0)
    return shap_imp


[docs]def _get_perm_imp(estimator, X, y, sample_weight, cat_feature=None):
    """Private function, Get the SHAP feature importance

    Parameters
    ----------
    estimator: sklearn estimator
    X : pd.DataFrame of shape [n_samples, n_features]
        predictor matrix
    y : pd.series of shape [n_samples]
        target
    sample_weight : array-like, shape = [n_samples], default=None
        Individual weights for each sample
    cat_feature : list of int or None
        the list of integers, cols loc, of the categorical predictors. Avoids to detect and encode
        each iteration if the exact same columns are passed to the selection methods.
    Returns
    -------
    imp : array
        the permutation importance array
    """
    # be sure to use an non-fitted estimator
    estimator = clone(estimator)

    model, X_tt, y_tt, w_tt = _split_fit_estimator(
        estimator, X, y, sample_weight=sample_weight, cat_feature=cat_feature
    )
    perm_imp = permutation_importance(
        model, X_tt, y_tt, n_repeats=5, random_state=42, n_jobs=-1
    )
    imp = perm_imp.importances_mean.ravel()
    return imp


[docs]def _get_imp(estimator, X, y, sample_weight=None, cat_feature=None):
    """Private function, Get the native feature importance (impurity based for instance)

    Notes
    -----
    This is know to return biased and uninformative results.
    e.g.
    https://scikit-learn.org/stable/auto_examples/inspection/
    plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py

    or

    https://explained.ai/rf-importance/

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        The training input samples.
    y : array-like, shape = [n_samples]
        The target values.
    sample_weight : array-like, shape = [n_samples], default=None
        Individual weights for each sample
    cat_feature: list of int or None
        the list of integers, cols loc, of the categorical predictors. Avoids to detect and encode
        each iteration if the exact same columns are passed to the selection methods.
    Returns
    -------
    imp : array
        the permutation importance array
    """
    # be sure to use an non-fitted estimator
    estimator = clone(estimator)
    try:
        if cat_feature is None:
            X, _, cat_idx = get_pandas_cat_codes(X)
        else:
            cat_idx = cat_feature

        if is_lightgbm(estimator):
            estimator.fit(X, y, sample_weight=sample_weight, categorical_feature=cat_idx)
        elif is_catboost(estimator) or ("cat_feature" in estimator.fit.__code__.co_varnames):
            estimator.fit(X, y, sample_weight=sample_weight, cat_features=cat_idx)
        else:
            estimator.fit(X, y, sample_weight=sample_weight)
    except Exception as e:
        raise ValueError(f"Estimator fitting failed: {str(e)}")

    if is_lightgbm(estimator):
        return estimator.booster_.feature_importance(importance_type='gain')  # LightGBM 4.x compatible
    else:
        return estimator.feature_importances_


###################################
#
# BoostAGroota
#
###################################
[docs]class BoostAGroota(SelectorMixin, BaseEstimator):  # (object):
    """
    BoostAGroota is an all-relevant feature selection method, while most others are minimal optimal.
    It tries to find all features carrying information usable for prediction, rather than finding a possibly compact
    subset of features on which some estimator has a minimal error.

    Why bother with all-relevant feature selection?
    When you try to understand the phenomenon that made your data, you should
    care about all factors that contribute to it, not just the bluntest signs
    of it in the context of your methodology (minimal optimal set of features
    by definition depends on your estimator choice).

    Parameters
    ----------
    estimator : scikit-learn estimator
        The model to train, lightGBM recommended, see the reduce lightgbm method.
    cutoff : float
        The value by which the max of shadow imp is divided, to compare to real importance.
    iters : int (>0)
        The number of iterations to average for the feature importance (on the same split),
        to reduce the variance.
    max_rounds : int (>0)
        The number of times the core BoostAGroota algorithm will run.
        Each round eliminates more and more features.
    delta : float (0 < delta <= 1)
        Stopping criteria for whether another round is started.
    silent : bool
        Set to True if you don't want to see the BoostAGroota output printed.
    importance : str, default='shap'
        The kind of feature importance to use. Possible values: 'shap' (Shapley values),
        'pimp' (permutation importance), and 'native' (Gini/impurity).

    Attributes
    ----------
    selected_features_ : list of str
        The list of columns to keep.
    ranking_ : array of shape [n_features]
        The feature ranking, such that ``ranking_[i]`` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank 1, and tentative features are assigned
        rank 2.
    ranking_absolutes_ : array of shape [n_features]
        The absolute feature ranking as ordered by the selection process. It does not guarantee
        that this order is correct for all models. For a model-agnostic ranking, see the
        attribute ``ranking``.
    sha_cutoff_df : dataframe
        Feature importance of the real+shadow predictors over iterations.
    mean_shadow : float
        The threshold below which the predictors are rejected.

    Examples
    --------
    >>> X = df[filtered_features].copy()
    >>> y = df['target'].copy()
    >>> w = df['weight'].copy()
    >>> model = LGBMRegressor(n_jobs=-1, n_estimators=100, objective='rmse', random_state=42, verbose=0)
    >>> feat_selector = BoostAGroota(estimator=model, cutoff=1, iters=10, max_rounds=10, delta=0.1, importance='shap')
    >>> feat_selector.fit(X, y, sample_weight=None)
    >>> print(feat_selector.selected_features_)
    >>> feat_selector.plot_importance(n_feat_per_inch=5)
    """

    def __init__(
        self,
        estimator=None,
        cutoff=4,
        iters=10,
        max_rounds=500,
        delta=0.1,
        silent=True,
        importance="shap",
    ):
        self.estimator = estimator
        self.cutoff = cutoff
        self.iters = iters
        self.max_rounds = max_rounds
        self.delta = delta
        self.silent = silent
        self.importance = importance
        self.sha_cutoff_df = None
        self.mean_shadow = None
        self.ranking_absolutes_ = None
        self.ranking_ = None

        # Throw errors if the inputted parameters don't meet the necessary criteria
        if cutoff <= 0:
            raise ValueError(
                "cutoff should be greater than 0. You entered" + str(cutoff)
            )
        if iters <= 0:
            raise ValueError("iters should be greater than 0. You entered" + str(iters))
        if (delta <= 0) | (delta > 1):
            raise ValueError("delta should be between 0 and 1, was " + str(delta))

        # Issue warnings for parameters to still let it run
        if delta < 0.02:
            warnings.warn(
                "WARNING: Setting a delta below 0.02 may not converge on a solution."
            )
        if max_rounds < 1:
            warnings.warn(
                "WARNING: Setting max_rounds below 1 will automatically be set to 1."
            )

        if importance == "native":
            warnings.warn(
                "[BoostAGroota]: using native variable importance might break the FS"
            )

    def __repr__(self):
        s = (
            "BoostARoota(est={est}, \n"
            "                cutoff={cutoff},\n"
            "                iters={iters},\n"
            "                max_rounds={mr},\n"
            "                delta={delta},\n"
            "                silent={silent}, \n"
            '                importance="{importance}")'.format(
                est=self.estimator,
                cutoff=self.cutoff,
                iters=self.iters,
                mr=self.max_rounds,
                delta=self.delta,
                silent=self.silent,
                importance=self.importance,
            )
        )
        return s

[docs]    def fit(self, X, y, sample_weight=None):
        """Fit the BoostAGroota transformer with the provided estimator.
        Parameters
        ----------
        X : pd.DataFrame
            the predictors matrix
        y : pd.Series
            the target
        sample_weight : pd.series
            sample_weight, if any
        """
        if self.importance == "fastshap":
            try:
                from fasttreeshap import TreeExplainer as FastTreeExplainer
            except ImportError:
                warnings.warn("fasttreeshap is not installed. Fallback to shap.")
                self.importance = "shap"

        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.to_numpy()
        else:
            raise TypeError("X is not a dataframe")

        if sample_weight is not None:
            sample_weight = pd.Series(_check_sample_weight(sample_weight, X))

        # crit, keep_vars, df_vimp, mean_shadow
        _, self.selected_features_, self.sha_cutoff_df, self.mean_shadow = _boostaroota(
            X,
            y,
            # metric=self.metric,
            estimator=self.estimator,
            cutoff=self.cutoff,
            iters=self.iters,
            max_rounds=self.max_rounds,
            delta=self.delta,
            silent=self.silent,
            weight=sample_weight,
            imp=self.importance,
        )
        self.selected_features_ = self.selected_features_.values
        self.support_ = np.asarray(
            [c in self.selected_features_ for c in self.feature_names_in_]
        )
        self.ranking_absolutes_ = list(
            self.sha_cutoff_df.iloc[:, : int(self.sha_cutoff_df.shape[1] / 2)]
            .mean()
            .sort_values(ascending=False)
            .index
        )
        self.ranking_ = np.where(self.support_, 2, 1)
        return self

    def _get_support_mask(self):
        check_is_fitted(self)

        return self.support_

[docs]    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe")
        return X[self.selected_features_]

    def _more_tags(self):
        return {"allow_nan": True, "requires_y": True}

[docs]    @mpl.rc_context(PLT_PARAMS)
    def plot_importance(self, n_feat_per_inch=5):
        """
        Boxplot of the variable importance, ordered by magnitude.
        The max shadow variable importance illustrated by the dashed line.
        Requires to apply the fit method first.

        Parameters
        ----------
        n_feat_per_inch : int, default=5
            Number of features to plot per inch (for scaling the figure).

        Returns
        -------
        fig : plt.figure or None
            The matplotlib figure object containing the boxplot, or None if there are no selected features.
        """
        if self.mean_shadow is None:
            raise ValueError("Apply fit method first")

        b_df = self.sha_cutoff_df
        real_df = b_df.iloc[:, : int(b_df.shape[1] / 2)].copy()

        real_df = real_df.reindex(
            real_df.mean().sort_values(ascending=True).index, axis=1
        )

        if real_df.dropna().empty:
            warnings.warn(NO_FEATURE_SELECTED_WARNINGS)
            return None

        fig, ax = plt.subplots(figsize=(16, real_df.shape[1] / n_feat_per_inch))
        bp = real_df.plot(**PLOT_KWARGS, ax=ax)
        bp.set_xlim(left=real_df.min(skipna=True).min(skipna=True) - 0.025)

        for c in range(len(self.selected_features_)):
            patch = bp.findobj(mpl.patches.Patch)[real_df.shape[1] - c - 1]
            patch.set_facecolor(BLUE)
            patch.set_color(BLUE)

        custom_lines = [
            Line2D([0], [0], color=BLUE, lw=5),
            Line2D([0], [0], color="gray", lw=5),
            Line2D([0], [0], linestyle="--", color=RED, lw=2),
        ]
        bp.legend(
            custom_lines, ["confirmed", "rejected", "sha. max"], loc="lower right"
        )

        ax.axvline(x=self.mean_shadow, linestyle="--", color=RED)
        ax.set_title("BoostAGroota importance of selected predictors")

        return fig


############################################
# Helper Functions to do the Heavy Lifting
############################################


[docs]def _create_shadow(X_train):
    """Create shadow features by making copies of all X variables and randomly shuffling them.

    Parameters
    ----------
    X_train : pd.DataFrame
        The dataframe to create shadow features on.

    Returns
    -------
    pd.DataFrame
        A dataframe that is twice the width of X_train and contains the shadow features, along with a list of the shadow feature names.
    """
    X_shadow = X_train.copy()
    for c in X_shadow.columns:
        np.random.shuffle(X_shadow[c].values)
    # Rename the shadow variables
    shadow_names = [f"ShadowVar{i+1}" for i in range(X_train.shape[1])]
    X_shadow.columns = shadow_names
    # Combine to make one new dataframe
    new_x = pd.concat([X_train, X_shadow], axis=1)
    # Separate the columns that start with 'ShadowVar' from the other columns
    shadow_columns = [col for col in new_x.columns if col.startswith('ShadowVar')]
    other_columns = [col for col in new_x.columns if not col.startswith('ShadowVar')]
    
    # Concatenate the two lists of column names
    new_column_order = other_columns + shadow_columns
    # Reorder the DataFrame columns
    new_x = new_x[new_column_order]
    return new_x, shadow_names


########################################################################################
# BoostARoota. In principle, you cannot/don't need to access those methods (reason of
# the _ in front of the function name, they're internal functions)
########################################################################################


[docs]def _reduce_vars_sklearn(
    X,
    y,
    estimator,
    this_round,
    cutoff,
    n_iterations,
    delta,
    silent,
    weight,
    imp_kind,
    cat_feature,
):
    """
    Private function, reduce the number of predictors using a sklearn estimator

    Parameters
    ----------
    x : pd.DataFrame
        the dataframe to create shadow features on
    y : pd.Series
        the target
    estimator : sklearn estimator
        the model to train, lightGBM recommended
    this_round : int
        The number of times the core BoostARoota algorithm will run.
        Each round eliminates more and more features
    cutoff : float
        the value by which the max of shadow imp is divided, to compare to real importance
    n_iterations : int
        The number of iterations to average for the feature importance (on the same split),
        to reduce the variance
    delta : float (0 < delta <= 1)
        Stopping criteria for whether another round is started
    silent : bool
        Set to True if don't want to see the BoostARoota output printed.
        Will still show any errors or warnings that may occur
    weight : pd.series
        sample_weight, if any
    imp_kind : str
        whether if native, shap, fastshap or permutation importance should be used
    cat_feature : list of int or None
        the list of integers, cols loc, of the categorical predictors. Avoids to detect and encode
        each iteration if the exact same columns are passed to the selection methods.

    Returns
    -------
    criteria : bool
        if the criteria has been reached or not
    real_vars['feature'] : pd.dataframe
        feature importance of the real predictors over iter
    df : pd.DataFrame
        feature importance of the real+shadow predictors over iter
    mean_shadow : float
        the feature importance threshold, to reject or not the predictors

    Raises
    ------
    ValueError
        error if the feature importance type is not
    """
    # Set up the parameters for running the model in XGBoost - split is on multi log loss
    for i in range(1, n_iterations + 1):
        # Create the shadow variables and run the model to obtain importances
        new_x, shadow_names = _create_shadow(X)
        imp_func = {
            "shap": _get_shap_imp,
            "fastshap": _get_shap_imp_fast,
            "pimp": _get_perm_imp,
            "native": _get_imp,
        }
        importance = imp_func[imp_kind](
            estimator, new_x, y, sample_weight=weight, cat_feature=cat_feature
        )
        
        
        # Create a dataframe to store the feature importances
        if i == 1:
            df = pd.DataFrame({"feature": new_x.columns})

        # Store the feature importances
        try:
            # Normalize the feature importances
            df["fscore" + str(i)] = importance / importance.sum()
        except ValueError:
            print("Only Sklearn tree based methods allowed")

        # Print the iteration number if not silent
        if not silent:
            print("Round: ", this_round, " iteration: ", i)
    df["Mean"] = df.select_dtypes(include=[np.number]).mean(axis=1, skipna=True)
    # Split them back out
    real_vars = df.loc[~df["feature"].isin(shadow_names)]
    shadow_vars = df.loc[df["feature"].isin(shadow_names)]

    # Get mean value from the shadows (max, like in Boruta)
    mean_shadow = (
        shadow_vars.select_dtypes(include=[np.number])
        .max(skipna=True, axis=1)
        .mean(skipna=True)
        / cutoff
    )
    real_vars = real_vars[(real_vars.Mean >= mean_shadow)]

    # Check for the stopping criteria
    # Compute the fraction of features selected in this round
    selected_frac = len(real_vars) / len(X.columns)

    # Check if we should stop feature selection
    # make sure we are removing at least delta % of the variables
    criteria = len(X.columns) == 0 or selected_frac == 0 or selected_frac > 1 - delta

    return criteria, real_vars["feature"], df, mean_shadow


[docs]def _boostaroota(
    X, y, estimator, cutoff, iters, max_rounds, delta, silent, weight, imp
):
    """
    Private function, reduces the number of predictors using a sklearn estimator.

    Parameters
    ----------
    x : pd.DataFrame
        The dataframe to create shadow features on.
    y : pd.Series
        The target.
    estimator : scikit-learn estimator
        The model to train, lightGBM recommended, see the reduce lightgbm method.
    cutoff : float
        The value by which the max of shadow imp is divided, to compare to real importance.
    iters : int (>0)
        The number of iterations to average for the feature importances (on the same split),
        to reduce the variance.
    max_rounds : int (>0)
        The number of times the core BoostARoota algorithm will run.
        Each round eliminates more and more features.
    delta : float (0 < delta <= 1)
        Stopping criteria for whether another round is started.
    silent : bool
        Set to True if you don't want to see the BoostARoota output printed.
        Will still show any errors or warnings that may occur.
    weight : pd.Series, optional
        Sample weights, if any.
    imp : str
        whether if native, shap, fastshap or permutation importance should be used

    Returns
    -------
    crit : bool
        If the criteria have been reached or not.
    keep_vars : pd.DataFrame
        Feature importance of the real predictors over iterations.
    df_vimp : pd.DataFrame
        Feature importance of the real+shadow predictors over iterations.
    mean_shadow : float
        The feature importance threshold to reject or not the predictors.
    """
    start_time = time.time()
    new_x = X.copy()

    # extract the categorical names for the first time, store it for next iterations
    # In the below while loop this list will be update only once some of the predictors
    # are removed. This way the encoding is done only every predictors update and not
    # every iteration. The code will then be much faster since the encoding is done only once.
    new_x, obj_feat, cat_idx = get_pandas_cat_codes(X)

    # Run through loop until "crit" changes
    i = 0
    imp_dic = {}
    with tqdm(total=max_rounds, desc="BoostaGRoota round") as pbar:
        while True:
            # Inside this loop we reduce the dataset on each iteration exiting with keep_vars
            i += 1
            crit, keep_vars, df_vimp, mean_shadow = _reduce_vars_sklearn(
                new_x,
                y,
                estimator=estimator,
                this_round=i,
                cutoff=cutoff,
                n_iterations=iters,
                delta=delta,
                silent=silent,
                weight=weight,
                imp_kind=imp,
                cat_feature=cat_idx,
            )

            b_df = df_vimp.T.iloc[1:-1].astype(float)
            b_df.columns = df_vimp.T.iloc[0].values
            imp_dic.update(b_df.to_dict())

            if crit or i >= max_rounds or len(keep_vars) == 0:
                break
            else:
                new_x = new_x[keep_vars].copy()
                _, _, cat_idx = get_pandas_cat_codes(new_x)

                pbar.update(1)

    elapsed = (time.time() - start_time) / 60
    if not silent:
        print(f"BoostARoota ran successfully! Algorithm went through {i} rounds.")
        print(f"The feature selection BoostARoota running time is {elapsed:8.2f} min")

    df_vimp = pd.DataFrame.from_dict(imp_dic)

    return crit, keep_vars, df_vimp, mean_shadow


###################################
#
# GrootCV
#
###################################


[docs]class GrootCV(SelectorMixin, BaseEstimator):
    """
    GrootCV is a feature selection method based on cross-validation with lightGBM.

    A shuffled copy of the predictors matrix is added (shadows) to the original set of predictors.
    The lightGBM is fitted using repeated cross-validation, the feature importance
    is extracted each time and averaged to smooth out the noise.
    If the feature importance is larger than the average shadow feature importance then the predictors are rejected, the others are kept.
        - Cross-validated feature importance to smooth out the noise, based on lightGBM only
          (which is, most of the time, the fastest and more accurate Boosting).
        - the feature importance is derived using SHAP importance
        - Taking the max of median of the shadow var. imp over folds otherwise not enough conservative and
          it improves the convergence (needs less evaluation to find a threshold)
        - Not based on a given percentage of cols needed to be deleted
        - Plot method for var. imp

    Parameters
    ----------
    objective : str or callable, default=None
        The objective function to use in lightGBM. If None, it uses the objective specified in `lgbm_params`.
    cutoff : float, default=1
        The value by which the max of shadow imp is divided, to compare to real importance.
    n_folds : int, default=5
        The number of folds for cross-validation.
    folds : Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]]
        (generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)) 
        If generator or iterator, it should yield the train and test indices for each fold. If object, it should be one of the scikit-learn 
        splitter classes (https://scikit-learn.org/stable/modules/classes.html#splitter-classes) and have split method. 
        This argument has highest priority over other data split arguments.
    n_iter : int, default=5
        The number of iterations to average for the feature importance (on the same split), to reduce variance.
    silent : bool, default=True
        Set to True if you don't want to see the GrootCV output printed.
    rf : bool, default=False
        If True, use random forest for calculating feature importances; otherwise, use lightGBM.
    fastshap : bool, default=False
        If True, use fastSHAP for calculating feature importances; otherwise, use SHAP.
    n_jobs : int, default=0
        The number of jobs to run in parallel. If 0, no parallelism is used.
    lgbm_params : dict, default=None
        The parameters for the lightGBM model.

    Attributes
    ----------
    selected_features_ : ndarray
        The list of columns to keep as selected features.
    cv_df : pd.DataFrame
        DataFrame containing feature importance values for each fold and iteration.
    sha_cutoff : float
        The threshold below which the predictors are rejected.
    ranking_absolutes_ : list
        The absolute feature ranking as ordered by the selection process.
    ranking_ : ndarray
        The feature ranking, where 2 corresponds to selected features and 1 to tentative features.

    Methods
    -------
    fit(X, y, sample_weight=None)
        Fit the GrootCV on the input data.
    transform(X)
        Apply the fitted GrootCV on new data.
    plot_importance(n_feat_per_inch=5)
        Plot the feature importance of the fitted GrootCV.

    Warnings
    --------
    If `sha_cutoff` is None, you should apply the fit method first.
    Examples
    -------
    >>> X = df[filtered_features].copy()
    >>> y = df['target'].copy()
    >>> w = df['weight'].copy()
    >>> feat_selector = arfsgroot.GrootCV(objective='rmse', cutoff = 1, n_folds=5, n_iter=5)
    >>> feat_selector.fit(X, y, sample_weight=None)
    >>> feat_selector.plot_importance(n_feat_per_inch=5)
    """

    def __init__(
        self,
        objective=None,
        cutoff=1,
        n_folds=5,
        folds=None,
        n_iter=5,
        silent=True,
        rf=False,
        fastshap=False,
        n_jobs=0,
        lgbm_params=None,
    ):
        self.objective = objective
        self.cutoff = cutoff
        self.n_folds = n_folds
        self.folds = folds
        self.n_iter = n_iter
        self.silent = silent
        self.rf = rf
        self.fastshap = fastshap
        self.cv_df = None
        self.sha_cutoff = None
        self.ranking_absolutes_ = None
        self.ranking_ = None
        self.lgbm_params = lgbm_params
        self.n_jobs = n_jobs

        # Throw errors if the inputted parameters don't meet the necessary criteria
        # Ensure parameters meet necessary criteria
        if cutoff <= 0 or n_iter <= 0 or n_folds <= 0:
            raise ValueError("cutoff, n_iter, and n_folds should be greater than 0.")

[docs]    def fit(self, X, y, sample_weight=None):
        """
        Fit the GrootCV on the input data.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            The predictor dataframe.
        y : array-like of shape (n_samples,)
            The target vector.
        sample_weight : array-like of shape (n_samples,), optional
            The weight vector, by default None.

        Returns
        -------
        self : object
            Returns self.
        """

        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a pandas DataFrame")

        self.feature_names_in_ = X.columns.to_numpy()
        y = pd.Series(y) if not isinstance(y, pd.Series) else y

        if sample_weight is not None:
            sample_weight = pd.Series(_check_sample_weight(sample_weight, X))

        # internal encoding (ordinal encoding)
        X, obj_feat, cat_idx = get_pandas_cat_codes(X)

        self.selected_features_, self.cv_df, self.sha_cutoff = _reduce_vars_lgb_cv(
            X,
            y,
            objective=self.objective,
            cutoff=self.cutoff,
            n_folds=self.n_folds,
            folds=self.folds,
            n_iter=self.n_iter,
            silent=self.silent,
            weight=sample_weight,
            rf=self.rf,
            fastshap=self.fastshap,
            lgbm_params=self.lgbm_params,
            n_jobs=self.n_jobs,
        )

        self.selected_features_ = self.selected_features_.values
        self.support_ = np.asarray(
            [c in self.selected_features_ for c in self.feature_names_in_]
        )
        self.ranking_ = np.where(self.support_, 2, 1)

        b_df = self.cv_df.T.copy()
        b_df.columns = b_df.iloc[0]
        b_df = b_df.drop(b_df.index[0])
        b_df = b_df.drop(b_df.index[-1])
        b_df = b_df.convert_dtypes()
        real_df = b_df.iloc[:, : int(b_df.shape[1] / 2)].copy()
        self.ranking_absolutes_ = list(
            real_df.mean().sort_values(ascending=False).index
        )
        return self

    def _get_support_mask(self):
        check_is_fitted(self)

        return self.support_

[docs]    def transform(self, X):
        """
        Apply the fitted GrootCV on new data.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            The predictor dataframe.

        Returns
        -------
        X_selected : pd.DataFrame of shape (n_samples, n_selected_features)
            The selected features from the input dataframe.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe")
        return X[self.selected_features_]

    def _more_tags(self):
        return {"allow_nan": True, "requires_y": True}

[docs]    @mpl.rc_context(PLT_PARAMS)
    def plot_importance(self, n_feat_per_inch=5):
        """
        Plot the feature importance of the fitted GrootCV.

        Parameters
        ----------
        n_feat_per_inch : int, default=5
            The number of features per inch in the plot.

        Returns
        -------
        fig : matplotlib.figure.Figure or None
            The matplotlib figure containing the plot or None if no feature is selected.
        """

        if self.sha_cutoff is None:
            raise ValueError("Apply fit method first")

        b_df = self.cv_df.T.copy()
        b_df.columns = b_df.iloc[0]
        # Separate the columns that start with 'ShadowVar' from the other columns
        shadow_columns = [col for col in b_df.columns if col.startswith('ShadowVar')]
        other_columns = [col for col in b_df.columns if not col.startswith('ShadowVar')]
        
        # Concatenate the two lists of column names
        new_column_order = other_columns + shadow_columns
        # Reorder the DataFrame columns
        b_df = b_df[new_column_order]
        b_df = b_df.drop(b_df.index[0])
        b_df = b_df.drop(b_df.index[-1])
        b_df = b_df.convert_dtypes()
        real_df = b_df.iloc[:, : int(b_df.shape[1] / 2)].copy()
        sha_df = b_df.iloc[:, int(b_df.shape[1] / 2) :].copy()

        real_df = real_df.reindex(
            real_df.select_dtypes(include=[np.number])
            .mean()
            .sort_values(ascending=True)
            .index,
            axis=1,
        )

        if real_df.dropna().empty:
            warnings.warn(NO_FEATURE_SELECTED_WARNINGS)
            return None
        else:
            fig, ax = plt.subplots(figsize=(16, real_df.shape[1] / n_feat_per_inch))
            bp = real_df.plot(**PLOT_KWARGS, ax=ax)
            col_idx = np.argwhere(real_df.columns.isin(self.selected_features_)).ravel()

            for c in range(real_df.shape[1]):
                bp.findobj(mpl.patches.Patch)[c].set_facecolor("gray")
                bp.findobj(mpl.patches.Patch)[c].set_color("gray")

            for c in col_idx:
                bp.findobj(mpl.patches.Patch)[c].set_facecolor(BLUE)
                bp.findobj(mpl.patches.Patch)[c].set_color(BLUE)

            ax.axvline(x=self.sha_cutoff, linestyle="--", color=RED)
            bp.set_xlim(left=real_df.min(skipna=True).min(skipna=True) - 0.025)
            custom_lines = [
                Line2D([0], [0], color=BLUE, lw=5),
                Line2D([0], [0], color="gray", lw=5),
                Line2D([0], [0], linestyle="--", color=RED, lw=2),
            ]
            bp.legend(
                custom_lines, ["confirmed", "rejected", "threshold"], loc="lower right"
            )
            ax.set_title("Groot CV importance and selected predictors")

            return fig


########################################################################################
#
# GrootCV. In principle, you cannot/don't need to access those methods (reason of
# the _ in front of the function name, they're internal functions)
#
########################################################################################


[docs]def _reduce_vars_lgb_cv(
    X,
    y,
    objective,
    folds,
    n_folds,
    cutoff,
    n_iter,
    silent,
    weight,
    rf,
    fastshap,
    lgbm_params=None,
    n_jobs=0,
):
    """
    Reduce the number of predictors using a lightgbm (python API)

    Parameters
    ----------
    X : pd.DataFrame
            the dataframe to create shadow features on
    y : pd.Series
            the target
    objective : str
            the lightGBM objective
    folds : 
        (generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)) 
        If generator or iterator, it should yield the train and test indices for each fold. If object, it should be one of the scikit-learn 
        splitter classes (https://scikit-learn.org/stable/modules/classes.html#splitter-classes) and have split method. 
        This argument has highest priority over other data split arguments.
    nfold : int
        Number of folds in CV.
    cutoff : float
            the value by which the max of shadow imp is divided, to compare to real importance
    n_iter : int
            The number of repetition of the cross-validation, smooth out the feature importance noise
    silent : bool
            Set to True if don't want to see the BoostARoota output printed.
            Will still show any errors or warnings that may occur
    weight : pd.series
            sample_weight, if any
    rf : bool, default=False
            the lightGBM implementation of the random forest
    fastshap : bool
            enable or not the fasttreeshap implementation
    lgbm_params : dict, optional
            dictionary of lightgbm parameters
    n_jobs: int, default 0
        0 means default number of threads in OpenMP
        for the best speed, set this to the number of real CPU cores, not the number of threads

    Returns
    -------
    real_vars['feature'] : pd.dataframe
            feature importance of the real predictors over iter
    df : pd.DataFrame
            feature importance of the real+shadow predictors over iter
    cutoff_shadow : float
            the feature importance threshold, to reject or not the predictors
    """

    params = _set_lgb_parameters(
        X=X,
        y=y,
        objective=objective,
        rf=rf,
        silent=silent,
        n_jobs=n_jobs,
        lgbm_params=lgbm_params,
    )

    dtypes_dic = create_dtype_dict(X, dic_keys="dtypes")
    category_cols = dtypes_dic["cat"] + dtypes_dic["time"] + dtypes_dic["unk"]
    cat_idx = [X.columns.get_loc(col) for col in category_cols]

    if folds is None:
        folds = RepeatedKFold(n_splits=n_folds, n_repeats=n_iter, random_state=2652124)
    
    iter = 0
    df = pd.DataFrame({"feature": X.columns})
    for tridx, validx in tqdm(
        folds.split(X, y), total=folds.get_n_splits(), desc="Cross Validation"
    ):
        X_train, X_val, y_train, y_val, weight_tr, weight_val = _split_data(
            X, y, tridx, validx, weight
        )

        # Create the shadow variables and run the model to obtain importances
        new_x_tr, shadow_names = _create_shadow(X_train)
        new_x_val, _ = _create_shadow(X_val)

        bst, shap_matrix, bst.best_iteration = _train_lgb_model(
            new_x_tr,
            y_train,
            weight_tr,
            new_x_val,
            y_val,
            weight_val,
            category_cols=category_cols,
            early_stopping_rounds=20,
            fastshap=fastshap,
            **params,
        )

        importance = _compute_importance(
            new_x_tr, shap_matrix, params, objective, fastshap
        )
        df = _merge_importance_df(
            df=df,
            importance=importance,
            iter=iter,
            n_folds=n_folds,
            column_names=new_x_tr.columns,
            silent=silent,
        )
        iter += 1

    df["Med"] = df.select_dtypes(include=[np.number]).mean(axis=1)
    # Split them back out
    real_vars = df[~df["feature"].isin(shadow_names)]
    shadow_vars = df[df["feature"].isin(shadow_names)]

    # Get median value from the shadows, comparing predictor by predictor. Not the same criteria
    # max().max() like in Boruta but max of the median to mitigate.
    # Otherwise too conservative (reject too often)
    cutoff_shadow = shadow_vars.select_dtypes(include=[np.number]).max().mean() / cutoff
    real_vars = real_vars[(real_vars.Med.values >= cutoff_shadow)]

    return real_vars["feature"], df, cutoff_shadow


[docs]def _set_lgb_parameters(
    X: np.ndarray,
    y: np.ndarray,
    objective: str,
    rf: bool,
    silent: bool,
    n_jobs: int = 0,
    lgbm_params: dict = None,
) -> dict:
    """Set parameters for a LightGBM model based on the input features and the objective.

    Parameters
    ----------
    X : numpy array or pandas DataFrame
        The feature matrix of the training data.
    y : numpy array or pandas Series
        The target variable of the training data.
    objective : str
        The objective function to optimize during training.
    rf : bool, default False
        Whether to use random forest boosting.
    silent : bool, default True
        Whether to print messages during parameter setting.
    n_jobs: int, default 0
        0 means default number of threads in OpenMP
        for the best speed, set this to the number of real CPU cores, not the number of threads

    Returns
    -------
    dict
        The dictionary of LightGBM parameters.

    """

    n_feat = X.shape[1]

    params = lgbm_params if lgbm_params is not None else {}

    params["objective"] = objective
    params["verbosity"] = -1
    if objective == "softmax":
        params["num_class"] = len(np.unique(y))

    if rf:
        feat_frac = (
            np.sqrt(n_feat) / n_feat
            if objective in ["softmax", "binary"]
            else n_feat / (3 * n_feat)
        )
        params.update(
            {
                "boosting_type": "rf",
                "bagging_fraction": 0.7,
                "feature_fraction": feat_frac,
                "bagging_freq": 1,
            }
        )

    clf_losses = [
        "binary",
        "softmax",
        "multi_logloss",
        "multiclassova",
        "multiclass",
        "multiclass_ova",
        "ova",
        "ovr",
        "binary_logloss",
    ]
    if objective in clf_losses:
        y = y.astype(int)
        y_freq_table = pd.Series(y.fillna(0)).value_counts(normalize=True)
        n_classes = y_freq_table.size
        if n_classes > 2 and objective != "softmax":
            params["objective"] = "softmax"
            params["num_class"] = len(np.unique(y))
            if not silent:
                print("Multi-class task, setting objective to softmax")
        main_class = y_freq_table[0]
        if not silent:
            print("GrootCV: classification with unbalance classes")
        if main_class > 0.8:
            params.update({"is_unbalance": True})

    params.update({"num_threads": n_jobs})

    # we are using early_stopping
    # we prevent the overridding of it by popping the n_iterations
    keys_to_pop = [
        "num_iterations",
        "num_iteration",
        "n_iter",
        "num_tree",
        "num_trees",
        "num_round",
        "num_rounds",
        "nrounds",
        "num_boost_round",
        "n_estimators",
        "max_iter",
    ]
    for key in keys_to_pop:
        params.pop(key, None)

    return params


[docs]def _split_data(X, y, tridx, validx, weight=None):
    """
    Split data into train and validation sets based on provided indices.

    Parameters
    ----------
    X : pandas.DataFrame
        Features.
    y : pandas.Series
        Target variable.
    tridx : list
        Indices to be used for training.
    validx : list
        Indices to be used for validation.
    weight : pandas.Series, optional
        Weights for each sample, by default None.

    Returns
    -------
    tuple of pandas.DataFrame and pandas.Series
        X_train, X_val, y_train, y_val, weight_tr, weight_val
    """
    if weight is not None:
        X_train, y_train, weight_tr = (
            X.iloc[tridx, :],
            y.iloc[tridx],
            weight.iloc[tridx],
        )
        X_val, y_val, weight_val = (
            X.iloc[validx, :],
            y.iloc[validx],
            weight.iloc[validx],
        )
    else:
        X_train, y_train = X.iloc[tridx, :], y.iloc[tridx]
        X_val, y_val = X.iloc[validx, :], y.iloc[validx]
        weight_tr = None
        weight_val = None
    return X_train, X_val, y_train, y_val, weight_tr, weight_val


[docs]def _train_lgb_model(
    X_train,
    y_train,
    weight_train,
    X_val,
    y_val,
    weight_val,
    category_cols=None,
    early_stopping_rounds=20,
    fastshap=False,
    **params,
):
    """
    Train a LightGBM model with the given training data and hyperparameters and return the trained model and its SHAP values.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        The input training data.
    y_train : array-like of shape (n_samples,)
        The target training data.
    weight_train : array-like of shape (n_samples,)
        The sample weights for training data.
    X_val : array-like of shape (n_val_samples, n_features)
        The input validation data.
    y_val : array-like of shape (n_val_samples,)
        The target validation data.
    weight_val : array-like of shape (n_val_samples,)
        The sample weights for validation data.
    category_cols : array-like or None, optional (default=None)
        The indices of categorical columns. If None, no categorical columns will be considered.
    early_stopping_rounds : int, optional (default=20)
        Activates early stopping. Validation metric needs to improve at least once in every early_stopping_rounds
        round(s) to continue training._train_lgb_model
    fastshap : bool
        enable or not fasttreeshap implementation
    **params : dict
        Other parameters passed to the LightGBM model.

    Returns
    -------
    tuple of (Booster, numpy.ndarray, int)
        The trained LightGBM model, its SHAP values for X_train, and the best iteration reached during training.
    """
    d_train = lgb.Dataset(X_train, label=y_train, weight=weight_train)
    d_valid = lgb.Dataset(X_val, label=y_val, weight=weight_val)
    
    if category_cols:
        d_train.set_categorical_feature(category_cols)
        d_valid.set_categorical_feature(category_cols)

    bst = lgb.train(
        params,
        train_set=d_train,
        num_boost_round=10000,
        valid_sets=[d_train, d_valid],
        callbacks=[lgb.early_stopping(early_stopping_rounds)],
    )

    # Compute SHAP values with proper version handling
    try:
        if fastshap:
            from fasttreeshap import TreeExplainer
            explainer = TreeExplainer(bst, algorithm="v1")
        else:
            import shap
            explainer = shap.TreeExplainer(bst)
        
        shap_matrix = explainer.shap_values(X_train)
        
    except Exception as e:
        raise RuntimeError(f"SHAP computation failed: {str(e)}")

    return bst, shap_matrix, bst.best_iteration


[docs]def _compute_importance(new_x_tr, shap_matrix, param, objective, fastshap):
    """Compute feature importance scores using SHAP values.

    Parameters
    ----------
    new_x_tr : numpy.ndarray
        The training dataset after being processed.
    shap_matrix : numpy.ndarray
        The matrix containing SHAP values computed by a LightGBM model.
    param : dict
        A dictionary containing the parameters for a LightGBM model.
    objective : str
        The objective function of the LightGBM model.

    Returns
    -------
    list
        A list of tuples containing feature names and their corresponding importance scores.
    """
    try:
        from shap import __version__ as shap_version
        new_shap = tuple(map(int, shap_version.split('.')[:2])) >= (0, 45)
    except Exception:
        new_shap = False  # Assume old version if can't check

    if fastshap:
        if not new_shap and objective == "softmax" and isinstance(shap_matrix, list):
            shap_matrix = np.abs(np.concatenate(shap_matrix, axis=1))
        shap_imp = np.mean(np.abs(shap_matrix), axis=0)
    else:
        if new_shap:
            # the bias is now in a dedicated attribute, no more a column
            if objective == "softmax":
                shap_imp = np.mean(np.abs(shap_matrix).sum(axis=2), axis=0)
            else:
                shap_imp = np.mean(np.abs(shap_matrix), axis=0)
        else:
            if objective == "softmax":
                n_feat = new_x_tr.shape[1]
                shap_matrix = np.delete(
                    shap_matrix,
                    list(range(n_feat, (n_feat + 1) * param["num_class"], n_feat + 1)),
                    axis=1,
                )
                shap_imp = np.mean(np.abs(shap_matrix[:, :-1]), axis=0)
            else:
                if isinstance(shap_matrix, list):
                    shap_matrix = shap_matrix[1]  # Positive class
                shap_imp = np.mean(np.abs(shap_matrix[:, :-1]), axis=0)
    
    importance = dict(zip(new_x_tr.columns, shap_imp))
    return sorted(importance.items(), key=operator.itemgetter(1))


[docs]def _merge_importance_df(df, importance, iter, n_folds, column_names, silent=True):
    """
    Merge the feature importance dataframe `df` with the importance
    information for the current iteration of a cross-validation loop.

    Parameters
    ----------
    df : pandas.DataFrame
        The current feature importance dataframe.
    importance : dict
        A dictionary with the feature importance information for
        the current iteration.
    i : int
        The index of the current iteration.
    n_folds : int
        The number of folds used in the cross-validation loop.
    silent : bool, optional
        If True, suppress output.

    Returns
    -------
    pandas.DataFrame
        The updated feature importance dataframe.
    """

    df2 = pd.DataFrame(importance, columns=["feature", "fscore" + str(iter)])
    df2["fscore" + str(iter)] = (
        df2["fscore" + str(iter)] / df2["fscore" + str(iter)].sum()
    )
    df = pd.merge(df, df2, on="feature", how="outer")
    nit = divmod(iter, n_folds)[0]
    nf = divmod(iter, n_folds)[1]
    if not silent:
        if nf == 0:
            print("Groot iteration: ", nit, " with " + str(n_folds) + " folds")
    return df