ARFS with Categorical predictors#
Let’s illustrate how ARFS is handling the categorical predictors
[3]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:95% !important; }</style>"))
import catboost
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import gc
import shap
from boruta import BorutaPy as bp
from sklearn.datasets import fetch_openml
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.inspection import permutation_importance
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sys import getsizeof, path
import arfs
import arfs.feature_selection as arfsfs
import arfs.feature_selection.allrelevant as arfsgroot
from arfs.utils import LightForestClassifier, LightForestRegressor
from arfs.benchmark import highlight_tick, compare_varimp, sklearn_pimp_bench
from arfs.utils import load_data
# plt.style.use('fivethirtyeight')
rng = np.random.RandomState(seed=42)
# import warnings
# warnings.filterwarnings('ignore')
[4]:
print(f"Run with ARFS {arfs.__version__}")
Run with ARFS 3.0.0
[5]:
gc.enable()
gc.collect()
[5]:
0
Simple Usage#
Most of the tree-based models do not handle categorical predictors without pre-processing. In the three different All Relevant Feature Selection methods, the categorical predictors are handled automatically. How?
the non-numeric columns are selected
integer encoding is performed
for lightGBM and XGBoost, the columns are encoded using contiguous integers and are passed as categoricals. See the official documentation
for CatBoost, the columns are encoded but the categorical columns are passed to the fit methods
estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=obj_feat)(for lightGBM, this param is pass to the estimator not the fit method).
This is done internally and automatically, you don’t have to set it yourself (if you do so, it’ll throw an error).
Let’s see an example using the Titanic data, with synthetic predictors added for benchmarking purpose.
[6]:
titanic = load_data(name="Titanic")
X, y = titanic.data, titanic.target
y = y.astype(int)
[7]:
X.dtypes
[7]:
pclass object
sex object
embarked object
random_cat object
is_alone object
title object
age float64
family_size float64
fare float64
random_num float64
dtype: object
[8]:
X.head()
[8]:
| pclass | sex | embarked | random_cat | is_alone | title | age | family_size | fare | random_num | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | female | S | Fry | 1 | Mrs | 29.0000 | 0.0 | 211.3375 | 0.496714 |
| 1 | 1 | male | S | Bender | 0 | Master | 0.9167 | 3.0 | 151.5500 | -0.138264 |
| 2 | 1 | female | S | Thanos | 0 | Mrs | 2.0000 | 3.0 | 151.5500 | 0.647689 |
| 3 | 1 | male | S | Morty | 0 | Mr | 30.0000 | 3.0 | 151.5500 | 1.523030 |
| 4 | 1 | female | S | Morty | 0 | Mrs | 25.0000 | 3.0 | 151.5500 | -0.234153 |
GrootCV#
[9]:
%%time
# GrootCV
feat_selector = arfsgroot.GrootCV(
objective="binary", cutoff=1, n_folds=5, n_iter=5, silent=True
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)
# highlight synthetic random variable
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[25] training's binary_logloss: 0.271703 valid_1's binary_logloss: 0.420219
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[23] training's binary_logloss: 0.274628 valid_1's binary_logloss: 0.479866
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[26] training's binary_logloss: 0.271065 valid_1's binary_logloss: 0.401061
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[25] training's binary_logloss: 0.278582 valid_1's binary_logloss: 0.449048
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[17] training's binary_logloss: 0.323201 valid_1's binary_logloss: 0.499945
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[22] training's binary_logloss: 0.296943 valid_1's binary_logloss: 0.4165
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[34] training's binary_logloss: 0.21854 valid_1's binary_logloss: 0.432526
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[15] training's binary_logloss: 0.343947 valid_1's binary_logloss: 0.488293
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[27] training's binary_logloss: 0.272154 valid_1's binary_logloss: 0.397612
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[15] training's binary_logloss: 0.331041 valid_1's binary_logloss: 0.529122
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[26] training's binary_logloss: 0.267463 valid_1's binary_logloss: 0.461921
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[21] training's binary_logloss: 0.295882 valid_1's binary_logloss: 0.477657
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[19] training's binary_logloss: 0.310727 valid_1's binary_logloss: 0.456699
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[41] training's binary_logloss: 0.185986 valid_1's binary_logloss: 0.451256
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[23] training's binary_logloss: 0.289137 valid_1's binary_logloss: 0.425866
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[22] training's binary_logloss: 0.298142 valid_1's binary_logloss: 0.440557
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[25] training's binary_logloss: 0.280682 valid_1's binary_logloss: 0.423601
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[21] training's binary_logloss: 0.291164 valid_1's binary_logloss: 0.492688
Training until validation scores don't improve for 20 rounds
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Early stopping, best iteration is:
[23] training's binary_logloss: 0.28603 valid_1's binary_logloss: 0.459694
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[27] training's binary_logloss: 0.260918 valid_1's binary_logloss: 0.426465
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[37] training's binary_logloss: 0.212882 valid_1's binary_logloss: 0.414531
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[20] training's binary_logloss: 0.308926 valid_1's binary_logloss: 0.453518
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[28] training's binary_logloss: 0.254784 valid_1's binary_logloss: 0.450411
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[21] training's binary_logloss: 0.285502 valid_1's binary_logloss: 0.504963
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[23] training's binary_logloss: 0.28371 valid_1's binary_logloss: 0.479116
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
The selected features: ['pclass' 'sex' 'title' 'age' 'fare']
The agnostic ranking: [2 2 1 1 1 2 2 1 2 1]
The naive ranking: ['ShadowVar9', 'ShadowVar10', 'ShadowVar7', 'ShadowVar4', 'ShadowVar1', 'ShadowVar8', 'ShadowVar6', 'ShadowVar2', 'ShadowVar3', 'ShadowVar5']
CPU times: user 1min 4s, sys: 769 ms, total: 1min 5s
Wall time: 30.5 s
BoostAGroota#
[10]:
# be sure to use the same but non-fitted estimator
model = LGBMClassifier(random_state=1258, verbose=-1, n_estimators=100)
# BoostAGroota
feat_selector = arfsgroot.BoostAGroota(
estimator=model, cutoff=1, iters=10, max_rounds=10, delta=0.1, importance="shap"
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)
# highlight synthetic random variable
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
The selected features: ['pclass' 'sex' 'embarked' 'age' 'fare' 'random_num']
The agnostic ranking: [2 2 2 1 1 1 2 1 2 2]
The naive ranking: ['sex', 'pclass', 'fare', 'age', 'embarked', 'random_num', 'family_size', 'random_cat', 'title', 'is_alone']
Leshy#
You don’t have to worry about the encoding, everything is performed internally. Even if you used CatBoost, the categorical predictors are handled internally as well.
[11]:
cat_idx = [X.columns.get_loc(col) for col in titanic.categorical]
[12]:
%%time
# Leshy
model = LGBMClassifier(random_state=42, verbose=-1, n_estimators=100)
# Leshy, all the predictors, no-preprocessing
feat_selector = arfsgroot.Leshy(
model,
n_estimators=1000,
verbose=1,
max_iter=10,
random_state=42,
importance="fastshap",
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)
# highlight synthetic random variable
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()
/home/bsatom/Documents/arfs/src/arfs/feature_selection/allrelevant.py:325: UserWarning: fasttreeshap is not installed. Fallback to shap.
warnings.warn("fasttreeshap is not installed. Fallback to shap.")
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
/home/bsatom/Documents/arfs_test_nbs/arfsuv/lib/python3.12/site-packages/shap/explainers/_tree.py:544: UserWarning: LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
warnings.warn(
Leshy finished running using shap var. imp.
Iteration: 1 / 10
Confirmed: 3
Tentative: 1
Rejected: 6
All relevant predictors selected in 00:00:27.46
The selected features: ['pclass' 'sex' 'fare']
The agnostic ranking: [1 1 4 6 8 7 2 5 1 3]
The naive ranking: ['sex', 'pclass', 'fare', 'age', 'random_num', 'embarked', 'family_size', 'random_cat', 'title', 'is_alone']
CPU times: user 52.7 s, sys: 1.72 s, total: 54.4 s
Wall time: 28 s
For CatBoost, you just have to configure the estimator without taking care of the categorical predictors. They will be passed to the fit method internally, as illustrated in the code snippet of the ARFS package
[...]
obj_feat = list(set(list(X_tr.columns)) - set(list(X_tr.select_dtypes(include=[np.number]))))
[...]
if check_if_tree_based(estimator):
try:
# handle cat features if supported by the fit method
if is_catboost(estimator) or ('cat_feature' in estimator.fit.__code__.co_varnames):
model = estimator.fit(X_tr, y_tr, sample_weight=w_tr, cat_features=cat_idx)
else:
model = estimator.fit(X_tr, y_tr, sample_weight=w_tr)
[...]
[13]:
%%time
model = CatBoostClassifier(random_state=42, verbose=0)
# Leshy, all the predictors, no-preprocessing
feat_selector = arfsgroot.Leshy(
model,
n_estimators=100,
verbose=1,
max_iter=10,
random_state=42,
importance="fastshap",
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)
# highlight synthetic random variable
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()
/home/bsatom/Documents/arfs/src/arfs/feature_selection/allrelevant.py:325: UserWarning: fasttreeshap is not installed. Fallback to shap.
warnings.warn("fasttreeshap is not installed. Fallback to shap.")
Leshy finished running using shap var. imp.
Iteration: 1 / 10
Confirmed: 5
Tentative: 2
Rejected: 3
All relevant predictors selected in 00:00:08.15
The selected features: ['pclass' 'sex' 'embarked' 'title' 'fare']
The agnostic ranking: [1 1 1 3 5 1 2 2 1 4]
The naive ranking: ['sex', 'pclass', 'title', 'fare', 'embarked', 'age', 'family_size', 'random_cat', 'random_num', 'is_alone']
CPU times: user 8.97 s, sys: 985 ms, total: 9.96 s
Wall time: 8.67 s