ARFS - Using GPU#

You can leverage the GPU implementation of LightGBM (or of other GBM flavours), but this often requires compiling the library with GPU support or installing additional toolkits (such as CUDA).
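Before launching anything heavy, it can help to probe whether the installed LightGBM build actually supports a given device. A minimal sketch, assuming only a working LightGBM install (the helper name is ours, not part of ARFS):

import numpy as np
import lightgbm as lgb
from lightgbm.basic import LightGBMError

def lightgbm_device_works(device: str) -> bool:
    """Return True if this LightGBM build can train a single tree on `device`."""
    X = np.random.rand(200, 5)
    y = np.random.rand(200)
    try:
        lgb.train(
            {"objective": "regression", "device": device, "verbose": -1},
            lgb.Dataset(X, y),
            num_boost_round=1,
        )
        return True
    except LightGBMError:
        # e.g. "GPU Tree Learner was not enabled in this build"
        return False

# lightgbm_device_works("gpu")  requires the OpenCL build
# lightgbm_device_works("cuda") requires a build compiled with -DUSE_CUDA=1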

[1]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:95% !important; }</style>"))
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor

import arfs
from arfs.feature_selection import GrootCV, Leshy
from arfs.utils import load_data
from arfs.benchmark import highlight_tick

rng = np.random.RandomState(seed=42)

# import warnings
# warnings.filterwarnings('ignore')

GrootCV on GPU#

If the data is small, using a GPU might not be the most efficient option: the overhead of moving data to the device and launching kernels can dominate the actual training time, as the timings below illustrate.
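If you prefer to compare devices programmatically rather than eyeballing %%time, a small hedged helper (the function name is ours) does the job; the cells below perform the same comparison inline:

import time

def time_grootcv_fit(X, y, lgbm_params):
    """Wall time (seconds) of a single GrootCV fit with the given LightGBM params."""
    selector = GrootCV(
        objective="rmse", cutoff=1, n_folds=3, n_iter=3,
        silent=True, lgbm_params=lgbm_params,
    )
    start = time.perf_counter()
    selector.fit(X, y)
    return time.perf_counter() - start

# e.g. time_grootcv_fit(X_train, y_train, {"device": "gpu"})
#  vs  time_grootcv_fit(X_train, y_train, {"device": "cpu"})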

[2]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate synthetic data with a Poisson-distributed target variable
bias = 1

n_samples = 10_000  # kept small for a quick run; try 1_000_000 for a realistic GPU benchmark
n_features = 100
n_informative = 20

X, y, true_coef = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=1,
    random_state=8,
    bias=bias,
    coef=True,
)
y = (y - y.mean()) / y.std()
y = np.exp(y)  # Transform to positive values for Poisson distribution
y = rng.poisson(y)  # add Poisson noise to the target variable
# dummy sample weight (e.g. exposure), the smallest being 30 days
w = rng.uniform(30 / 365, 1, size=len(y))
# make the count a Poisson rate (frequency)
y = y / w

X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.5, random_state=42
)

true_coef = pd.Series(true_coef)
true_coef.index = X.columns
true_coef = pd.Series({**{"intercept": bias}, **true_coef})
true_coef

genuine_predictors = true_coef[true_coef > 0.0]

print(f"The true coefficient of the linear data generating process are:\n {true_coef}")
The true coefficient of the linear data generating process are:
 intercept     1.000000
pred_0        0.000000
pred_1        0.000000
pred_2        4.880441
pred_3        0.000000
               ...
pred_95       0.000000
pred_96       0.000000
pred_97      82.010316
pred_98       0.000000
pred_99       0.000000
Length: 101, dtype: float64

GPU

[3]:
%%time
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=3,
    n_iter=3,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params={"device": "gpu", "gpu_device_id": 1},
)
feat_selector.fit(X_train, y_train, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# highlight the genuine predictors in green, the rest with the default colour
for name in true_coef.index:
    if name in genuine_predictors.index:
        fig = highlight_tick(figure=fig, str_match=name, color="green")
    else:
        fig = highlight_tick(figure=fig, str_match=name)

plt.show()
The selected features: ['pred_6' 'pred_11' 'pred_29' 'pred_34' 'pred_49' 'pred_55' 'pred_61'
 'pred_62' 'pred_64' 'pred_68' 'pred_75' 'pred_81' 'pred_84' 'pred_87'
 'pred_93' 'pred_97']
The agnostic ranking: [1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 1 2 1 1 1 1 1
 1 2 1 1 1 1 1 2 1 1 2 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1]
The naive ranking: ['pred_87', 'pred_29', 'pred_62', 'pred_97', 'pred_84', 'pred_75', 'pred_68', 'pred_11', 'pred_49', 'pred_55', 'pred_81', 'pred_64', 'pred_6', 'pred_93', 'pred_61', 'pred_34', 'pred_57', 'pred_76', 'pred_39', 'pred_91', 'pred_14', 'pred_25', 'pred_0', 'pred_37', 'pred_50', 'pred_5', 'pred_52', 'pred_58', 'pred_70', 'pred_48', 'pred_54', 'pred_78', 'pred_12', 'pred_27', 'pred_65', 'pred_53', 'pred_46', 'pred_47', 'pred_86', 'pred_90', 'pred_60', 'pred_9', 'pred_31', 'pred_99', 'pred_56', 'pred_88', 'pred_38', 'pred_7', 'pred_10', 'pred_74', 'pred_19', 'pred_15', 'pred_21', 'pred_40', 'pred_2', 'pred_98', 'pred_16', 'pred_67', 'pred_26', 'pred_66', 'pred_3', 'pred_71', 'pred_43', 'pred_42', 'pred_63', 'pred_13', 'pred_45', 'pred_77', 'pred_24', 'pred_85', 'pred_4', 'pred_32', 'pred_72', 'pred_82', 'pred_22', 'pred_80', 'pred_73', 'pred_30', 'pred_92', 'pred_20', 'pred_94', 'pred_96', 'pred_69', 'pred_44', 'pred_79', 'pred_8', 'pred_1', 'pred_83', 'pred_36', 'pred_41', 'pred_18', 'pred_35', 'pred_59', 'pred_17', 'pred_28', 'pred_51', 'pred_89', 'pred_95', 'pred_33', 'pred_23']
../_images/notebooks_arfs_on_GPU_5_2.png
CPU times: total: 2min 38s
Wall time: 50.1 s

CPU

[4]:
%%time
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=3,
    n_iter=3,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params={"device": "cpu"},
)
feat_selector.fit(X_train, y_train, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# highlight the genuine predictors in green, the rest with the default colour
for name in true_coef.index:
    if name in genuine_predictors.index:
        fig = highlight_tick(figure=fig, str_match=name, color="green")
    else:
        fig = highlight_tick(figure=fig, str_match=name)

plt.show()
The selected features: ['pred_6' 'pred_11' 'pred_29' 'pred_34' 'pred_49' 'pred_55' 'pred_61'
 'pred_62' 'pred_64' 'pred_68' 'pred_75' 'pred_81' 'pred_84' 'pred_87'
 'pred_93' 'pred_97']
The agnostic ranking: [1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 1 2 1 1 1 1 1
 1 2 1 1 1 1 1 2 1 1 2 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1]
The naive ranking: ['pred_87', 'pred_29', 'pred_62', 'pred_97', 'pred_84', 'pred_75', 'pred_68', 'pred_11', 'pred_49', 'pred_55', 'pred_81', 'pred_6', 'pred_64', 'pred_93', 'pred_61', 'pred_34', 'pred_57', 'pred_76', 'pred_50', 'pred_91', 'pred_39', 'pred_37', 'pred_14', 'pred_0', 'pred_5', 'pred_25', 'pred_58', 'pred_65', 'pred_86', 'pred_48', 'pred_52', 'pred_53', 'pred_78', 'pred_47', 'pred_88', 'pred_70', 'pred_27', 'pred_99', 'pred_9', 'pred_7', 'pred_60', 'pred_16', 'pred_21', 'pred_15', 'pred_74', 'pred_46', 'pred_54', 'pred_82', 'pred_56', 'pred_32', 'pred_31', 'pred_22', 'pred_12', 'pred_2', 'pred_90', 'pred_40', 'pred_26', 'pred_94', 'pred_42', 'pred_66', 'pred_38', 'pred_45', 'pred_72', 'pred_10', 'pred_67', 'pred_98', 'pred_63', 'pred_19', 'pred_18', 'pred_59', 'pred_13', 'pred_30', 'pred_17', 'pred_85', 'pred_3', 'pred_92', 'pred_8', 'pred_71', 'pred_4', 'pred_69', 'pred_28', 'pred_35', 'pred_77', 'pred_80', 'pred_95', 'pred_44', 'pred_24', 'pred_43', 'pred_51', 'pred_1', 'pred_73', 'pred_89', 'pred_41', 'pred_20', 'pred_33', 'pred_96', 'pred_83', 'pred_36', 'pred_79', 'pred_23']
../_images/notebooks_arfs_on_GPU_7_2.png
CPU times: total: 1min 18s
Wall time: 32.1 s

Note that on this mid-sized dataset the GPU run is actually slower (50.1 s vs 32.1 s wall time): the device overhead outweighs the speed-up. Below, the same comparison on a smaller dataset, for illustrative purposes. The ARFS Boston loader adds synthetic predictors (random_* noise columns and a genuine_num column), which the plots highlight.

[5]:
boston = load_data(name="Boston")
X, y = boston.data, boston.target
[6]:
%%time
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params={"device": "cpu"},
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# highlight the synthetic random (default colour) and genuine (green) variables
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()
The selected features: ['CRIM' 'NOX' 'RM' 'AGE' 'DIS' 'TAX' 'PTRATIO' 'B' 'LSTAT' 'genuine_num']
The agnostic ranking: [2 1 1 1 2 2 2 2 1 2 2 2 2 1 1 1 1 2]
The naive ranking: ['LSTAT', 'RM', 'genuine_num', 'PTRATIO', 'DIS', 'CRIM', 'NOX', 'AGE', 'TAX', 'B', 'random_num1', 'INDUS', 'random_cat', 'random_cat_2', 'RAD', 'ZN', 'random_num2', 'CHAS']
../_images/notebooks_arfs_on_GPU_10_2.png
CPU times: total: 1min 2s
Wall time: 20.1 s
[7]:
%%time
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params={"device": "gpu"},
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# highlight the synthetic random (default colour) and genuine (green) variables
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()
The selected features: ['CRIM' 'NOX' 'RM' 'AGE' 'DIS' 'TAX' 'PTRATIO' 'LSTAT' 'genuine_num']
The agnostic ranking: [2 1 1 1 2 2 2 2 1 2 2 1 2 1 1 1 1 2]
The naive ranking: ['LSTAT', 'RM', 'genuine_num', 'PTRATIO', 'DIS', 'CRIM', 'NOX', 'AGE', 'TAX', 'B', 'random_num1', 'INDUS', 'random_cat', 'random_cat_2', 'RAD', 'ZN', 'random_num2', 'CHAS']
../_images/notebooks_arfs_on_GPU_11_2.png
CPU times: total: 3min 55s
Wall time: 52.3 s
[8]:
%%time
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params={"device": "cuda"},
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# highlight the synthetic random (default colour) and genuine (green) variables
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()
---------------------------------------------------------------------------
LightGBMError                             Traceback (most recent call last)
File <timed exec>:11

File ~\OneDrive - Allianz\Documents\Projects\GitHub-TB\allrelevantfs\src\arfs\feature_selection\allrelevant.py:2056, in GrootCV.fit(self, X, y, sample_weight)
   2053 # internal encoding (ordinal encoding)
   2054 X, obj_feat, cat_idx = get_pandas_cat_codes(X)
-> 2056 self.selected_features_, self.cv_df, self.sha_cutoff = _reduce_vars_lgb_cv(
   2057     X,
   2058     y,
   2059     objective=self.objective,
   2060     cutoff=self.cutoff,
   2061     n_folds=self.n_folds,
   2062     n_iter=self.n_iter,
   2063     silent=self.silent,
   2064     weight=sample_weight,
   2065     rf=self.rf,
   2066     fastshap=self.fastshap,
   2067     lgbm_params=self.lgbm_params,
   2068     n_jobs=self.n_jobs,
   2069 )
   2071 self.selected_features_ = self.selected_features_.values
   2072 self.support_ = np.asarray(
   2073     [c in self.selected_features_ for c in self.feature_names_in_]
   2074 )

File ~\OneDrive - Allianz\Documents\Projects\GitHub-TB\allrelevantfs\src\arfs\feature_selection\allrelevant.py:2270, in _reduce_vars_lgb_cv(X, y, objective, n_folds, cutoff, n_iter, silent, weight, rf, fastshap, lgbm_params, n_jobs)
   2267 new_x_tr, shadow_names = _create_shadow(X_train)
   2268 new_x_val, _ = _create_shadow(X_val)
-> 2270 bst, shap_matrix, bst.best_iteration = _train_lgb_model(
   2271     new_x_tr,
   2272     y_train,
   2273     weight_tr,
   2274     new_x_val,
   2275     y_val,
   2276     weight_val,
   2277     category_cols=category_cols,
   2278     early_stopping_rounds=20,
   2279     fastshap=fastshap,
   2280     **params,
   2281 )
   2283 importance = _compute_importance(
   2284     new_x_tr, shap_matrix, params, objective, fastshap
   2285 )
   2286 df = _merge_importance_df(
   2287     df=df,
   2288     importance=importance,
   (...)
   2292     silent=silent,
   2293 )

File ~\OneDrive - Allianz\Documents\Projects\GitHub-TB\allrelevantfs\src\arfs\feature_selection\allrelevant.py:2511, in _train_lgb_model(X_train, y_train, weight_train, X_val, y_val, weight_val, category_cols, early_stopping_rounds, fastshap, **params)
   2506 d_valid = lgb.Dataset(
   2507     X_val, label=y_val, weight=weight_val, categorical_feature=category_cols
   2508 )
   2509 watchlist = [d_train, d_valid]
-> 2511 bst = lgb.train(
   2512     params,
   2513     train_set=d_train,
   2514     num_boost_round=10000,
   2515     valid_sets=watchlist,
   2516     categorical_feature=category_cols,
   2517     callbacks=[early_stopping(early_stopping_rounds, False, False)],
   2518 )
   2520 if fastshap:
   2521     try:

File c:\Users\xtbury\AppData\Local\mambaforge\envs\arfs\lib\site-packages\lightgbm\engine.py:271, in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)
    269 # construct booster
    270 try:
--> 271     booster = Booster(params=params, train_set=train_set)
    272     if is_valid_contain_train:
    273         booster.set_train_data_name(train_data_name)

File c:\Users\xtbury\AppData\Local\mambaforge\envs\arfs\lib\site-packages\lightgbm\basic.py:2610, in Booster.__init__(self, params, train_set, model_file, model_str, silent)
   2608 params_str = param_dict_to_str(params)
   2609 self.handle = ctypes.c_void_p()
-> 2610 _safe_call(_LIB.LGBM_BoosterCreate(
   2611     train_set.handle,
   2612     c_str(params_str),
   2613     ctypes.byref(self.handle)))
   2614 # save reference to data
   2615 self.train_set = train_set

File c:\Users\xtbury\AppData\Local\mambaforge\envs\arfs\lib\site-packages\lightgbm\basic.py:125, in _safe_call(ret)
    117 """Check the return value from C API call.
    118
    119 Parameters
   (...)
    122     The return value from C API calls.
    123 """
    124 if ret != 0:
--> 125     raise LightGBMError(_LIB.LGBM_GetLastError().decode('utf-8'))

LightGBMError: CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
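The device="cuda" tree learner is a separate LightGBM build: as the error says, it must be compiled with the CMake option -DUSE_CUDA=1, while device="gpu" uses the OpenCL build installed here. A hedged fallback sketch, reusing the lightgbm_device_works probe from the introduction (both helper names are ours):

def first_supported_device(candidates=("cuda", "gpu", "cpu")):
    """Return the first device this LightGBM build can actually train on."""
    for device in candidates:
        if lightgbm_device_works(device):
            return device
    return "cpu"

feat_selector = GrootCV(
    objective="rmse", cutoff=1, n_folds=5, n_iter=5, silent=True,
    lgbm_params={"device": first_supported_device()},
)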

Leshy on GPU#

[9]:
model = LGBMRegressor(random_state=42, verbose=-1, device="gpu")  # for Leshy, the device is configured on the estimator itself
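Leshy wraps any scikit-learn-compatible estimator, so other GBM flavours can be moved to the GPU the same way. A hedged sketch with XGBoost (an assumption, not part of this notebook; the GPU parameter spelling depends on your XGBoost version):

from xgboost import XGBRegressor

# XGBoost >= 2.0 spells GPU training as device="cuda";
# older versions used tree_method="gpu_hist" instead.
xgb_model = XGBRegressor(random_state=42, tree_method="hist", device="cuda")
xgb_selector = Leshy(
    xgb_model, n_estimators=20, verbose=1, max_iter=10,
    random_state=42, importance="native",
)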
[10]:
%%time
# Leshy
feat_selector = Leshy(
    model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance="native"
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# highlight the synthetic random (default colour) and genuine (green) variables
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()


Leshy finished running using native var. imp.

Iteration:      1 / 10
Confirmed:      9
Tentative:      2
Rejected:       7
All relevant predictors selected in 00:00:02.63
The selected features: ['CRIM' 'NOX' 'RM' 'AGE' 'DIS' 'PTRATIO' 'B' 'LSTAT' 'genuine_num']
The agnostic ranking: [1 7 3 8 1 1 1 1 4 2 1 1 1 2 7 3 5 1]
The naive ranking: ['RM', 'genuine_num', 'LSTAT', 'CRIM', 'NOX', 'DIS', 'AGE', 'PTRATIO', 'B', 'TAX', 'random_num1', 'INDUS', 'random_cat', 'RAD', 'random_cat_2', 'random_num2', 'ZN', 'CHAS']
../_images/notebooks_arfs_on_GPU_15_2.png
CPU times: total: 10.9 s
Wall time: 3.3 s
[11]:
model = LGBMRegressor(random_state=42, verbose=-1, device="cpu")  # same estimator, trained on CPU
[12]:
%%time
# Leshy
feat_selector = Leshy(
    model, n_estimators=20, verbose=1, max_iter=10, random_state=42, importance="native"
)
feat_selector.fit(X, y, sample_weight=None)
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# highlight the synthetic random (default colour) and genuine (green) variables
fig = highlight_tick(figure=fig, str_match="random")
fig = highlight_tick(figure=fig, str_match="genuine", color="green")
plt.show()


Leshy finished running using native var. imp.

Iteration:      1 / 10
Confirmed:      9
Tentative:      2
Rejected:       7
All relevant predictors selected in 00:00:00.44
The selected features: ['CRIM' 'NOX' 'RM' 'AGE' 'DIS' 'PTRATIO' 'B' 'LSTAT' 'genuine_num']
The agnostic ranking: [1 7 3 8 1 1 1 1 4 2 1 1 1 2 7 3 5 1]
The naive ranking: ['RM', 'genuine_num', 'LSTAT', 'CRIM', 'NOX', 'DIS', 'AGE', 'PTRATIO', 'B', 'TAX', 'random_num1', 'INDUS', 'random_cat', 'RAD', 'random_cat_2', 'random_num2', 'ZN', 'CHAS']
../_images/notebooks_arfs_on_GPU_17_2.png
CPU times: total: 2.97 s
Wall time: 1.16 s
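As with GrootCV, the CPU run is faster here (1.16 s vs 3.3 s wall time): on a dataset as small as Boston, the GPU overhead outweighs any gain. The GPU path mainly pays off on large datasets, and only when LightGBM was built with the matching GPU (OpenCL) or CUDA support.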