ARFS - Using GPU#

You can leverage the GPU implementation of LightGBM (or of other GBM flavours), but this often requires compiling or installing additional libraries or toolkits (such as CUDA).

[1]:

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:95% !important; }</style>"))
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor

import arfs
from arfs.feature_selection import GrootCV, Leshy
from arfs.utils import load_data
from arfs.benchmark import highlight_tick

# Notebook-level pseudo-random generator (legacy RandomState API), seeded with 42
rng = np.random.RandomState(42)

# import warnings
# warnings.filterwarnings('ignore')

GrootCV on GPU#

If the data set is small, using a GPU might not be the most efficient option.

[ ]:

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate synthetic regression data and turn the target into a Poisson rate
# (count / exposure), keeping the true coefficients for later comparison.
bias = 1

n_samples = 10_000  # e.g. 1_000_000 for a larger benchmark
n_features = 100
n_informative = 20

# Seeded generator created inside the cell: the Poisson noise and the exposure
# weights are reproducible, and re-running the cell regenerates the same data.
data_rng = np.random.RandomState(seed=42)

X, y, true_coef = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=1,
    random_state=8,
    bias=bias,
    coef=True,
)
y = (y - y.mean()) / y.std()
y = np.exp(y)  # Transform to positive values, used as the Poisson mean
y = data_rng.poisson(y)  # Add Poisson noise to the target variable
# dummy sample weight (e.g. exposure), smallest being 30 days
w = data_rng.uniform(30 / 365, 1, size=len(y))
# make the count a Poisson rate (frequency)
y = y / w

X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]

# Split the data into training and testing sets (50/50)
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.5, random_state=42
)

# Label the coefficients with the column names and prepend the intercept
true_coef = pd.Series(true_coef)
true_coef.index = X.columns
true_coef = pd.Series({**{"intercept": bias}, **true_coef})

# Strictly positive coefficients are treated as genuine predictors; since
# bias = 1, the "intercept" entry is part of this selection as well.
genuine_predictors = true_coef[true_coef > 0.0]

print(f"The true coefficient of the linear data generating process are:\n {true_coef}")

GPU

[ ]:

%%time
# GrootCV on the synthetic training set, with LightGBM asked to run on the GPU.
# NOTE(review): "gpu_device_id": 1 presumably targets a second device - confirm
# that this id exists on the machine running the notebook.
gpu_lgbm_params = {"device": "gpu", "gpu_device_id": 1}
groot_settings = dict(
    objective="rmse",
    cutoff=1,
    n_folds=3,
    n_iter=3,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params=gpu_lgbm_params,
)
feat_selector = GrootCV(**groot_settings)
feat_selector.fit(X_train, y_train, sample_weight=None)

# Report the selection and the two rankings, then draw the importance plot
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")
fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Genuine predictors are highlighted in green, every other coefficient name
# with highlight_tick's default colour.
for coef_name in true_coef.index:
    tick_style = {"color": "green"} if coef_name in genuine_predictors.index else {}
    fig = highlight_tick(figure=fig, str_match=coef_name, **tick_style)

plt.show()

CPU

[ ]:

%%time
# Same GrootCV configuration as the GPU run, but LightGBM is kept on the CPU
# so that the two timings can be compared.
cpu_lgbm_params = {"device": "cpu"}
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=3,
    n_iter=3,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params=cpu_lgbm_params,
)
feat_selector.fit(X_train, y_train, sample_weight=None)

# Text summary of the selection
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

# Importance plot: genuine predictors in green, the remaining names with
# highlight_tick's default colour.
fig = feat_selector.plot_importance(n_feat_per_inch=5)
for coef_name in true_coef.index:
    is_genuine = coef_name in genuine_predictors.index
    if not is_genuine:
        fig = highlight_tick(figure=fig, str_match=coef_name)
        continue
    fig = highlight_tick(figure=fig, str_match=coef_name, color="green")

plt.show()

On a smaller data set, for illustrative purposes.

[5]:

# Load the "Boston" data through the ARFS helper; from here on X and y refer
# to this data set instead of the synthetic one.
boston = load_data(name="Boston")
X = boston.data
y = boston.target

[ ]:

%%time
# GrootCV on the Boston data with LightGBM running on the CPU
cpu_settings = dict(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params={"device": "cpu"},
)
feat_selector = GrootCV(**cpu_settings)
feat_selector.fit(X, y, sample_weight=None)

# Text summary of the selection
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

# Importance plot: ticks matching "random" get the default highlight colour,
# ticks matching "genuine" are drawn in green.
fig = feat_selector.plot_importance(n_feat_per_inch=5)
for pattern, tick_style in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **tick_style)
plt.show()

[ ]:

%%time
# GrootCV on the Boston data with LightGBM's "gpu" device
gpu_lgbm_params = {"device": "gpu"}
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    silent=True,
    fastshap=True,
    n_jobs=0,
    lgbm_params=gpu_lgbm_params,
)
feat_selector.fit(X, y, sample_weight=None)

# Print the selected features and both rankings
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

# Plot the importances; "random" ticks use the default highlight colour and
# "genuine" ticks are coloured green.
fig = feat_selector.plot_importance(n_feat_per_inch=5)
fig = highlight_tick(str_match="random", figure=fig)
fig = highlight_tick(str_match="genuine", figure=fig, color="green")
plt.show()

[ ]:

%%time
# GrootCV on the Boston data with LightGBM's "cuda" device
cuda_settings = {
    "objective": "rmse",
    "cutoff": 1,
    "n_folds": 5,
    "n_iter": 5,
    "silent": True,
    "fastshap": True,
    "n_jobs": 0,
    "lgbm_params": {"device": "cuda"},
}
feat_selector = GrootCV(**cuda_settings)
feat_selector.fit(X, y, sample_weight=None)

# Selection summary
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

# Importance plot with highlighted ticks: default colour for names matching
# "random", green for names matching "genuine".
fig = feat_selector.plot_importance(n_feat_per_inch=5)
for pattern, tick_style in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **tick_style)
plt.show()

Leshy on GPU#

[9]:

# Base LightGBM regressor for Leshy, configured with the "gpu" device
model = LGBMRegressor(
    random_state=42,
    verbose=-1,
    device="gpu",
)

[ ]:

%%time
# Leshy wrapped around the regressor defined in the previous cell, using the
# model's native feature importance.
leshy_settings = dict(
    n_estimators=20,
    verbose=1,
    max_iter=10,
    random_state=42,
    importance="native",
)
feat_selector = Leshy(model, **leshy_settings)
feat_selector.fit(X, y, sample_weight=None)

# Selection summary
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

# Importance plot: ticks matching "random" get the default highlight colour,
# ticks matching "genuine" are drawn in green.
fig = feat_selector.plot_importance(n_feat_per_inch=5)
for pattern, tick_style in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **tick_style)
plt.show()

[11]:

# Base LightGBM regressor for Leshy, configured with the "cpu" device
model = LGBMRegressor(
    random_state=42,
    verbose=-1,
    device="cpu",
)

[ ]:

%%time
# Leshy with the regressor defined in the previous cell and native importances
feat_selector = Leshy(
    model,
    n_estimators=20,
    verbose=1,
    max_iter=10,
    random_state=42,
    importance="native",
)
feat_selector.fit(X, y, sample_weight=None)

# Print the selected features and both rankings
print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

# Plot the importances; "random" ticks use the default highlight colour and
# "genuine" ticks are coloured green.
fig = feat_selector.plot_importance(n_feat_per_inch=5)
fig = highlight_tick(str_match="random", figure=fig)
fig = highlight_tick(str_match="genuine", figure=fig, color="green")
plt.show()