ARFS - Using GPU#
You can leverage the GPU implementation of LightGBM (or of other GBM flavours), but this often requires compiling or installing additional libraries or toolkits (such as CUDA).
[1]:
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:95% !important; }</style>"))
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
import arfs
from arfs.feature_selection import GrootCV, Leshy
from arfs.utils import load_data
from arfs.benchmark import highlight_tick
# Legacy NumPy random generator with a fixed seed (42).
rng = np.random.RandomState(42)
# import warnings
# warnings.filterwarnings('ignore')
GrootCV on GPU#
If the data is small, using a GPU might not be the most efficient option.
[ ]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate synthetic data with a Poisson-distributed target variable
bias = 1
n_samples = 10_000  # quick run; try 1_000_000 for a larger benchmark
n_features = 100
n_informative = 20

X, y, true_coef = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=1,
    random_state=8,
    bias=bias,
    coef=True,
)

# Standardise the linear response, then map it to strictly positive values so it
# can be used as a Poisson mean.
y = (y - y.mean()) / y.std()
y = np.exp(y)

# Draw from the seeded notebook-level `rng` (not the unseeded global np.random
# state) so that Restart & Run All reproduces the same data set.
y = rng.poisson(y)  # add Poisson noise to the target variable

# dummy sample weight (e.g. exposure), smallest being 30 days
w = rng.uniform(30 / 365, 1, size=len(y))

# make the count a Poisson rate (frequency)
y = y / w

X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.5, random_state=42
)

# Label the coefficients with the predictor names and prepend the intercept.
true_coef = pd.Series(true_coef)
true_coef.index = X.columns
true_coef = pd.Series({**{"intercept": bias}, **true_coef})

# Entries with a strictly positive coefficient; since bias = 1 the "intercept"
# entry is part of this selection as well.
genuine_predictors = true_coef[true_coef > 0.0]
print(f"The true coefficient of the linear data generating process are:\n {true_coef}")
GPU
[ ]:
%%time
# GrootCV on the synthetic training split, requesting the GPU device through lgbm_params.
# NOTE(review): gpu_device_id=1 presumably matches the author's hardware; confirm
# that this device index is valid on your machine.
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=3,
    n_iter=3,
    n_jobs=0,
    silent=True,
    fastshap=True,
    lgbm_params=dict(device="gpu", gpu_device_id=1),
)
feat_selector.fit(X_train, y_train, sample_weight=None)

print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Names found in `genuine_predictors` get a green tick label; every other name is
# passed to highlight_tick without an explicit colour.
for name in true_coef.index:
    tick_kwargs = {"color": "green"} if name in genuine_predictors.index else {}
    fig = highlight_tick(figure=fig, str_match=name, **tick_kwargs)

plt.show()
CPU
[ ]:
%%time
# Same GrootCV configuration as the GPU run on the synthetic training split, but
# with the device set to "cpu" so the two timings can be compared.
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=3,
    n_iter=3,
    n_jobs=0,
    silent=True,
    fastshap=True,
    lgbm_params=dict(device="cpu"),
)
feat_selector.fit(X_train, y_train, sample_weight=None)

print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Names found in `genuine_predictors` get a green tick label; every other name is
# passed to highlight_tick without an explicit colour.
for name in true_coef.index:
    tick_kwargs = {"color": "green"} if name in genuine_predictors.index else {}
    fig = highlight_tick(figure=fig, str_match=name, **tick_kwargs)

plt.show()
On a smaller data set, for illustrative purposes.
[5]:
# Load the "Boston" data set through the arfs load_data helper and rebind X / y
# to its predictors and target for the cells below.
boston = load_data(name="Boston")
X = boston.data
y = boston.target
[ ]:
%%time
# GrootCV on the Boston data with the device set to "cpu".
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    n_jobs=0,
    silent=True,
    fastshap=True,
    lgbm_params=dict(device="cpu"),
)
feat_selector.fit(X, y, sample_weight=None)

print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Highlight tick labels matching "random" (default colour) first, then those
# matching "genuine" in green.
for pattern, extra in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **extra)

plt.show()
[ ]:
%%time
# GrootCV on the Boston data with the device set to "gpu".
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    n_jobs=0,
    silent=True,
    fastshap=True,
    lgbm_params=dict(device="gpu"),
)
feat_selector.fit(X, y, sample_weight=None)

print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Highlight tick labels matching "random" (default colour) first, then those
# matching "genuine" in green.
for pattern, extra in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **extra)

plt.show()
[ ]:
%%time
# GrootCV on the Boston data with the device set to "cuda".
feat_selector = GrootCV(
    objective="rmse",
    cutoff=1,
    n_folds=5,
    n_iter=5,
    n_jobs=0,
    silent=True,
    fastshap=True,
    lgbm_params=dict(device="cuda"),
)
feat_selector.fit(X, y, sample_weight=None)

print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Highlight tick labels matching "random" (default colour) first, then those
# matching "genuine" in green.
for pattern, extra in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **extra)

plt.show()
Leshy on GPU#
[9]:
# LightGBM regressor configured with device="gpu"; used as the base model for Leshy below.
model = LGBMRegressor(
    device="gpu",
    random_state=42,
    verbose=-1,
)
[ ]:
%%time
# Leshy feature selection wrapped around the `model` defined in the previous cell,
# using the model's native importance.
feat_selector = Leshy(
    model,
    n_estimators=20,
    max_iter=10,
    importance="native",
    random_state=42,
    verbose=1,
)
feat_selector.fit(X, y, sample_weight=None)

print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Highlight tick labels matching "random" (default colour) first, then those
# matching "genuine" in green.
for pattern, extra in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **extra)

plt.show()
[11]:
# LightGBM regressor configured with device="cpu"; used as the base model for Leshy below.
model = LGBMRegressor(
    device="cpu",
    random_state=42,
    verbose=-1,
)
[ ]:
%%time
# Leshy feature selection wrapped around the `model` defined in the previous cell,
# using the model's native importance.
feat_selector = Leshy(
    model,
    n_estimators=20,
    max_iter=10,
    importance="native",
    random_state=42,
    verbose=1,
)
feat_selector.fit(X, y, sample_weight=None)

print(f"The selected features: {feat_selector.get_feature_names_out()}")
print(f"The agnostic ranking: {feat_selector.ranking_}")
print(f"The naive ranking: {feat_selector.ranking_absolutes_}")

fig = feat_selector.plot_importance(n_feat_per_inch=5)

# Highlight tick labels matching "random" (default colour) first, then those
# matching "genuine" in green.
for pattern, extra in (("random", {}), ("genuine", {"color": "green"})):
    fig = highlight_tick(figure=fig, str_match=pattern, **extra)

plt.show()