ARFS - Non Normal loss and Sample Weight#

Feel free to choose any loss function that suits your needs, whether it’s one provided by the library or a custom one that you define. The underlying estimator should support the use of custom loss functions for you to take full advantage of this flexibility.

[1]:

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:95% !important; }</style>"))
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from lightgbm import LGBMRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import arfs
from arfs.feature_selection import GrootCV, Leshy, BoostAGroota, make_fs_summary
from arfs.feature_selection import MinRedundancyMaxRelevance
from arfs.utils import load_data
from arfs.benchmark import highlight_tick

rng = np.random.RandomState(seed=42)


def plot_y_vs_X(
    X: pd.DataFrame, y: pd.Series, ncols: int = 2, figsize: tuple = (10, 10)
) -> plt.Figure:
    """
    Create subplots of scatter plots showing the relationship between each column in X and the target variable y.

    Parameters
    ----------
        X : pd.DataFrame
            The input DataFrame containing the predictor variables.
        y : pd.Series
            The target variable to be plotted against.
        ncols : int, optional (default: 2)
            The number of columns in the subplot grid.
        figsize : tuple, optional (default: (10, 10))
            The size of the figure (width, height) in inches.

    Returns
    -------
        plt.Figure
            The generated Figure object containing the scatter plots.
    """

    n_rows, ncols_to_plot = divmod(X.shape[1], ncols)
    n_rows += int(ncols_to_plot > 0)

    # Create figure and axes
    f, axs = plt.subplots(nrows=n_rows, ncols=ncols, figsize=figsize)

    # Iterate through columns and plot against y
    for ax, col in zip(axs.flat, X.columns):
        ax.scatter(X[col], y, alpha=0.1)
        ax.set_title(col)

    # Remove any unused subplots
    for ax in axs.flat[len(X.columns) :]:
        ax.set_axis_off()

    # Display the figure
    plt.tight_layout()
    return f

[2]:

# Generate synthetic data with Poisson-distributed target variable
bias = 1
X, y, true_coef = make_regression(
    n_samples=2_000,
    n_features=10,
    n_informative=5,
    noise=1,
    random_state=8,
    bias=bias,
    coef=True,
)
y = (y - y.mean()) / y.std()
y = np.exp(y)  # Transform to positive values for Poisson distribution
y = np.random.poisson(y)  # Add Poisson noise to the target variable
# dummy sample weight (e.g. exposure), smallest being 30 days
w = np.random.uniform(30 / 365, 1, size=len(y))
# make the count a Poisson rate (frequency)
y = y / w

X = pd.DataFrame(X)
X.columns = [f"pred_{i}" for i in range(X.shape[1])]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.5, random_state=42
)

true_coef = pd.Series(true_coef)
true_coef.index = X.columns
true_coef = pd.Series({**{"intercept": bias}, **true_coef})
true_coef

genuine_predictors = true_coef[true_coef > 0.0]

print(f"The true coefficient of the linear data generating process are:\n {true_coef}")

f = plot_y_vs_X(X_train, y_train, ncols=5, figsize=(15, 5))

The true coefficient of the linear data generating process are:
 intercept     1.000000
pred_0        0.000000
pred_1       77.903644
pred_2        0.000000
pred_3        0.000000
pred_4       63.702300
pred_5        0.000000
pred_6       95.594997
pred_7       43.869903
pred_8        0.000000
pred_9        4.118619
dtype: float64

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_2_1.png

[3]:

genuine_predictors

[3]:

intercept     1.000000
pred_1       77.903644
pred_4       63.702300
pred_6       95.594997
pred_7       43.869903
pred_9        4.118619
dtype: float64

illustrating the pipelining

[4]:

# Create a pipeline with LassoFeatureSelection and LinearRegression
pipeline = Pipeline(
    [
        ("scaler", StandardScaler().set_output(transform="pandas")),
        (
            "selector",
            GrootCV(
                objective="poisson",
                cutoff=1,
                n_folds=5,
                n_iter=5,
                silent=True,
                fastshap=True,
                n_jobs=0,
            ),
        ),
        ("glm", PoissonRegressor()),
    ]
)

# Fit the pipeline to the training data
pipeline.fit(
    X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train
)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate the residuals
residuals = y_test - y_pred

# Plot the predictions and residuals
plt.figure(figsize=(10, 5))

# Plot predictions
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.05)
plt.plot(
    [min(y_test), 10],
    [min(y_test), 10],
    color="#d1ae11",
    linestyle="--",
)
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title("True Values vs. Predictions")


# Plot residuals
plt.subplot(1, 2, 2)
sns.histplot(residuals, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Distribution of Residuals")

plt.show()

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_5_2.png

[5]:

print(
    f"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}"
)
print(f"The agnostic ranking: {pipeline.named_steps['selector'].ranking_}")
print(f"The naive ranking: {pipeline.named_steps['selector'].ranking_absolutes_}")
fig = pipeline.named_steps["selector"].plot_importance(n_feat_per_inch=5)

# highlight synthetic random variable
for name in true_coef.index:
    if name in genuine_predictors.index:
        fig = highlight_tick(figure=fig, str_match=name, color="green")
    else:
        fig = highlight_tick(figure=fig, str_match=name)

plt.show()

The selected features: ['pred_1' 'pred_4' 'pred_6' 'pred_7']
The agnostic ranking: [1 2 1 1 2 1 2 2 1 1]
The naive ranking: ['ShadowVar1', 'ShadowVar3', 'ShadowVar5', 'ShadowVar4', 'ShadowVar2', 'ShadowVar7', 'ShadowVar6', 'ShadowVar8', 'ShadowVar10', 'ShadowVar9']

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_6_1.png

[6]:

make_fs_summary(pipeline)

Styler.applymap has been deprecated. Use Styler.map instead.

[6]:

	predictor	scaler	selector	glm
0	pred_0	nan	0	nan
1	pred_1	nan	1	nan
2	pred_2	nan	0	nan
3	pred_3	nan	0	nan
4	pred_4	nan	1	nan
5	pred_5	nan	0	nan
6	pred_6	nan	1	nan
7	pred_7	nan	1	nan
8	pred_8	nan	0	nan
9	pred_9	nan	0	nan

Leshy#

[7]:

model = LGBMRegressor(random_state=42, verbose=-1, objective="poisson")

pipeline = Pipeline(
    [
        ("scaler", StandardScaler().set_output(transform="pandas")),
        (
            "selector",
            Leshy(
                model,
                n_estimators=20,
                verbose=1,
                max_iter=10,
                random_state=42,
                importance="fastshap",
            ),
        ),
        ("glm", PoissonRegressor()),
    ]
)

# Fit the pipeline to the training data
pipeline.fit(
    X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train
)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate the residuals
residuals = y_test - y_pred

# Plot the predictions and residuals
plt.figure(figsize=(10, 5))

# Plot predictions
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.05)
plt.plot(
    [min(y_test), 10],
    [min(y_test), 10],
    color="#d1ae11",
    linestyle="--",
)
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title("True Values vs. Predictions")


# Plot residuals
plt.subplot(1, 2, 2)
sns.histplot(residuals, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Distribution of Residuals")

plt.show()



Leshy finished running using native var. imp.

Iteration:      1 / 10
Confirmed:      4
Tentative:      3
Rejected:       3
All relevant predictors selected in 00:00:01.70

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_9_3.png

[8]:

print(
    f"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}"
)
print(f"The agnostic ranking: {pipeline.named_steps['selector'].ranking_}")
print(f"The naive ranking: {pipeline.named_steps['selector'].ranking_absolutes_}")
fig = pipeline.named_steps["selector"].plot_importance(n_feat_per_inch=5)

# highlight synthetic random variable
for name in true_coef.index:
    if name in genuine_predictors.index:
        fig = highlight_tick(figure=fig, str_match=name, color="green")
    else:
        fig = highlight_tick(figure=fig, str_match=name)

plt.show()
plt.show()

The selected features: ['pred_1' 'pred_4' 'pred_6' 'pred_7']
The agnostic ranking: [2 1 3 5 1 2 1 1 2 4]
The naive ranking: ['pred_6', 'pred_1', 'pred_4', 'pred_7', 'pred_0', 'pred_5', 'pred_8', 'pred_2', 'pred_9', 'pred_3']

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_10_1.png

BoostAGRoota#

[9]:

model = LGBMRegressor(random_state=42, verbose=-1, objective="poisson")

pipeline = Pipeline(
    [
        ("scaler", StandardScaler().set_output(transform="pandas")),
        (
            "selector",
            BoostAGroota(
                estimator=model,
                cutoff=1,
                iters=10,
                max_rounds=10,
                delta=0.1,
                importance="fastshap",
            ),
        ),
        ("glm", PoissonRegressor()),
    ]
)

# Fit the pipeline to the training data
pipeline.fit(
    X_train, y_train, selector__sample_weight=w_train, glm__sample_weight=w_train
)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate the residuals
residuals = y_test - y_pred

# Plot the predictions and residuals
plt.figure(figsize=(10, 5))

# Plot predictions
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.05)
plt.plot(
    [min(y_test), 10],
    [min(y_test), 10],
    color="#d1ae11",
    linestyle="--",
)
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title("True Values vs. Predictions")

# Plot residuals
plt.subplot(1, 2, 2)
sns.histplot(residuals, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Distribution of Residuals")

plt.show()

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_12_2.png

[10]:

print(
    f"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}"
)
print(f"The agnostic ranking: {pipeline.named_steps['selector'].ranking_}")
print(f"The naive ranking: {pipeline.named_steps['selector'].ranking_absolutes_}")
fig = pipeline.named_steps["selector"].plot_importance(n_feat_per_inch=5)

# highlight synthetic random variable
for name in true_coef.index:
    if name in genuine_predictors.index:
        fig = highlight_tick(figure=fig, str_match=name, color="green")
    else:
        fig = highlight_tick(figure=fig, str_match=name)

plt.show()

The selected features: ['pred_1' 'pred_4' 'pred_5' 'pred_6' 'pred_7']
The agnostic ranking: [1 2 1 1 2 2 2 2 1 1]
The naive ranking: ['pred_6', 'pred_1', 'pred_4', 'pred_7', 'pred_5', 'pred_0', 'pred_8', 'pred_9', 'pred_2', 'pred_3']

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_13_1.png

MRmr#

[11]:

mrmr = MinRedundancyMaxRelevance(
    n_features_to_select=5,
    relevance_func=None,
    redundancy_func=None,
    task="regression",  # "classification",
    denominator_func=np.mean,
    only_same_domain=False,
    return_scores=False,
    show_progress=True,
    n_jobs=-1,
)

pipeline = Pipeline(
    [
        ("selector", mrmr),
        ("scaler", StandardScaler().set_output(transform="pandas")),
        ("glm", PoissonRegressor()),
    ]
)

# Fit the pipeline to the training data
pipeline.fit(
    X_train,
    pd.Series(y_train),
    selector__sample_weight=pd.Series(w_train),
    glm__sample_weight=w_train,
)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate the residuals
residuals = y_test - y_pred

# Plot the predictions and residuals
plt.figure(figsize=(10, 5))

# Plot predictions
plt.subplot(1, 2, 1)
plt.scatter(y_test, y_pred, alpha=0.05)
plt.plot(
    [min(y_test), 10],
    [min(y_test), 10],
    color="#d1ae11",
    linestyle="--",
)
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title("True Values vs. Predictions")


# Plot residuals
plt.subplot(1, 2, 2)
sns.histplot(residuals, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Distribution of Residuals")

plt.show()

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

../_images/notebooks_arfs_non_normal_loss_and_sample_weight_15_2.png

[12]:

genuine_predictors

[12]:

intercept     1.000000
pred_1       77.903644
pred_4       63.702300
pred_6       95.594997
pred_7       43.869903
pred_9        4.118619
dtype: float64

[13]:

print(
    f"The selected features: {pipeline.named_steps['selector'].get_feature_names_out()}"
)

The selected features: ['pred_1' 'pred_3' 'pred_4' 'pred_6' 'pred_7']

[14]:

pipeline.named_steps["selector"].ranking_

[14]:

	mrmr	relevance	redundancy
pred_6	inf	2.147684	0.000000
pred_1	25.663894	1.000965	0.039003
pred_4	17.704093	0.827022	0.046714
pred_7	0.630814	0.044338	0.070287
pred_3	-11.738380	-0.678216	0.057778