# Source code for arfs.feature_selection.base

"""Base Submodule

This module provides a base class for selectors using a statistic and a threshold

Module Structure:
-----------------
- ``BaseThresholdSelector``: parent class for the "threshold-based" selectors

"""

# Settings and libraries
from __future__ import print_function

# pandas
import pandas as pd

# numpy
import numpy as np

# sklearn

from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin


# Fix the seed of NumPy's global random generator for reproducibility.
# NOTE: this statement runs at import time, so importing this module
# re-seeds the process-wide `np.random` state with the value 7.
np.random.seed(7)


class BaseThresholdSelector(SelectorMixin, BaseEstimator):
    """Base class for threshold-based feature selection.

    A per-column statistic is computed with ``statistic_fn`` and compared to
    ``threshold``. Columns whose statistic satisfies the comparison are kept,
    the others are rejected.

    Parameters
    ----------
    threshold : float, default=0.05
        Cut-off value the per-column statistic is compared to (strict
        inequality, see ``greater_than_threshold``).
    statistic_fn : callable, optional
        The function computing the statistic series. It is called as
        ``statistic_fn(X)`` and must return a ``pd.Series`` whose index is the
        column names and whose values are the computed statistic, with one
        value per column of ``X`` in the same order as ``X.columns`` (the
        selection mask is applied positionally). Must be provided before
        calling ``fit``.
    greater_than_threshold : bool, default=False
        If True, the features whose statistic is strictly greater than
        ``threshold`` are kept. If False, the features whose statistic is
        strictly lower than ``threshold`` are kept.

    Attributes
    ----------
    n_features_in_ : int
        number of input predictors
    feature_names_in_ : ndarray of str
        names of the columns seen during ``fit``
    statistic_series_ : pd.Series
        the object returned by ``statistic_fn(X)``
    statistic_df_ : pd.DataFrame
        single-column ("statistic") dataframe, sorted in descending order
    support_ : ndarray of bool
        the boolean mask of the selected X-columns
    selected_features_ : ndarray of str
        the names of selected features
    not_selected_features_ : ndarray of str
        the names of rejected features

    """

    def __init__(
        self,
        threshold=0.05,
        statistic_fn=None,
        greater_than_threshold=False,
    ):
        self.threshold = threshold
        self.statistic_fn = statistic_fn
        self.greater_than_threshold = greater_than_threshold

    def fit(self, X, y=None, sample_weight=None):
        """Learn empirical statistics from X.

        Parameters
        ----------
        X : pd.DataFrame, shape (n_samples, n_features)
            Data from which to compute the statistic, where `n_samples` is
            the number of samples and `n_features` is the number of features.
        y : any, default=None
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.
        sample_weight : pd.Series, optional, shape (n_samples,)
            Not used by this base implementation (``statistic_fn`` is called
            with ``X`` only); accepted for signature compatibility.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame or if ``statistic_fn`` is not
            callable.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe")
        # Fail early with an explicit message instead of the opaque
        # "'NoneType' object is not callable" raised by the default value.
        if not callable(self.statistic_fn):
            raise TypeError(
                "statistic_fn must be a callable returning a pd.Series indexed "
                "by the column names"
            )

        self.feature_names_in_ = X.columns.to_numpy()
        self.n_features_in_ = X.shape[1]

        self.statistic_series_ = self.statistic_fn(X)

        # Build the frame with an explicit column name so that a *named*
        # series does not break the sort below; largest statistic on top.
        self.statistic_df_ = pd.DataFrame(
            {"statistic": self.statistic_series_}
        ).sort_values("statistic", ascending=False)

        # The mask is positional: it relies on the statistic values being in
        # the same order as the columns of X.
        statistic_values = self.statistic_series_.values
        if self.greater_than_threshold:
            self.support_ = statistic_values > self.threshold
        else:
            self.support_ = statistic_values < self.threshold

        self.selected_features_ = self.feature_names_in_[self.support_]
        self.not_selected_features_ = self.feature_names_in_[~self.support_]

        return self

    def _get_support_mask(self):
        """Return the boolean mask of the selected columns (requires a fit)."""
        check_is_fitted(self)

        return self.support_

    def transform(self, X):
        """
        Transform the data, returns a transformed version of `X`.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            Input samples. Must contain the columns selected during ``fit``.

        Returns
        -------
        X_new : pd.DataFrame of shape (n_samples, n_features_new)
            `X` restricted to the selected columns.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        sklearn.exceptions.NotFittedError
            If the selector has not been fitted yet.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe")
        # Explicit "not fitted" error rather than a bare AttributeError on
        # `selected_features_`.
        check_is_fitted(self)
        return X[self.selected_features_]

    def fit_transform(self, X, y=None, sample_weight=None, **fit_params):
        """
        Fit to data, then transform it.

        Fits the selector to `X` (forwarding `y`, `sample_weight` and
        `fit_params` to ``fit``) and returns a transformed version of `X`.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            Input samples.
        y :  array-like of shape (n_samples,) or (n_samples, n_outputs), \
                default=None
            Target values (None for unsupervised transformations).
        sample_weight :  array-like of shape (n_samples,) or (n_samples, n_outputs), \
                default=None
            sample weight values.
        **fit_params : dict
            Additional fit parameters, passed as-is to ``fit``.

        Returns
        -------
        X_new : pd.DataFrame of shape (n_samples, n_features_new)
            `X` restricted to the selected columns.
        """
        fitted = self.fit(X=X, y=y, sample_weight=sample_weight, **fit_params)
        return fitted.transform(X)

    def _more_tags(self):
        """Estimator tags: missing values in the input are allowed."""
        return {"allow_nan": True}