Source code for arfs.feature_selection.unsupervised

"""Unsupervised Feature Selection

This module provides selectors using unsupervised statistics and a threshold

Module Structure:
-----------------
- ``MissingValueThreshold``: child class of the ``BaseThresholdSelector``, filters out columns with too many missing values
- ``UniqueValuesThreshold``: child class of the ``BaseThresholdSelector``, filters out columns with zero variance (too few unique values)
- ``CardinalityThreshold``: child class of the ``BaseThresholdSelector``, filters out categorical columns with too many levels
- ``CollinearityThreshold``: selector built on scikit-learn's ``SelectorMixin`` and ``BaseEstimator``, filters out collinear columns
"""

from __future__ import print_function
from tqdm.auto import trange

# pandas
import pandas as pd

# numpy
import numpy as np

# sklearn
from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin

# ARFS
from .base import BaseThresholdSelector
from ..utils import create_dtype_dict
from ..association import (
    association_matrix,
    xy_to_matrix,
    plot_association_matrix,
    weighted_theils_u,
    weighted_corr,
    correlation_ratio,
)
from ..preprocessing import OrdinalEncoderPandas


# fix random seed for reproducibility
# NOTE: this runs at import time and seeds NumPy's global random state,
# so importing this module has a process-wide side effect on ``np.random``.
np.random.seed(7)


def _missing_ratio(df):
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df should be a pandas DataFrame")
    numeric_columns = df.select_dtypes(np.number).columns
    n_samples = len(df)

    missing_counts = {}
    for column in df.columns:
        if column in numeric_columns:
            missing_counts[column] = (
                df[column].isnull().sum() + np.isinf(df[column]).sum()
            ) / n_samples
        else:
            missing_counts[column] = df[column].isnull().sum() / n_samples
    return pd.Series(missing_counts)


class MissingValueThreshold(BaseThresholdSelector):
    """Feature selector that removes all high missing percentage features.

    This feature selection algorithm looks only at the features (X),
    not the desired outputs (y), and can thus be used for unsupervised learning.

    The statistic is computed by ``_missing_ratio``: the per-column fraction of
    null values, where infinite values in numeric columns also count as missing.

    Parameters
    ----------
    threshold : float, default = .05
        Features with a training-set missing ratio larger than this threshold
        will be removed. The comparison itself is performed by
        ``BaseThresholdSelector`` (called with ``greater_than_threshold=False``).

    Attributes
    ----------
    n_features_in_ : int
        number of input predictors
    support_ : list of bool
        the list of the selected X-columns
    selected_features_ : list of str
        the list of names of selected features
    not_selected_features_ : list of str
        the list of names of rejected features

    Example
    -------
    >>> import pandas as pd
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False)
    >>> X = pd.DataFrame(X)
    >>> y = pd.Series(y)
    >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])]
    >>> X.columns = pred_name
    >>> selector = MissingValueThreshold(0.05)
    >>> selector.fit_transform(X)
    """

    def __init__(self, threshold=0.05):
        super().__init__(
            threshold=threshold,
            statistic_fn=_missing_ratio,
            greater_than_threshold=False,
        )


def _pandas_count_unique_values(X):
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X should be a pandas DataFrame")
    return X.nunique()


class UniqueValuesThreshold(BaseThresholdSelector):
    """Feature selector that removes all features with zero variance (single unique values)
    or remove columns with less unique values than threshold.

    This feature selection algorithm looks only at the features (X),
    not the desired outputs (y), and can thus be used for unsupervised learning.

    The statistic is the number of unique values per column, computed by
    ``_pandas_count_unique_values`` (``DataFrame.nunique``, which ignores
    missing values by default).

    Parameters
    ----------
    threshold : int, default = 1
        Cut-off on the number of unique values of a feature.
        The threshold should be >= 1.
        The comparison itself is performed by ``BaseThresholdSelector``
        (called with ``greater_than_threshold=True``); presumably only the
        features with more unique values than the threshold are kept --
        confirm against the base class.

    Attributes
    ----------
    n_features_in_ : int
        number of input predictors
    support_ : list of bool
        the list of the selected X-columns
    selected_features_ : list of str
        the list of names of selected features
    not_selected_features_ : list of str
        the list of names of rejected features

    Example
    -------
    >>> import pandas as pd
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False)
    >>> X = pd.DataFrame(X)
    >>> y = pd.Series(y)
    >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])]
    >>> X.columns = pred_name
    >>> selector = UniqueValuesThreshold(1)
    >>> selector.fit_transform(X)
    """

    def __init__(self, threshold=1):
        super().__init__(
            threshold=threshold,
            statistic_fn=_pandas_count_unique_values,
            greater_than_threshold=True,
        )


def _pandas_count_unique_values_cat_features(X):
    """
    Counts the number of unique values in categorical features of a pandas DataFrame.

    Columns that are not listed under the ``"cat"`` key of the dictionary
    returned by ``create_dtype_dict(X, dic_keys="dtypes")`` keep a count of 0.

    Parameters
    ----------
    X : pandas DataFrame
        The input data.

    Returns
    -------
    pandas Series
        Indexed by all the columns of ``X``: the number of unique values for
        each categorical feature and 0 for every other column.

    Raises
    ------
    TypeError
        If the input data is not a pandas DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("X should be a pandas DataFrame")
    # start from 0 everywhere so that non-categorical columns are reported too
    count_series = pd.Series(data=0, index=X.columns)
    dtype_dic = create_dtype_dict(X, dic_keys="dtypes")
    for c in dtype_dic["cat"]:
        count_series[c] = X[c].nunique()
    return count_series


class CardinalityThreshold(BaseThresholdSelector):
    """Feature selector that removes all categorical features with more unique values than threshold.

    This feature selection algorithm looks only at the features (X),
    not the desired outputs (y), and can thus be used for unsupervised learning.

    The statistic is computed by ``_pandas_count_unique_values_cat_features``:
    the number of unique values for the categorical columns and 0 for all
    the other columns.

    Parameters
    ----------
    threshold : int, default = 1000
        Categorical features with more unique values (levels) than this
        threshold will be removed. The threshold should be >= 1.
        The comparison itself is performed by ``BaseThresholdSelector``
        (called with ``greater_than_threshold=False``).

    Attributes
    ----------
    n_features_in_ : int
        number of input predictors
    support_ : list of bool
        the list of the selected X-columns
    selected_features_ : list of str
        the list of names of selected features
    not_selected_features_ : list of str
        the list of names of rejected features

    Example
    -------
    >>> import pandas as pd
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False)
    >>> X = pd.DataFrame(X)
    >>> y = pd.Series(y)
    >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])]
    >>> X.columns = pred_name
    >>> selector = CardinalityThreshold(100)
    >>> selector.fit_transform(X)
    """

    def __init__(self, threshold=1000):
        super().__init__(
            threshold=threshold,
            statistic_fn=_pandas_count_unique_values_cat_features,
            greater_than_threshold=False,
        )


class CollinearityThreshold(SelectorMixin, BaseEstimator):
    """Feature selector that removes collinear features.

    This feature selection algorithm looks only at the features (X),
    not the desired outputs (y), and can thus be used for unsupervised learning.
    It computes the association between features (continuous or categorical)
    and iteratively removes the most collinear feature as long as at least one
    absolute association value is strictly above the threshold.

    The association measures are configurable through ``nom_nom_assoc``,
    ``num_num_assoc`` and ``nom_num_assoc``. The association matrix is not
    necessarily symmetrical.

    By changing the method to "correlation", the data are first encoded with
    ``OrdinalEncoderPandas`` before the association matrix is computed.
    Faster but not a best practice because the categorical variables are
    considered as numeric.

    Parameters
    ----------
    threshold : float, default = .8
        Features are removed until no absolute association value is larger
        than this threshold. Must lie in [0, 1], otherwise a ``ValueError``
        is raised by the constructor.
    method : str, default = "association"
        method for computing the association matrix. Either "association" or "correlation".
        Correlation leads to encoding of categorical variables as numeric
    n_jobs : int, default = 1
        the number of threads, -1 uses all the threads for computing the association matrix
    nom_nom_assoc : str or callable, default = ``weighted_theils_u``
        the categorical-categorical association measure, by default Theil's U, not symmetrical!
    num_num_assoc : str or callable, default = ``weighted_corr``
        the numeric-numeric association measure
    nom_num_assoc : str or callable, default = ``correlation_ratio``
        the numeric-categorical association measure

    Attributes
    ----------
    n_features_in_ : int
        number of input predictors
    feature_names_in_ : np.ndarray
        the column names of the dataframe seen during ``fit``
    suffix_dic : dict
        the output of ``create_dtype_dict`` for the fitted dataframe,
        forwarded to the plotting function
    assoc_matrix_ : pd.DataFrame
        the square association matrix
    support_ : np.ndarray of bool
        the mask of the selected X-columns
    selected_features_ : np.ndarray of str
        the names of selected features
    not_selected_features_ : np.ndarray of str
        the names of rejected features

    Raises
    ------
    ValueError
        if ``method`` is not "association" or "correlation", or if
        ``threshold`` is outside [0, 1]

    Example
    -------
    >>> import pandas as pd
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(n_samples = 1000, n_features = 50, n_informative = 5, shuffle=False)
    >>> X = pd.DataFrame(X)
    >>> y = pd.Series(y)
    >>> pred_name = [f"pred_{i}" for i in range(X.shape[1])]
    >>> X.columns = pred_name
    >>> selector = CollinearityThreshold(threshold=0.75)
    >>> selector.fit_transform(X)
    """

    def __init__(
        self,
        threshold=0.80,
        method="association",
        n_jobs=1,
        nom_nom_assoc=weighted_theils_u,
        num_num_assoc=weighted_corr,
        nom_num_assoc=correlation_ratio,
    ):
        self.threshold = threshold
        self.method = method
        self.n_jobs = n_jobs
        self.nom_nom_assoc = nom_nom_assoc
        self.num_num_assoc = num_num_assoc
        self.nom_num_assoc = nom_num_assoc

        if self.method not in ["association", "correlation"]:
            raise ValueError("``method`` should be 'association' or 'correlation'")

        if (self.threshold > 1.0) or (self.threshold < 0.0):
            raise ValueError("``threshold`` should be larger than 0 and smaller than 1")

    def fit(self, X, y=None, sample_weight=None):
        """Learn empirical associations from X.

        Parameters
        ----------
        X : pd.DataFrame, shape (n_samples, n_features)
            Data from which to compute the associations, where `n_samples` is
            the number of samples and `n_features` is the number of features.
        y : any, default=None
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.
        sample_weight : pd.Series, optional, shape (n_samples,)
            weights for computing the statistics (e.g. weighted average)

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        TypeError
            if ``X`` is not a pandas DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe")

        self.feature_names_in_ = X.columns.to_numpy()
        self.n_features_in_ = X.shape[1]
        self.suffix_dic = create_dtype_dict(X)

        if self.method == "correlation":
            X = OrdinalEncoderPandas().fit_transform(X)

        assoc_matrix = association_matrix(
            X=X,
            sample_weight=sample_weight,
            n_jobs=self.n_jobs,
            nom_nom_assoc=self.nom_nom_assoc,
            num_num_assoc=self.num_num_assoc,
            nom_num_assoc=self.nom_num_assoc,
        )
        self.assoc_matrix_ = xy_to_matrix(assoc_matrix)

        to_drop = set(
            _recursive_collinear_elimination(self.assoc_matrix_, self.threshold)
        )

        # Build the mask over the names seen at fit time so that it always
        # lines up with ``feature_names_in_``, even if the optional encoding
        # step returned the columns in another order. ``dtype=bool`` keeps the
        # mask usable for indexing when there is no column at all.
        self.support_ = np.asarray(
            [name not in to_drop for name in self.feature_names_in_], dtype=bool
        )
        self.selected_features_ = self.feature_names_in_[self.support_]
        self.not_selected_features_ = self.feature_names_in_[~self.support_]

        return self

    def _get_support_mask(self):
        """Return the boolean mask of the selected features (fitted estimator only)."""
        check_is_fitted(self)

        return self.support_

    def transform(self, X):
        """Keep only the selected columns of X.

        Parameters
        ----------
        X : pd.DataFrame
            the data to reduce, must contain the selected columns

        Returns
        -------
        pd.DataFrame
            ``X`` restricted to ``selected_features_``

        Raises
        ------
        TypeError
            if ``X`` is not a pandas DataFrame
        sklearn.exceptions.NotFittedError
            if the selector has not been fitted yet
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X is not a dataframe")
        check_is_fitted(self, "selected_features_")
        return X[self.selected_features_]

    def _more_tags(self):
        return {"allow_nan": True}

    def plot_association(
        self, ax=None, cmap="PuOr", figsize=None, cbar_kw=None, imgshow_kw=None
    ):
        """plot_association plots the association matrix

        Parameters
        ----------
        ax : matplotlib.axes.Axes, optional
            the mpl axes if the figure object exists already, by default None
        cmap : str, optional
            colormap name, by default "PuOr"
        figsize : tuple of float, optional
            figure size, by default None. If None, a square figure whose side
            is a third of the number of rows of the association matrix is used
        cbar_kw : dict, optional
            colorbar kwargs, by default None
        imgshow_kw : dict, optional
            imgshow kwargs, by default None

        Returns
        -------
        f
            the first element returned by ``plot_association_matrix``

        Raises
        ------
        sklearn.exceptions.NotFittedError
            if the selector has not been fitted yet
        """
        check_is_fitted(self, "assoc_matrix_")

        if figsize is None:
            side = self.assoc_matrix_.shape[0] / 3
            figsize = (side, side)

        f, ax = plot_association_matrix(
            assoc_mat=self.assoc_matrix_,
            suffix_dic=self.suffix_dic,
            ax=ax,
            cmap=cmap,
            cbarlabel="association value",
            figsize=figsize,
            show=True,
            cbar_kw=cbar_kw,
            imgshow_kw=imgshow_kw,
        )

        return f


def _most_collinear(association_matrix_abs, threshold):
    cols_to_drop = association_matrix_abs.loc[
        :, (association_matrix_abs > threshold).any(axis=0)
    ].columns.values
    rows_to_drop = association_matrix_abs.loc[
        (association_matrix_abs > threshold).any(axis=1), :
    ].index.values
    to_drop = list(set(cols_to_drop).union(set(rows_to_drop)))
    if not to_drop:
        return None, None
    # for features in `to_drop` sum up their column and row values to find
    # the most collinear feature
    most_collinear_series = (
        association_matrix_abs.loc[:, to_drop]
        .sum(axis=0)
    )
    most_collinear_series += (
        association_matrix_abs.loc[to_drop, :]
        .sum(axis=1)
    )
    # not necessarily but avoids exceeding 1
    most_collinear_series /= 2
    return most_collinear_series.sort_values(ascending=False).index[0], to_drop


def _recursive_collinear_elimination(association_matrix, threshold):
    """Iteratively eliminate the most collinear feature.

    Works on the absolute values of the association matrix. At each step the
    feature returned by ``_most_collinear`` is recorded and removed from both
    the rows and the columns, until ``_most_collinear`` flags nothing anymore.

    Parameters
    ----------
    association_matrix : pd.DataFrame
        the association matrix, rows and columns labelled by feature names
    threshold : float
        association value above which features are considered collinear

    Returns
    -------
    list
        the eliminated features, in elimination order
    """
    remaining = association_matrix.abs()
    eliminated = []

    candidate, flagged = _most_collinear(remaining, threshold)
    while flagged:
        # a removed feature cannot be proposed again since it is no longer
        # part of ``remaining``; the guard is kept as a safety net
        if candidate not in eliminated:
            eliminated.append(candidate)
            remaining = remaining.drop(index=candidate, columns=candidate)
        candidate, flagged = _most_collinear(remaining, threshold)

    return eliminated