# Source code for rectools.metrics.auc

#  Copyright 2024 MTS (Mobile Telesystems)
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""ROC AUC based ranking recommendations metrics."""
import typing as tp
from enum import Enum

import attr
import pandas as pd
from attrs import define, field

from rectools import Columns
from rectools.metrics.base import outer_merge_reco
from rectools.metrics.debias import DebiasableMetrikAtK, calc_debiased_fit_task, debias_interactions


class InsufficientHandling(str, Enum):
    """
    Strategy for handling users with insufficient recommendations in ROC AUC based metrics.

    Members mix in `str`, so every member compares equal to its plain string value
    (e.g. ``InsufficientHandling.IGNORE == "ignore"``).
    """

    # Do not check recommendation lists for sufficiency at all
    IGNORE = "ignore"
    # Drop users with insufficient recommendation lists from the metric computation
    EXCLUDE = "exclude"
    # Raise an error if any user has an insufficient recommendation list
    RAISE = "raise"


@attr.s
class AUCFitted:
    """
    Container with meta data got from `_AUCMetric.fit` method.

    Parameters
    ----------
    outer_merged_enriched : pd.DataFrame
        Recommendations outer merged with test interactions. Table has Columns.User, Columns.Item,
        Columns.Rank. Precomputed columns include "__test_positive", "__tp", "__fp", "__fp_cumsum",
        "__test_pos_cumsum". All ranks for all users are present with no skipping.
        Null ranks are specified for test interactions that were not predicted in recommendations.
    n_pos : pd.Series
        Number of positive items for each user in test interactions.
    n_fp_insufficient : pd.Series
        Number of false positive items for each user in `outer_merged_enriched` that had at least
        one false negative and fewer than `k_max` false positives. These users will be checked
        during insufficient cases processing. Empty if insufficient handling was not requested
        in `_AUCMetric.fit`.

    """

    outer_merged_enriched: pd.DataFrame = attr.ib()
    n_pos: pd.Series = attr.ib()
    n_fp_insufficient: pd.Series = attr.ib()


@define
class _AUCMetric(DebiasableMetrikAtK):
    """
    ROC AUC based metric base class.

    Warning: This class should not be used directly.
    Use derived classes instead.

    Parameters
    ----------
    k : int
        Number of items at the top of recommendations list that will be used to calculate metric.
    insufficient_handling : {"ignore", "raise", "exclude"}, default `"ignore"`
        Method of handling users with insufficient recommendations for metric calculation.
        ROC AUC based metrics with `k` parameter often need more than `k` recommendations
        for each user. This happens because these metrics calculate ROC AUC
        score for specific number of user false positives and ranked test positives that is derived
        from provided `k` parameter but is not equal to it.
        The following methods are available:
        - `ignore` - don't check for insufficient recommendations lists, handle all of insufficient
        cases as if algorithms are not able to retrieve users unpredicted test positives on any k
        level. This will understate the metric value;
        - `exclude` - exclude all users with insufficient recommendations lists from metrics
        computation;
        - `raise` - raise error if there are any users with insufficient recommendations lists. Use
        this option very carefully because some of the algorithms are unable to provide full required
        lists because of their inference logic. So you can get errors even if you requested enough
        recommendations in `recommend` method. For example, ItemKNN generates recommendations only
        until the model has non-zero scores for the item in item-item similarity matrix. So with
        small `K` for neighbours in ItemKNN and big `K` for `recommend` and AUC based metric you
        will still get an error when `insufficient_handling` is set to `raise`.
    debias_config : DebiasConfig, optional, default None
        Config with debias method parameters (iqr_coef, random_state).
    """

    insufficient_handling: str = field(default="ignore")

    @insufficient_handling.validator
    def _check_insufficient_handling(self, attribute: str, value: str) -> None:
        """Validate that `insufficient_handling` is one of the `InsufficientHandling` values."""
        possible_values = {item.value for item in InsufficientHandling.__members__.values()}
        if value not in possible_values:
            raise ValueError(f"`insufficient_handling` must be one of the {possible_values}. Got {value}.")

    @classmethod
    def fit(
        cls, reco: pd.DataFrame, interactions: pd.DataFrame, k_max: int, insufficient_handling_needed: bool
    ) -> AUCFitted:
        """
        Prepare intermediate data for effective calculation.

        You can use this method to prepare some intermediate data
        for later calculation. It can optimize calculations if
        you want calculate different AUC based metrics with different `k` parameter.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        interactions : pd.DataFrame
            Interactions table with columns `Columns.User`, `Columns.Item`.
        k_max : int
            Maximum `k` of the metrics that will be calculated from the fitted data. Users with
            fewer than `k_max` false positives are tracked as potentially insufficient.
        insufficient_handling_needed : bool
            Whether to compute `n_fp_insufficient`. If ``False``, an empty series is stored instead.

        Returns
        -------
        AUCFitted
            Container with enriched outer merged table and per-user statistics.
        """
        cls._check(reco, interactions=interactions)

        outer_merged = outer_merge_reco(reco, interactions)
        recommended_mask = ~outer_merged[Columns.Rank].isna()
        outer_merged["__tp"] = recommended_mask & outer_merged["__test_positive"]
        outer_merged["__fp"] = recommended_mask & ~outer_merged["__test_positive"]

        grouped = outer_merged.groupby(Columns.User, sort=False)

        # perform cumsum and sum aggs separately otherwise row order can be affected
        cumsum_stats = grouped.agg(__fp_cumsum=("__fp", "cumsum"), __test_pos_cumsum=("__test_positive", "cumsum"))
        stats = grouped.agg(n_pos=("__test_positive", "sum"), n_fp=("__fp", "sum"))

        n_pos = stats["n_pos"].dropna().rename_axis(Columns.User)
        outer_merged = pd.concat([outer_merged, cumsum_stats[["__fp_cumsum", "__test_pos_cumsum"]]], axis=1)

        if insufficient_handling_needed:
            # Every user with at least k_max FPs has sufficient recommendations for partial AUC based metrics
            # We calculate and keep number of false positives for all other users
            users_n_fp = stats["n_fp"].dropna().rename_axis(Columns.User)
            n_fp_insufficient = users_n_fp[users_n_fp < k_max]
            # Only users with at least one unpredicted test positive (false negative) can be insufficient
            users_with_fn = outer_merged.loc[~recommended_mask, Columns.User].unique()
            n_fp_insufficient = n_fp_insufficient[n_fp_insufficient.index.isin(users_with_fn)]
        else:
            # Explicit dtype keeps the placeholder consistent with the integer counts of the other branch
            # and avoids the pandas warning about the default dtype of an empty Series
            n_fp_insufficient = pd.Series([], dtype="int64")

        return AUCFitted(outer_merged, n_pos, n_fp_insufficient)

    def _get_sufficient_reco_explanation(self) -> str:
        """Return metric specific hint on how many recommendations are sufficient. Defined in subclasses."""
        raise NotImplementedError()

    def _handle_insufficient_cases(
        self, outer_merged: pd.DataFrame, n_pos: pd.Series, n_fp_insufficient: pd.Series
    ) -> tp.Tuple[pd.DataFrame, pd.Series]:
        """
        Apply `insufficient_handling` strategy to prepared data.

        Parameters
        ----------
        outer_merged : pd.DataFrame
            Enriched (and possibly cropped) outer merged table with `Columns.User` column.
        n_pos : pd.Series
            Number of positives per user (index - user id).
        n_fp_insufficient : pd.Series
            Number of false positives for potentially insufficient users (index - user id).

        Returns
        -------
        tuple(pd.DataFrame, pd.Series)
            `outer_merged` and `n_pos`, either unchanged or with insufficient users removed.

        Raises
        ------
        ValueError
            If strategy is `raise` and there is at least one user with fewer than `k` false positives
            in `n_fp_insufficient`.
        """
        if self.insufficient_handling == InsufficientHandling.IGNORE:
            return outer_merged, n_pos

        insufficient_users = n_fp_insufficient[n_fp_insufficient < self.k].index.values
        if len(insufficient_users) == 0:
            return outer_merged, n_pos

        if self.insufficient_handling == InsufficientHandling.EXCLUDE:
            outer_merged_suf = outer_merged[~outer_merged[Columns.User].isin(insufficient_users)]
            n_pos_suf = n_pos[~n_pos.index.isin(insufficient_users)]
            return outer_merged_suf, n_pos_suf

        # `.value` is used explicitly: formatting a `(str, Enum)` member in an f-string
        # gives "InsufficientHandling.IGNORE" instead of "ignore" on newer Python versions
        raise ValueError(
            f"""
            {self.__class__.__name__}@{self.k} metric requires at least {self.k} negatives in
            recommendations for each user. Or all items from user test interactions ranked in
            recommendations - meaning that all other recommended items will be negatives.
            There are {len(insufficient_users)} users with fewer than the required negatives.
            For correct {self.__class__.__name__} computation please provide each user with sufficient number
            of recommended items. {self._get_sufficient_reco_explanation()}
            You can disable this error by specifying `insufficient_handling`="{InsufficientHandling.IGNORE.value}" or
            by excluding all users with insufficient recommendations from metric computation
            with specifying `insufficient_handling` = "{InsufficientHandling.EXCLUDE.value}".
            """
        )

    def _calc_roc_auc(self, cropped_outer_merged: pd.DataFrame, n_pos: pd.Series) -> pd.Series:
        """
        Calculate ROC AUC given that all data has already been prepared, merged, enriched and cropped following
        metric specific logic.

        Parameters
        ----------
        cropped_outer_merged : pd.DataFrame
            Table with `Columns.User`, "__fp_cumsum" and "__tp" columns.
        n_pos : pd.Series
            Number of positives per user used in the denominator (index - user id).

        Returns
        -------
        pd.Series
            ROC AUC value per user. Users with no computable value get 0.
        """
        # Each true positive row contributes the number of top-k negatives ranked below it
        auc_numerator_gain = (self.k - cropped_outer_merged["__fp_cumsum"]) * cropped_outer_merged["__tp"]
        auc_numerator_gain.name = "__auc_numerator_gain"
        user_auc_numerator = pd.concat([cropped_outer_merged[Columns.User], auc_numerator_gain], axis=1)
        auc_numerator = user_auc_numerator.groupby(Columns.User)["__auc_numerator_gain"].sum()
        auc_denominator = n_pos * self.k
        # Outer join keeps users that have positives but no rows left after cropping; they get 0 below
        auc = auc_numerator.rename("numerator").to_frame().join(auc_denominator.rename("denominator"), how="outer")
        return (auc["numerator"] / auc["denominator"]).fillna(0)

    def calc(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> float:
        """
        Calculate metric value.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        interactions : pd.DataFrame
            Interactions table with columns `Columns.User`, `Columns.Item`.

        Returns
        -------
        float
            Value of metric (average between users).
        """
        per_user = self.calc_per_user(reco, interactions)
        return per_user.mean()

    def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.Series:
        """
        Calculate metric values for all users.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        interactions : pd.DataFrame
            Interactions table with columns `Columns.User`, `Columns.Item`.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        is_debiased = False
        if self.debias_config is not None:
            interactions = debias_interactions(interactions, self.debias_config)
            is_debiased = True

        self._check(reco, interactions=interactions)
        insufficient_handling_needed = self.insufficient_handling != InsufficientHandling.IGNORE
        fitted = self.fit(reco, interactions, self.k, insufficient_handling_needed)
        return self.calc_per_user_from_fitted(fitted, is_debiased)

    def calc_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> float:
        """
        Calculate metric value from fitted data.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        float
            Value of metric (average between users).
        """
        per_user = self.calc_per_user_from_fitted(fitted, is_debiased)
        return per_user.mean()

    def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series:
        """
        Calculate metric values for all users from fitted data.

        Must be implemented in derived classes.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        raise NotImplementedError()


class PartialAUC(_AUCMetric):
    r"""
    Partial AUC at k (pAUC@k).
    pAUC@k measures ROC AUC score for ranking of the top-k irrelevant items and all relevant items
    for each user. IMPORTANT: this metric requires more than `k` recommended items for each user.
    It will be enough to have :math:`n^+` (number of user positives) + `k` recommended items for
    each user. Read more in `insufficient_handling` parameter description.

    Metric is averaged between users. For one user the formula is:

    .. math::
        pAUC@k = \frac{1}{kn_+}\sum_{{x_i}\in S^+}\sum_{{x_j}\in S^-}\mathbb{1}[s(x_i)\geq s(x_j)]

    where
        - :math:`k` is the number of user top scored negatives for metric computation
        - :math:`s` is a scoring function which provides scores to rank items for user
        - :math:`\mathbb{1}` is the indicator function
        - :math:`n_+` is the number of all user test positives
        - :math:`S^+` is the set of all positives for user
        - :math:`S^-` is the set of top :math:`k` negatives for user acquired by :math:`s`
        - :math:`x_i` and :math:`x_j` are user positives and negatives for metric computation

    Analysed in "Rich-Item Recommendations for Rich-Users: Exploiting Dynamic and Static Side
    Information": https://arxiv.org/abs/2001.10495, analysed in "Optimization and Analysis of the
    pAp@k Metric for Recommender Systems": https://proceedings.mlr.press/v119/hiranandani20a.html

    Parameters
    ----------
    k : int
        Number of top irrelevant items for user to be taken for ROC AUC computation. This does not
        equal `k` for classic `@k` metrics.
    insufficient_handling : {"ignore", "raise", "exclude"}, default `"ignore"`
        Method of handling users with insufficient recommendation lists for metric calculation.
        pAUC@k needs more than `k` recommendations for each user. This happens because this metric
        calculates ROC AUC score for specific number of user false positives and ranked test
        positives that is derived from provided `k` parameter but is not equal to it.
        It will be enough to have :math:`n^+` (number of user positives) + `k` recommended items for
        each user.
        The following methods are available:
        - `ignore` - don't check for insufficient recommendations lists, handle all of insufficient
        cases as if algorithms are not able to retrieve users unpredicted test positives on any k
        level. This will understate the metric value if recommendation lists are not sufficient;
        - `exclude` - exclude all users with insufficient recommendations lists from metrics
        computation;
        - `raise` - raise error if there are any users with insufficient recommendations lists. Use
        this option very carefully because some of the algorithms are unable to provide full required
        lists because of their inference logic. So you can get errors even if you requested enough
        recommendations in `recommend` method. For example, ItemKNN generates recommendations only
        until the model has non-zero scores for the item in item-item similarity matrix. So with
        small `K` for neighbours in ItemKNN and big `K` for `recommend` and AUC based metric you
        will still get an error when `insufficient_handling` is set to `raise`.
    debias_config : DebiasConfig, optional, default None
        Config with debias method parameters (iqr_coef, random_state).

    Examples
    --------
    >>> reco = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 3, 1, 2, 3, 2],
    ...         Columns.Rank: [1, 2, 1, 2, 3, 1, 2],
    ...     }
    ... )
    >>> interactions = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 1, 3, 1, 2],
    ...     }
    ... )
    >>> PartialAUC(k=1).calc_per_user(reco, interactions).values
    array([1., 1., 0.])
    >>> PartialAUC(k=3).calc_per_user(reco, interactions).values
    array([1.        , 1.        , 0.33333333])
    >>> PartialAUC(k=3, insufficient_handling="exclude").calc_per_user(reco, interactions).values
    array([1., 1.])
    """

    def _get_sufficient_reco_explanation(self) -> str:
        """Return pAUC specific hint on how many recommendations are sufficient (used in error message)."""
        return f"""
            It will be enough to have `n_user_positives` + `PAUC_k` ({self.k}) recommended items for
            each user. For simplification it will be enough to have max(`n_user_positives`) +
            `PAUC_k` ({self.k}) recommended items for all users if max(`n_user_positives`) is
            not too high.
            """

    def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series:
        """
        Calculate metric values for all users from fitted data.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        self._check_debias(is_debiased, obj_name="AUCFitted")
        outer_merged = fitted.outer_merged_enriched
        # Keep only recommended rows that appear before the k-th false positive:
        # this leaves all predicted test positives ranked above the top-k negatives boundary
        cropped = outer_merged[(outer_merged["__fp_cumsum"] < self.k) & (~outer_merged[Columns.Rank].isna())]
        cropped_suf, n_pos_suf = self._handle_insufficient_cases(
            outer_merged=cropped, n_pos=fitted.n_pos, n_fp_insufficient=fitted.n_fp_insufficient
        )
        return self._calc_roc_auc(cropped_suf, n_pos_suf)


class PAP(_AUCMetric):
    r"""
    Partial AUC + precision@k (pAp@k) joint classification and ranking metric.
    pAp@k measures AUC between the top-k irrelevant items and top-β relevant items, where β is the
    minimum of k and the number of relevant items. The metric behaves like prec@k when the number of
    relevant items is larger than k and like pAUC otherwise. IMPORTANT: this metric requires more
    than `k` recommended items for each user. It will be enough to have `k` * 2 recommended items
    for each user. Read more in `insufficient_handling` parameter description.

    Metric is averaged between users. For one user the formula is:

    .. math::
        pAp@k = \frac{1}{k\beta}\sum_{{x_i}\in S^+}\sum_{{x_j}\in S^-}\mathbb{1}[s(x_i)\geq s(x_j)]

    where
        - :math:`k` is the number of top scored negatives and border for top scored positives
        - :math:`s` is a scoring function which provides scores to rank items for user
        - :math:`\mathbb{1}` is the indicator function
        - :math:`\beta` is the minimum between `k` and number of user test positives
        - :math:`S^+` is the set of top :math:`\beta` positives for user acquired by :math:`s`
        - :math:`S^-` is the set of top :math:`k` negatives for user acquired by :math:`s`
        - :math:`x_i` and :math:`x_j` are user positives and negatives for metric computation

    Introduced in "Rich-Item Recommendations for Rich-Users: Exploiting Dynamic and Static Side
    Information": https://arxiv.org/abs/2001.10495, analysed in "Optimization and Analysis of the
    pAp@k Metric for Recommender Systems": https://proceedings.mlr.press/v119/hiranandani20a.html

    Parameters
    ----------
    k : int
        Number of top irrelevant items for user to be taken for ROC AUC computation. This does not
        equal `k` for classic `@k` metrics.
    insufficient_handling : {"ignore", "raise", "exclude"}, default `"ignore"`
        Method of handling users with insufficient recommendation lists for metric calculation.
        pAp@k needs more than `k` recommendations for each user. This happens because this metric
        calculates ROC AUC score for specific number of user false positives and ranked test
        positives that is derived from provided `k` parameter but is not equal to it.
        It will be enough to have `k` * 2 recommended items for each user.
        The following methods are available:
        - `ignore` - don't check for insufficient recommendations lists, handle all of insufficient
        cases as if algorithms are not able to retrieve users unpredicted test positives on any k
        level. This will understate the metric value if recommendation lists are not sufficient;
        - `exclude` - exclude all users with insufficient recommendations lists from metrics
        computation;
        - `raise` - raise error if there are any users with insufficient recommendations lists. Use
        this option very carefully because some of the algorithms are unable to provide full required
        lists because of their inference logic. So you can get errors even if you requested enough
        recommendations in `recommend` method. For example, ItemKNN generates recommendations only
        until the model has non-zero scores for the item in item-item similarity matrix. So with
        small `K` for neighbours in ItemKNN and big `K` for `recommend` and AUC based metric you
        will still get an error when `insufficient_handling` is set to `raise`.
    debias_config : DebiasConfig, optional, default None
        Config with debias method parameters (iqr_coef, random_state).

    Examples
    --------
    >>> reco = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 3, 1, 2, 3, 2],
    ...         Columns.Rank: [1, 2, 1, 2, 3, 1, 2],
    ...     }
    ... )
    >>> interactions = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 1, 3, 1, 2],
    ...     }
    ... )
    >>> PAP(k=1).calc_per_user(reco, interactions).values
    array([1., 1., 0.])
    >>> PAP(k=3).calc_per_user(reco, interactions).values
    array([1.        , 1.        , 0.33333333])
    >>> PAP(k=3, insufficient_handling="exclude").calc_per_user(reco, interactions).values
    array([1., 1.])
    """

    def _get_sufficient_reco_explanation(self) -> str:
        """Return pAp specific hint on how many recommendations are sufficient (used in error message)."""
        return f"""
            It will be enough to have min(`n_user_positives`, `PAP_k` ({self.k}))  + `PAP_k`
            ({self.k}) recommended items for each user.
            For simplification it will be enough to have `PAP_k` ({self.k}) * 2 recommended items
            for all users.
            """

    def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series:
        """
        Calculate metric values for all users from fitted data.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        self._check_debias(is_debiased, obj_name="AUCFitted")
        outer_merged = fitted.outer_merged_enriched
        # Keep k first false positives and k first predicted test positives for roc auc computation
        cropped = outer_merged[
            (outer_merged["__test_pos_cumsum"] <= self.k)
            & (outer_merged["__fp_cumsum"] < self.k)
            & (~outer_merged[Columns.Rank].isna())
        ]

        # Number of positives is capped by k: this is `beta` from the metric formula
        cropped_suf, n_pos_suf = self._handle_insufficient_cases(
            outer_merged=cropped,
            n_pos=fitted.n_pos.clip(upper=self.k),
            n_fp_insufficient=fitted.n_fp_insufficient,
        )
        return self._calc_roc_auc(cropped_suf, n_pos_suf)


# Type alias covering the ROC AUC based metric classes `PartialAUC` and `PAP`
AucMetric = tp.Union[PartialAUC, PAP]


def calc_auc_metrics(
    metrics: tp.Dict[str, AucMetric],
    reco: pd.DataFrame,
    interactions: pd.DataFrame,
) -> tp.Dict[str, float]:
    """
    Calculate any ROC AUC based ranking metric.

    Works with pre-prepared data.

    Warning: It is not recommended to use this function directly.
    Use `calc_metrics` instead.

    Parameters
    ----------
    metrics : dict(str -> AucMetric)
        Dict of metric objects to calculate,
        where key is metric name and value is metric object.
    reco : pd.DataFrame
        Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
    interactions : pd.DataFrame
        Interactions table with columns `Columns.User`, `Columns.Item`.

    Returns
    -------
    dict(str->float)
        Dictionary where keys are the same with keys in `metrics`
        and values are metric calculation results.
    """
    # Data for insufficient cases is prepared if at least one metric does not ignore them
    insufficient_handling_needed = any(
        metric.insufficient_handling != InsufficientHandling.IGNORE for metric in metrics.values()
    )

    # Fit once per debias config: the task maps each debias config (looked up below through
    # `metric.debias_config`) to a `(k_max, interactions)` pair
    debiased_fit_task = calc_debiased_fit_task(metrics.values(), interactions)
    fitted_debiased = {
        debias_config: _AUCMetric.fit(reco, interactions_d, k_max_d, insufficient_handling_needed)
        for debias_config, (k_max_d, interactions_d) in debiased_fit_task.items()
    }

    return {
        name: metric.calc_from_fitted(
            fitted=fitted_debiased[metric.debias_config],
            is_debiased=metric.debias_config is not None,
        )
        for name, metric in metrics.items()
    }