Source code for rectools.metrics.auc

#  Copyright 2024 MTS (Mobile Telesystems)
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""ROC AUC based ranking recommendations metrics."""
import typing as tp
from enum import Enum

import attr
import pandas as pd
from attrs import define, field

from rectools import Columns
from rectools.metrics.base import outer_merge_reco
from rectools.metrics.debias import DebiasableMetrikAtK, calc_debiased_fit_task, debias_interactions


class InsufficientHandling(str, Enum):
    """
    Strategy for handling users whose recommendation lists are insufficient
    for ROC AUC based metric calculation.
    """

    # Do not check recommendation lists for sufficiency at all
    IGNORE = "ignore"
    # Drop users with insufficient recommendation lists from metric computation
    EXCLUDE = "exclude"
    # Raise an error if any user has an insufficient recommendation list
    RAISE = "raise"
@attr.s(auto_attribs=True)
class AUCFitted:
    """
    Container with intermediate data produced by `_AUCMetric.fit` method.

    Parameters
    ----------
    outer_merged_enriched : pd.DataFrame
        Recommendations outer merged with test interactions.
        Table has `Columns.User`, `Columns.Item`, `Columns.Rank`.
        Precomputed columns include "__test_positive", "__tp", "__fp", "__fp_cumsum", "__test_pos_cumsum".
        All ranks for all users are present with no skipping. Null ranks are specified for test
        interactions that were not predicted in recommendations.
    n_pos : pd.Series
        Number of positive items for each user in test interactions.
    n_fp_insufficient : pd.Series
        Number of false positive items for each user in `outer_merged_enriched` that had at least
        one false negative. These users will be checked during insufficient cases processing.
    """

    outer_merged_enriched: pd.DataFrame
    n_pos: pd.Series
    n_fp_insufficient: pd.Series
@define
class _AUCMetric(DebiasableMetrikAtK):
    """
    ROC AUC based metric base class.

    Warning: This class should not be used directly.
    Use derived classes instead.

    Parameters
    ----------
    k : int
        Number of items at the top of recommendations list that will be used to calculate metric.
    insufficient_handling : {"ignore", "raise", "exclude"}, default `"ignore"`
        Method of handling users with insufficient recommendations for metric calculation.
        ROC AUC based metrics with `k` parameter often need more than `k` recommendations for each user.
        This happens because these metrics calculate ROC AUC score for specific number of user
        false positives and ranked test positives that is derived from provided `k` parameter
        but is not equal to it.
        The following methods are available:

        - `ignore` - don't check for insufficient recommendations lists, handle all of insufficient
          cases as if algorithms are not able to retrieve users unpredicted test positives on
          any k level. This will understate the metric value;
        - `exclude` - exclude all users with insufficient recommendations lists from metrics computation;
        - `raise` - raise error if there are any users with insufficient recommendations lists.
          Use this option very carefully because some of the algorithms are unable to provide
          full required lists because of their inference logic. So you can get errors even if you
          requested enough recommendations in `recommend` method.
          For example, ItemKNN generates recommendations only until the model has non-zero scores
          for the item in item-item similarity matrix. So with small `K` for neighbours in ItemKNN
          and big `K` for `recommend` and AUC based metric you will still get an error when
          `insufficient_handling` is set to `raise`.
    debias_config : DebiasConfig, optional, default None
        Config with debias method parameters (iqr_coef, random_state).
    """

    insufficient_handling: str = field(default="ignore")

    @insufficient_handling.validator
    def _check_insufficient_handling(self, attribute: str, value: str) -> None:
        """Reject `insufficient_handling` values that are not `InsufficientHandling` member values."""
        possible_values = {item.value for item in InsufficientHandling}
        if value not in possible_values:
            raise ValueError(f"`insufficient_handling` must be one of the {possible_values}. Got {value}.")

    @classmethod
    def fit(
        cls, reco: pd.DataFrame, interactions: pd.DataFrame, k_max: int, insufficient_handling_needed: bool
    ) -> AUCFitted:
        """
        Prepare intermediate data for effective calculation.

        You can use this method to prepare some intermediate data for later calculation.
        It can optimize calculations if you want to calculate different AUC based metrics
        with different `k` parameter.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        interactions : pd.DataFrame
            Interactions table with columns `Columns.User`, `Columns.Item`.
        k_max : int
            Maximum `k` among the metrics that will be calculated from the result. Only users with
            fewer than `k_max` false positives are kept as candidates for insufficient cases.
        insufficient_handling_needed : bool
            Whether to collect data required for insufficient cases processing.
            If ``False``, `n_fp_insufficient` in the result is empty, so metrics calculated from
            the result will not detect any insufficient users.

        Returns
        -------
        AUCFitted
            Enriched outer merged table, number of test positives per user and number of
            false positives for users that are candidates for insufficient cases.
        """
        cls._check(reco, interactions=interactions)

        outer_merged = outer_merge_reco(reco, interactions)
        # Rows with null rank are test interactions that are missing in recommendations
        recommended_mask = ~outer_merged[Columns.Rank].isna()
        outer_merged["__tp"] = recommended_mask & outer_merged["__test_positive"]
        outer_merged["__fp"] = recommended_mask & ~outer_merged["__test_positive"]

        grouped = outer_merged.groupby(Columns.User, sort=False)

        # perform cumsum and sum aggs separately otherwise row order can be affected
        cumsum_stats = grouped.agg(__fp_cumsum=("__fp", "cumsum"), __test_pos_cumsum=("__test_positive", "cumsum"))
        stats = grouped.agg(n_pos=("__test_positive", "sum"), n_fp=("__fp", "sum"))
        n_pos = stats["n_pos"].dropna().rename_axis(Columns.User)
        # Columns are attached by index alignment with the rows of `outer_merged`
        outer_merged = pd.concat([outer_merged, cumsum_stats[["__fp_cumsum", "__test_pos_cumsum"]]], axis=1)

        if insufficient_handling_needed:
            # Every user with FP count of at least k_max has sufficient recommendations for partial AUC
            # based metrics. We calculate and keep number of false positives for all other users
            users_n_fp = stats["n_fp"].dropna().rename_axis(Columns.User)
            n_fp_insufficient = users_n_fp[users_n_fp < k_max]
            # Only users with at least one false negative can suffer from insufficient recommendations
            users_with_fn = outer_merged.loc[~recommended_mask, Columns.User].unique()
            n_fp_insufficient = n_fp_insufficient[n_fp_insufficient.index.isin(users_with_fn)]
        else:
            # Explicit dtype: default dtype of an empty Series depends on pandas version
            # and pandas 1.x emits a warning when it is omitted
            n_fp_insufficient = pd.Series([], dtype="int64")

        return AUCFitted(outer_merged, n_pos, n_fp_insufficient)

    def _get_sufficient_reco_explanation(self) -> str:
        """Return metric specific hint on how many recommendations are enough. Used in error message."""
        raise NotImplementedError()

    def _handle_insufficient_cases(
        self, outer_merged: pd.DataFrame, n_pos: pd.Series, n_fp_insufficient: pd.Series
    ) -> tp.Tuple[pd.DataFrame, pd.Series]:
        """
        Apply `insufficient_handling` strategy to prepared data.

        Parameters
        ----------
        outer_merged : pd.DataFrame
            Prepared (cropped) outer merged table with `Columns.User` column.
        n_pos : pd.Series
            Number of test positives for each user (index - user id).
        n_fp_insufficient : pd.Series
            Number of false positives for users that are candidates for insufficient cases
            (index - user id).

        Returns
        -------
        tuple(pd.DataFrame, pd.Series)
            `outer_merged` and `n_pos`, unchanged or with insufficient users removed
            when strategy is `exclude`.

        Raises
        ------
        ValueError
            If strategy is `raise` and there is at least one user with fewer than `k` false positives
            in `n_fp_insufficient`.
        """
        if self.insufficient_handling == InsufficientHandling.IGNORE:
            return outer_merged, n_pos

        insufficient_users = n_fp_insufficient[n_fp_insufficient < self.k].index.values
        if len(insufficient_users) == 0:
            return outer_merged, n_pos

        if self.insufficient_handling == InsufficientHandling.EXCLUDE:
            outer_merged_suf = outer_merged[~outer_merged[Columns.User].isin(insufficient_users)]
            n_pos_suf = n_pos[~n_pos.index.isin(insufficient_users)]
            return outer_merged_suf, n_pos_suf

        # Use `.value` explicitly: formatting a `(str, Enum)` member directly in an f-string gives
        # "InsufficientHandling.IGNORE" instead of "ignore" on Python 3.11+
        metric_name = self.__class__.__name__
        ignore_value = InsufficientHandling.IGNORE.value
        exclude_value = InsufficientHandling.EXCLUDE.value
        raise ValueError(
            f"""
            {metric_name}@{self.k} metric requires at least {self.k} negatives in
            recommendations for each user. Or all items from user test interactions ranked in
            recommendations - meaning that all other recommended items will be negatives.
            There are {len(insufficient_users)} users with fewer than required negatives.
            For correct {metric_name} computation please provide each user with sufficient number
            of recommended items. {self._get_sufficient_reco_explanation()}
            You can disable this error by specifying `insufficient_handling`="{ignore_value}" or
            by excluding all users with insufficient recommendations from metric computation
            with specifying `insufficient_handling` = "{exclude_value}".
            """
        )

    def _calc_roc_auc(self, cropped_outer_merged: pd.DataFrame, n_pos: pd.Series) -> pd.Series:
        """
        Calculate ROC AUC given that all data has already been prepared, merged, enriched and cropped
        following metric specific logic.

        Parameters
        ----------
        cropped_outer_merged : pd.DataFrame
            Table with `Columns.User`, "__fp_cumsum" and "__tp" columns.
        n_pos : pd.Series
            Number of positives to use in denominator for each user (index - user id).

        Returns
        -------
        pd.Series
            ROC AUC value for each user (index - user id).
        """
        # Each true positive row contributes `k` minus the number of false positives ranked
        # above it; all other rows contribute zero
        numerator_gain = (self.k - cropped_outer_merged["__fp_cumsum"]) * cropped_outer_merged["__tp"]
        numerator_gain.name = "__auc_numenator_gain"
        user_gain = pd.concat([cropped_outer_merged[Columns.User], numerator_gain], axis=1)
        numerator = user_gain.groupby(Columns.User)["__auc_numenator_gain"].sum()
        denominator = n_pos * self.k
        # Outer join keeps users from both sides; users with missing numerator or
        # denominator get metric value 0
        auc = numerator.rename("numenator").to_frame().join(denominator.rename("denominator"), how="outer")
        return (auc["numenator"] / auc["denominator"]).fillna(0)

    def calc(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> float:
        """
        Calculate metric value.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        interactions : pd.DataFrame
            Interactions table with columns `Columns.User`, `Columns.Item`.

        Returns
        -------
        float
            Value of metric (average between users).
        """
        per_user = self.calc_per_user(reco, interactions)
        return per_user.mean()

    def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.Series:
        """
        Calculate metric values for all users.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        interactions : pd.DataFrame
            Interactions table with columns `Columns.User`, `Columns.Item`.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        is_debiased = False
        if self.debias_config is not None:
            interactions = debias_interactions(interactions, self.debias_config)
            is_debiased = True

        self._check(reco, interactions=interactions)
        insufficient_handling_needed = self.insufficient_handling != InsufficientHandling.IGNORE
        fitted = self.fit(reco, interactions, self.k, insufficient_handling_needed)
        return self.calc_per_user_from_fitted(fitted, is_debiased)

    def calc_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> float:
        """
        Calculate metric value from fitted data.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        float
            Value of metric (average between users).
        """
        per_user = self.calc_per_user_from_fitted(fitted, is_debiased)
        return per_user.mean()

    def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series:
        """
        Calculate metric values for all users from fitted data.

        Must be implemented in derived classes.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        raise NotImplementedError()
class PartialAUC(_AUCMetric):
    r"""
    Partial AUC at k (pAUC@k).

    pAUC@k measures ROC AUC score for ranking of the top-k irrelevant items and all relevant items
    for each user.

    IMPORTANT: this metric requires more than `k` recommended items for each user. It will be
    enough to have :math:`n_+` (number of user positives) + `k` recommended items for each user.
    Read more in `insufficient_handling` parameter description.

    Metric is averaged between users. For one user the formula is:

    .. math::

        pAUC@k = \frac{1}{kn_+}\sum_{{x_i}\in S^+}\sum_{{x_j}\in S^-}\mathbb{1}[s(x_i)\geq s(x_j)]

    where
        - :math:`k` is the number of user top scored negatives for metric computation
        - :math:`s` is a scoring function which provides scores to rank items for user
        - :math:`\mathbb{1}` is the indicator function
        - :math:`n_+` is the number of all user test positives
        - :math:`S^+` is the set of all positives for user
        - :math:`S^-` is the set of top :math:`k` negatives for user acquired by :math:`s`
        - :math:`x_i` and :math:`x_j` are user positives and negatives for metric computation

    Analysed in "Rich-Item Recommendations for Rich-Users: Exploiting Dynamic and Static Side
    Information": https://arxiv.org/abs/2001.10495,
    analysed in "Optimization and Analysis of the pAp@k Metric for Recommender Systems":
    https://proceedings.mlr.press/v119/hiranandani20a.html

    Parameters
    ----------
    k : int
        Number of top irrelevant items for user to be taken for ROC AUC computation.
        This does not equal `k` for classic `@k` metrics.
    insufficient_handling : {"ignore", "raise", "exclude"}, default `"ignore"`
        Method of handling users with insufficient recommendation lists for metric calculation.
        pAUC@k needs more than `k` recommendations for each user. This happens because this metric
        calculates ROC AUC score for specific number of user false positives and ranked test
        positives that is derived from provided `k` parameter but is not equal to it.
        It will be enough to have :math:`n_+` (number of user positives) + `k` recommended items
        for each user.
        The following methods are available:

        - `ignore` - don't check for insufficient recommendations lists, handle all of insufficient
          cases as if algorithms are not able to retrieve users unpredicted test positives on
          any k level. This will understate the metric value if recommendation lists are not sufficient;
        - `exclude` - exclude all users with insufficient recommendations lists from metrics computation;
        - `raise` - raise error if there are any users with insufficient recommendations lists.
          Use this option very carefully because some of the algorithms are unable to provide
          full required lists because of their inference logic. So you can get errors even if you
          requested enough recommendations in `recommend` method.
          For example, ItemKNN generates recommendations only until the model has non-zero scores
          for the item in item-item similarity matrix. So with small `K` for neighbours in ItemKNN
          and big `K` for `recommend` and AUC based metric you will still get an error when
          `insufficient_handling` is set to `raise`.
    debias_config : DebiasConfig, optional, default None
        Config with debias method parameters (iqr_coef, random_state).

    Examples
    --------
    >>> reco = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 3, 1, 2, 3, 2],
    ...         Columns.Rank: [1, 2, 1, 2, 3, 1, 2],
    ...     }
    ... )
    >>> interactions = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 1, 3, 1, 2],
    ...     }
    ... )
    >>> PartialAUC(k=1).calc_per_user(reco, interactions).values
    array([1., 1., 0.])
    >>> PartialAUC(k=3).calc_per_user(reco, interactions).values
    array([1.        , 1.        , 0.33333333])
    >>> PartialAUC(k=3, insufficient_handling="exclude").calc_per_user(reco, interactions).values
    array([1., 1.])
    """

    def _get_sufficient_reco_explanation(self) -> str:
        """Return hint on how many recommendations are enough for pAUC@k. Used in error message."""
        return f"""
        It will be enough to have `n_user_positives` + `PAUC_k` ({self.k}) recommended items for each user.
        For simplification it will be enough to have max(`n_user_positives`) + `PAUC_k` ({self.k})
        recommended items for all users if max(`n_user_positives`) is not too high.
        """

    def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series:
        """
        Calculate metric values for all users from fitted data.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        self._check_debias(is_debiased, obj_name="AUCFitted")
        outer_merged = fitted.outer_merged_enriched

        # Keep recommended rows (non-null rank) that have fewer than k false positives at or above them.
        # Predicted test positives ranked below the k-th false positive would add nothing to the numerator
        cropped = outer_merged[(outer_merged["__fp_cumsum"] < self.k) & (~outer_merged[Columns.Rank].isna())]

        cropped_suf, n_pos_suf = self._handle_insufficient_cases(
            outer_merged=cropped, n_pos=fitted.n_pos, n_fp_insufficient=fitted.n_fp_insufficient
        )
        return self._calc_roc_auc(cropped_suf, n_pos_suf)
class PAP(_AUCMetric):
    r"""
    Partial AUC + precision@k (pAp@k) joint classification and ranking metric.

    pAp@k measures AUC between the top-k irrelevant items and top-β relevant items, where β is
    the minimum of k and the number of relevant items. The metric behaves like prec@k when the
    number of relevant items is larger than k and like pAUC otherwise.

    IMPORTANT: this metric requires more than `k` recommended items for each user. It will be
    enough to have `k` * 2 recommended items for each user.
    Read more in `insufficient_handling` parameter description.

    Metric is averaged between users. For one user the formula is:

    .. math::

        pAp@k = \frac{1}{k\beta}\sum_{{x_i}\in S^+}\sum_{{x_j}\in S^-}\mathbb{1}[s(x_i)\geq s(x_j)]

    where
        - :math:`k` is the number of top scored negatives and border for top scored positives
        - :math:`s` is a scoring function which provides scores to rank items for user
        - :math:`\mathbb{1}` is the indicator function
        - :math:`\beta` is the minimum between `k` and number of user test positives
        - :math:`S^+` is the set of top :math:`\beta` positives for user acquired by :math:`s`
        - :math:`S^-` is the set of top :math:`k` negatives for user acquired by :math:`s`
        - :math:`x_i` and :math:`x_j` are user positives and negatives for metric computation

    Introduced in "Rich-Item Recommendations for Rich-Users: Exploiting Dynamic and Static Side
    Information": https://arxiv.org/abs/2001.10495,
    analysed in "Optimization and Analysis of the pAp@k Metric for Recommender Systems":
    https://proceedings.mlr.press/v119/hiranandani20a.html

    Parameters
    ----------
    k : int
        Number of top irrelevant items for user to be taken for ROC AUC computation.
        This does not equal `k` for classic `@k` metrics.
    insufficient_handling : {"ignore", "raise", "exclude"}, default `"ignore"`
        Method of handling users with insufficient recommendation lists for metric calculation.
        pAp@k needs more than `k` recommendations for each user. This happens because this metric
        calculates ROC AUC score for specific number of user false positives and ranked test
        positives that is derived from provided `k` parameter but is not equal to it.
        It will be enough to have `k` * 2 recommended items for each user.
        The following methods are available:

        - `ignore` - don't check for insufficient recommendations lists, handle all of insufficient
          cases as if algorithms are not able to retrieve users unpredicted test positives on
          any k level. This will understate the metric value if recommendation lists are not sufficient;
        - `exclude` - exclude all users with insufficient recommendations lists from metrics computation;
        - `raise` - raise error if there are any users with insufficient recommendations lists.
          Use this option very carefully because some of the algorithms are unable to provide
          full required lists because of their inference logic. So you can get errors even if you
          requested enough recommendations in `recommend` method.
          For example, ItemKNN generates recommendations only until the model has non-zero scores
          for the item in item-item similarity matrix. So with small `K` for neighbours in ItemKNN
          and big `K` for `recommend` and AUC based metric you will still get an error when
          `insufficient_handling` is set to `raise`.
    debias_config : DebiasConfig, optional, default None
        Config with debias method parameters (iqr_coef, random_state).

    Examples
    --------
    >>> reco = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 3, 1, 2, 3, 2],
    ...         Columns.Rank: [1, 2, 1, 2, 3, 1, 2],
    ...     }
    ... )
    >>> interactions = pd.DataFrame(
    ...     {
    ...         Columns.User: [1, 1, 2, 2, 3, 3],
    ...         Columns.Item: [1, 2, 1, 3, 1, 2],
    ...     }
    ... )
    >>> PAP(k=1).calc_per_user(reco, interactions).values
    array([1., 1., 0.])
    >>> PAP(k=3).calc_per_user(reco, interactions).values
    array([1.        , 1.        , 0.33333333])
    >>> PAP(k=3, insufficient_handling="exclude").calc_per_user(reco, interactions).values
    array([1., 1.])
    """

    def _get_sufficient_reco_explanation(self) -> str:
        """Return hint on how many recommendations are enough for pAp@k. Used in error message."""
        return f"""
        It will be enough to have min(`n_user_positives`, `PAP_k` ({self.k})) + `PAP_k` ({self.k})
        recommended items for each user. For simplification it will be enough to have
        `PAP_k` ({self.k}) * 2 recommended items for all users.
        """

    def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series:
        """
        Calculate metric values for all users from fitted data.

        Parameters
        ----------
        fitted : AUCFitted
            Meta data that got from `.fit` method.
        is_debiased : bool, default False
            An indicator of whether the debias transformation has been applied before or not.

        Returns
        -------
        pd.Series
            Values of metric (index - user id, values - metric value for every user).
        """
        self._check_debias(is_debiased, obj_name="AUCFitted")
        outer_merged = fitted.outer_merged_enriched

        # Keep k first false positives and k first predicted test positives for roc auc computation
        cropped = outer_merged[
            (outer_merged["__test_pos_cumsum"] <= self.k)
            & (outer_merged["__fp_cumsum"] < self.k)
            & (~outer_merged[Columns.Rank].isna())
        ]

        # Number of positives in denominator is limited by k (beta = min(k, n_pos))
        cropped_suf, n_pos_suf = self._handle_insufficient_cases(
            outer_merged=cropped,
            n_pos=fitted.n_pos.clip(upper=self.k),
            n_fp_insufficient=fitted.n_fp_insufficient,
        )
        return self._calc_roc_auc(cropped_suf, n_pos_suf)
# Type alias for any concrete ROC AUC based metric defined in this module.
AucMetric = tp.Union[PartialAUC, PAP]
def calc_auc_metrics(
    metrics: tp.Dict[str, AucMetric],
    reco: pd.DataFrame,
    interactions: pd.DataFrame,
) -> tp.Dict[str, float]:
    """
    Calculate any ROC AUC based ranking metric.

    Data is fitted once per fit task returned by `calc_debiased_fit_task` and every metric is
    then calculated from the fitted data stored under its own `debias_config`.

    Warning: It is not recommended to use this function directly.
    Use `calc_metrics` instead.

    Parameters
    ----------
    metrics : dict(str -> AucMetric)
        Dict of metric objects to calculate,
        where key is metric name and value is metric object.
    reco : pd.DataFrame
        Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
    interactions : pd.DataFrame
        Interactions table with columns `Columns.User`, `Columns.Item`.

    Returns
    -------
    dict(str->float)
        Dictionary where keys are the same with keys in `metrics`
        and values are metric calculation results.
    """
    # Data for insufficient cases processing is collected if at least one metric requires it
    collect_insufficient_stats = any(
        metric.insufficient_handling != InsufficientHandling.IGNORE for metric in metrics.values()
    )

    # Each task is keyed by debias config and holds (k_max, interactions) to fit on
    fit_tasks = calc_debiased_fit_task(metrics.values(), interactions)
    fitted_by_config = {
        config: _AUCMetric.fit(reco, task_interactions, task_k_max, collect_insufficient_stats)
        for config, (task_k_max, task_interactions) in fit_tasks.items()
    }

    return {
        name: metric.calc_from_fitted(
            fitted=fitted_by_config[metric.debias_config],
            is_debiased=metric.debias_config is not None,
        )
        for name, metric in metrics.items()
    }