# Copyright 2022-2024 MTS (Mobile Telesystems)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base metric module."""
import typing as tp
import warnings
import attr
import pandas as pd
from rectools import Columns
# Type alias for an item identifier: either a string or an integer.
ExternalItemId = tp.Union[str, int]
# Type alias for a collection of item identifiers.
# NOTE(review): presumably the set of all items available for recommendation - confirm against usages.
Catalog = tp.Collection[ExternalItemId]
@attr.s(auto_attribs=True)
class MetricAtK:
    """
    Base class of metrics that depend on `k` -
    the number of top recommendations used to calculate a metric.

    Warning: This class should not be used directly.
    Use derived classes instead.

    Parameters
    ----------
    k : int
        Number of items at the top of recommendations list that will be used to calculate metric.
    """

    k: int

    @classmethod
    def _check(
        cls,
        reco: pd.DataFrame,
        interactions: tp.Optional[pd.DataFrame] = None,
        prev_interactions: tp.Optional[pd.DataFrame] = None,
        ref_reco: tp.Optional[pd.DataFrame] = None,
    ) -> None:
        """
        Validate input dataframes.

        Every dataframe that is not ``None`` must contain its required columns.
        Rank columns of `reco` and `ref_reco` are additionally inspected,
        and suspicious content is reported with warnings.

        Parameters
        ----------
        reco : pd.DataFrame
            Recommendations table, must have columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
        interactions : pd.DataFrame, optional
            Interactions table, must have columns `Columns.User`, `Columns.Item`.
        prev_interactions : pd.DataFrame, optional
            Previous interactions table, must have columns `Columns.User`, `Columns.Item`.
        ref_reco : pd.DataFrame, optional
            Reference recommendations table, must have columns
            `Columns.User`, `Columns.Item`, `Columns.Rank`.

        Raises
        ------
        KeyError
            If any provided dataframe lacks a required column.
        """
        cls._check_columns(reco, "reco", (Columns.User, Columns.Item, Columns.Rank))
        cls._check_columns(interactions, "interactions", (Columns.User, Columns.Item))
        cls._check_columns(prev_interactions, "prev_interactions", (Columns.User, Columns.Item))
        cls._check_columns(ref_reco, "ref_reco", (Columns.User, Columns.Item, Columns.Rank))

        cls._check_rank_column(reco, "reco")
        cls._check_rank_column(ref_reco, "ref_reco")

    @staticmethod
    def _check_columns(df: tp.Optional[pd.DataFrame], name: str, required_columns: tp.Iterable[str]) -> None:
        """
        Ensure that `df` contains all `required_columns`.

        Parameters
        ----------
        df : pd.DataFrame, optional
            Dataframe to check. ``None`` is accepted and skipped.
        name : str
            Name of the dataframe, used in the error message.
        required_columns : iterable of str
            Names of columns that must be present.

        Raises
        ------
        KeyError
            If at least one required column is missing.
        """
        if df is None:
            return
        missing = set(required_columns) - set(df.columns)
        if missing:
            raise KeyError(f"Missed columns {missing} in '{name}' dataframe")

    @staticmethod
    def _check_rank_column(reco: tp.Optional[pd.DataFrame], df_name: str) -> None:
        """
        Warn about unexpected content of the rank column.

        A warning is emitted if the column dtype is not integer and
        if the (rounded) minimal rank is not equal to 1.
        Nothing is checked for ``None`` or empty dataframes.

        Parameters
        ----------
        reco : pd.DataFrame, optional
            Dataframe with `Columns.Rank` column.
        df_name : str
            Name of the dataframe, used in warning messages.
        """
        if reco is None or reco.empty:
            return

        ranks = reco[Columns.Rank]
        if ranks.dtype.kind not in ("i", "u"):
            warnings.warn(f"Expected integer dtype of '{Columns.Rank}' column in '{df_name}' dataframe.")

        min_rank = ranks.min()
        # `min()` is NaN/NA when the column holds no valid values; `int(round(...))` cannot
        # convert that, so such a column is reported as "min is not 1" instead of crashing.
        if pd.isna(min_rank) or int(round(min_rank)) != 1:
            warnings.warn(f"Expected min value of '{Columns.Rank}' column in '{df_name}' dataframe to be equal to 1.")
def merge_reco(reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.DataFrame:
    """
    Left-join interactions with recommendations on the user-item pair.

    Only the user and item columns of `interactions` and the user, item and rank
    columns of `reco` take part in the join, so the result has exactly these three columns.
    Interactions that have no matching recommendation get a missing rank.

    Parameters
    ----------
    reco : pd.DataFrame
        Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
    interactions : pd.DataFrame
        Interactions table with columns `Columns.User`, `Columns.Item`.

    Returns
    -------
    pd.DataFrame
        Result of merging.
    """
    join_keys = Columns.UserItem
    # Restrict both tables to the columns that matter before joining.
    interactions_part = interactions.reindex(columns=join_keys)
    reco_part = reco.reindex(columns=join_keys + [Columns.Rank])
    return pd.merge(interactions_part, reco_part, how="left", on=join_keys)
def outer_merge_reco(reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.DataFrame:
    """
    Merge recommendation table with interactions table with outer join. All ranks for all users are
    present with no skipping. Null ranks will be specified for test interactions that were not
    predicted in recommendations.

    This method is useful for AUC based ranking metrics.

    Parameters
    ----------
    reco : pd.DataFrame
        Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`.
    interactions : pd.DataFrame
        Interactions table with columns `Columns.User`, `Columns.Item`.

    Returns
    -------
    pd.DataFrame
        Result of merging with added `__test_positive` boolean column,
        sorted by `Columns.User` and `Columns.Rank`.
    """
    # Unique user-item pairs from interactions, all marked as positives.
    prepared_interactions = interactions.reindex(columns=Columns.UserItem).drop_duplicates()
    prepared_interactions["__test_positive"] = True

    # Keep recommendations only for users that are present in interactions.
    test_users = prepared_interactions[Columns.User].unique()
    prepared_reco = reco[reco[Columns.User].isin(test_users)].reindex(columns=Columns.UserItem + [Columns.Rank])

    merged = pd.merge(
        prepared_interactions,
        prepared_reco,
        on=Columns.UserItem,
        how="outer",
    )

    # For every user build the full list of ranks 1..max_rank, so that after the second
    # outer join ranks that were absent from `reco` still appear as rows.
    max_rank = prepared_reco.groupby(Columns.User)[Columns.Rank].max()
    full_ranks = max_rank.apply(lambda a: list(range(1, a + 1))).explode().rename(Columns.Rank)
    ranked_reco = merged.merge(full_ranks, on=[Columns.User, Columns.Rank], how="outer").sort_values(
        [Columns.User, Columns.Rank]
    )

    # The flag is either True or missing after the outer joins. An explicit comparison always
    # yields a real bool column; `fillna(False)` relied on implicit object-dtype downcasting,
    # which is deprecated in pandas.
    ranked_reco["__test_positive"] = ranked_reco["__test_positive"].eq(True)
    return ranked_reco