Source code for rectools.models.rank

#  Copyright 2024 MTS (Mobile Telesystems)
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""Implicit ranker model."""

import typing as tp
from enum import Enum

import implicit.cpu
import numpy as np
from implicit.cpu.matrix_factorization_base import _filter_items_from_sparse_matrix as filter_items_from_sparse_matrix
from scipy import sparse

from rectools import InternalIds
from rectools.models.base import Scores
from rectools.types import InternalIdsArray


[docs]class Distance(Enum):
    """Distance metric"""

    DOT = 1  # Bigger value means closer vectors
    COSINE = 2  # Bigger value means closer vectors
    EUCLIDEAN = 3  # Smaller value means closer vectors


[docs]class ImplicitRanker:
    """
    Ranker model which uses implicit library matrix factorization topk method.

    This ranker is suitable for the following cases of scores calculation:
    1. subject_embeddings.dot(objects_embeddings)
    2. subject_interactions.dot(item-item-similarities)

    Parameters
    ----------
    distance : Distance
        Distance metric.
    subjects_factors : np.ndarray | sparse.csr_matrix
        Array of subjects embeddings, shape (n_subjects, n_factors).
        For item-item similarity models subjects vectors from ui_csr are viewed as factors.
    objects_factors : np.ndarray
        Array with embeddings of all objects, shape (n_objects, n_factors).
        For item-item similarity models item similarity vectors are viewed as factors.
    """

    def __init__(
        self, distance: Distance, subjects_factors: tp.Union[np.ndarray, sparse.csr_matrix], objects_factors: np.ndarray
    ) -> None:
        if isinstance(subjects_factors, sparse.csr_matrix) and distance != Distance.DOT:
            raise ValueError("To use `sparse.csr_matrix` distance must be `Distance.DOT`")

        self.distance = distance
        self.subjects_factors: np.ndarray = subjects_factors.astype(np.float32)
        self.objects_factors: np.ndarray = objects_factors.astype(np.float32)

        self.subjects_norms: np.ndarray
        if distance == Distance.COSINE:
            self.subjects_norms = self._calc_norms(self.subjects_factors, avoid_zeros=True)

        self.subjects_dots: np.ndarray
        if distance == Distance.EUCLIDEAN:
            self.subjects_dots = self._calc_dots(self.subjects_factors)

    def _get_neginf_score(self) -> float:
        # Adding 1 to avoid float calculation errors (we're comparing `scores <= neginf_score`)
        return float(-np.finfo(np.float32).max + 1)

    @staticmethod
    def _calc_dots(factors: np.ndarray) -> np.ndarray:
        return (factors**2).sum(axis=1)

    @staticmethod
    def _calc_norms(factors: np.ndarray, avoid_zeros: bool = False) -> np.ndarray:
        norms = np.linalg.norm(factors, axis=1)
        # Used for COSINE distance
        # If one or both vectors are zero, assume they're orthogonal, need to avoid 0 in denominator
        if avoid_zeros:
            norms[norms == 0] = 1e-10
        return norms

    def _get_mask_for_correct_scores(self, scores: np.ndarray) -> tp.List[bool]:
        """Filter scores from implicit library that are not relevant. Implicit library assigns `neginf` score
        to items that are meant to be filtered (e.g. blacklist items or already seen items)
        """
        num_masked = 0
        min_score = self._get_neginf_score()
        for el in np.flip(scores):
            if el <= min_score:
                num_masked += 1
            else:
                break
        return [True] * (len(scores) - num_masked) + [False] * num_masked

    def _process_implicit_scores(
        self, subject_ids: InternalIds, ids: np.ndarray, scores: np.ndarray
    ) -> tp.Tuple[InternalIds, InternalIds, Scores]:

        all_target_ids = []
        all_reco_ids: tp.List[np.ndarray] = []
        all_scores: tp.List[np.ndarray] = []

        for subject_id, object_ids, object_scores in zip(subject_ids, ids, scores):
            correct_mask = self._get_mask_for_correct_scores(object_scores)
            relevant_scores = object_scores[correct_mask]
            relevant_ids = object_ids[correct_mask]

            if self.distance == Distance.COSINE:
                subject_norm = self.subjects_norms[subject_id]
                relevant_scores /= subject_norm

            if self.distance == Distance.EUCLIDEAN:
                # Restore Euclidean distances from scores
                d2 = self.subjects_dots[subject_id] - relevant_scores
                # Theoretically d2 >= 0, but can be <0 because of rounding errors
                relevant_scores = np.sqrt(np.maximum(d2, 0))

            all_target_ids.extend([subject_id for _ in range(len(relevant_ids))])
            all_reco_ids.append(relevant_ids)
            all_scores.append(relevant_scores)

        return all_target_ids, np.concatenate(all_reco_ids), np.concatenate(all_scores)

[docs]    def rank(
        self,
        subject_ids: InternalIds,
        k: int,
        filter_pairs_csr: tp.Optional[sparse.csr_matrix] = None,
        sorted_object_whitelist: tp.Optional[InternalIdsArray] = None,
        num_threads: int = 0,
    ) -> tp.Tuple[InternalIds, InternalIds, Scores]:
        """Rank objects to proceed inference using implicit library topk cpu method.

        Parameters
        ----------
        subject_ids : csr_matrix
            Array of ids to recommend for.
        k : int
            Derived number of recommendations for every subject id.
        filter_pairs_csr : sparse.csr_matrix, optional, default ``None``
            Subject-object interactions that should be filtered from recommendations.
            This is relevant for u2i case.
        sorted_object_whitelist : sparse.csr_matrix, optional, default ``None``
            Whitelist of object ids.
            If given, only these items will be used for recommendations.
            Otherwise all items from dataset will be used.
        num_threads : int, default 0
            Will be used as `num_threads` parameter for `implicit.cpu.topk.topk`.

        Returns
        -------
        (InternalIds, InternalIds, Scores)
            Array of subject ids, array of recommended items, sorted by score descending and array of scores.
        """
        if sorted_object_whitelist is not None:
            object_factors = self.objects_factors[sorted_object_whitelist]

            if filter_pairs_csr is not None:
                #  filter ui_csr_for_filter matrix to contain only whitelist objects
                filter_query_items = filter_items_from_sparse_matrix(sorted_object_whitelist, filter_pairs_csr)
            else:
                filter_query_items = None

        else:
            # keep all objects and full ui_csr_for_filter
            object_factors = self.objects_factors
            filter_query_items = filter_pairs_csr

        subject_factors = self.subjects_factors[subject_ids]

        object_norms = None  # for DOT and EUCLIDEAN distance
        if self.distance == Distance.COSINE:
            object_norms = self._calc_norms(object_factors, avoid_zeros=True)

        if self.distance == Distance.EUCLIDEAN:
            # Transform factors to get top-k by Euclidean distance using Dot metric
            # https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf
            subject_factors = np.hstack((-np.ones((subject_factors.shape[0], 1)), 2 * subject_factors))
            object_factors = np.hstack(((object_factors**2).sum(axis=1).reshape(-1, 1), object_factors))

        real_k = min(k, object_factors.shape[0])

        ids, scores = implicit.cpu.topk.topk(  # pylint: disable=c-extension-no-member
            items=object_factors,
            query=subject_factors,
            k=real_k,
            item_norms=object_norms,  # query norms for COSINE distance are applied afterwards
            filter_query_items=filter_query_items,  # queries x objects csr matrix for getting neginf scores
            filter_items=None,  # rectools doesn't support blacklist for now
            num_threads=num_threads,
        )

        if sorted_object_whitelist is not None:
            ids = sorted_object_whitelist[ids]

        # filter neginf from implicit scores and apply transformations to scores (for COSINE and EUCLIDEAN distances)
        all_target_ids, all_reco_ids, all_scores = self._process_implicit_scores(subject_ids, ids, scores)

        return all_target_ids, all_reco_ids, all_scores