Source code for rectools.dataset.interactions

#  Copyright 2022-2025 MTS (Mobile Telesystems)
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""Structure for saving user-item interactions."""

import typing as tp

import attr
import numpy as np
import pandas as pd
from scipy import sparse

from rectools import Columns

from .identifiers import IdMap


[docs]@attr.s(frozen=True, slots=True) class Interactions: """ Structure to store info about user-item interactions. Usually it's more convenient to use `from_raw` method instead of direct creating. Parameters ---------- df : pd.DataFrame Table where every row contains user-item interaction and columns are: - `Columns.User` - internal user id (non-negative int values); - `Columns.Item` - internal item id (non-negative int values); - `Columns.Weight` - weight of interaction, float, use ``1`` if interactions have no weight; - `Columns.Datetime` - timestamp of interactions, assign random value if you're not going to use it later. Extra columns can also be present. """ df: pd.DataFrame = attr.ib() @staticmethod def _check_columns_present(df: pd.DataFrame) -> None: required_columns = {Columns.User, Columns.Item, Columns.Weight, Columns.Datetime} actual_columns = set(df.columns) if not actual_columns >= required_columns: raise KeyError(f"Missed columns {required_columns - actual_columns}")
[docs] @staticmethod def convert_weight_and_datetime_types(df: pd.DataFrame) -> None: """ Convert weight column to float and datetime column to datetime64[ns] in-place. This method ensures that the specified weight column contains numeric values and that the datetime column can be converted to pandas' datetime64[ns] format. The conversion is done in-place, so the original DataFrame will be modified. Parameters ---------- df : pd.DataFrame Input DataFrame that must contain the following columns: - `Columns.Weight` - interaction weight; - `Columns.Datetime` - interaction timestamp. """ try: df[Columns.Weight] = df[Columns.Weight].astype(float) except ValueError: raise TypeError(f"Column '{Columns.Weight}' must be numeric") try: df[Columns.Datetime] = df[Columns.Datetime].astype("datetime64[ns]") except ValueError: raise TypeError(f"Column '{Columns.Datetime}' must be convertible to 'datetime64' type")
@df.validator def _check_columns_present_validator(self, _: str, df: pd.DataFrame) -> None: self._check_columns_present(df) @df.validator def _check_ids(self, _: str, df: pd.DataFrame) -> None: for col in (Columns.User, Columns.Item): if not df[col].dtype.name.startswith(("int", "uint")): raise TypeError(f"Column '{col}' must be integer") if df[col].min() < 0: raise ValueError(f"Column '{col}' values must be >= 0") def __attrs_post_init__(self) -> None: """Convert datetime and weight columns to the right data types.""" self.convert_weight_and_datetime_types(self.df) @staticmethod def _add_extra_cols(df: pd.DataFrame, interactions: pd.DataFrame) -> None: extra_cols = [col for col in interactions.columns if col not in df.columns] for extra_col in extra_cols: df[extra_col] = interactions[extra_col].values
[docs] @classmethod def from_raw( cls, interactions: pd.DataFrame, user_id_map: IdMap, item_id_map: IdMap, keep_extra_cols: bool = False ) -> "Interactions": """ Create `Interactions` from dataset with external ids and id mappings. Parameters ---------- interactions : pd.DataFrame Table where every row contains user-item interaction and columns are: - `Columns.User` - user id; - `Columns.Item` - item id; - `Columns.Weight` - weight of interaction, float, use ``1`` if interactions have no weight; - `Columns.Datetime` - timestamp of interactions, assign random value if you're not going to use it later. user_id_map : IdMap User identifiers mapping. item_id_map : IdMap Item identifiers mapping. keep_extra_cols: bool, default ``False`` Flag to keep all columns from interactions besides the default ones. Returns ------- Interactions """ cls._check_columns_present(interactions) df = pd.DataFrame( { Columns.User: user_id_map.convert_to_internal(interactions[Columns.User]), Columns.Item: item_id_map.convert_to_internal(interactions[Columns.Item]), }, ) df[Columns.Weight] = interactions[Columns.Weight].values df[Columns.Datetime] = interactions[Columns.Datetime].values cls.convert_weight_and_datetime_types(df) if keep_extra_cols: cls._add_extra_cols(df, interactions) return cls(df)
[docs] def get_user_item_matrix(self, include_weights: bool = True, dtype: tp.Type = np.float32) -> sparse.csr_matrix: """ Form a user-item CSR matrix based on interactions data. Parameters ---------- include_weights : bool, default ``True`` Whether include interaction weights in matrix or not. If ``False``, all values in returned matrix will be equal to ``1``. Returns ------- csr_matrix """ if include_weights: values = self.df[Columns.Weight].values else: values = np.ones(len(self.df)) csr = sparse.csr_matrix( ( values.astype(dtype), ( self.df[Columns.User].values, self.df[Columns.Item].values, ), ), ) return csr
[docs] def to_external( self, user_id_map: IdMap, item_id_map: IdMap, include_weight: bool = True, include_datetime: bool = True, include_extra_cols: tp.Union[bool, tp.List[str]] = True, ) -> pd.DataFrame: """ Convert itself to `pd.DataFrame` with replacing internal user and item ids to external ones. Parameters ---------- user_id_map : IdMap User id map that has to be used for converting internal user ids to external ones. item_id_map : IdMap Item id map that has to be used for converting internal item ids to external ones. include_weight : bool, default ``True`` Whether to include weight column into resulting table or not include_datetime : bool, default ``True`` Whether to include datetime column into resulting table or not. include_extra_cols: bool or List[str], default ``True`` If bool, indicates whether to include all extra columns into resulting table or not. If list of strings, indicates which extra columns to include into resulting table. Returns ------- pd.DataFrame """ res = pd.DataFrame( { Columns.User: user_id_map.convert_to_external(self.df[Columns.User].values), Columns.Item: item_id_map.convert_to_external(self.df[Columns.Item].values), } ) cols_to_add = [] if include_weight: cols_to_add.append(Columns.Weight) if include_datetime: cols_to_add.append(Columns.Datetime) extra_cols = [] if isinstance(include_extra_cols, list): extra_cols = [col for col in include_extra_cols if col in self.df and col not in Columns.Interactions] elif include_extra_cols: extra_cols = [col for col in self.df if col not in Columns.Interactions] cols_to_add.extend(extra_cols) for col in cols_to_add: res[col] = self.df[col] return res