Example of model selection using cross-validation with RecTools

  • CV split

  • Training a variety of models

  • Measuring a variety of metrics

[2]:
from pprint import pprint

import numpy as np
import pandas as pd

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplit

Load data

[3]:
%%time
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
--2022-07-28 11:02:17--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5,6M) [application/zip]
Saving to: ‘ml-1m.zip.2’

ml-1m.zip.2         100%[===================>]   5,64M  3,38MB/s    in 1,7s

2022-07-28 11:02:19 (3,38 MB/s) - ‘ml-1m.zip.2’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat
  inflating: ml-1m/ratings.dat
  inflating: ml-1m/README
  inflating: ml-1m/users.dat
CPU times: user 46.9 ms, sys: 27.7 ms, total: 74.6 ms
Wall time: 2.67 s
[4]:
%%time
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",  # Because of 2-chars separators
    header=None,
    names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
)
print(ratings.shape)
ratings.head()
(1000209, 4)
CPU times: user 4.01 s, sys: 156 ms, total: 4.16 s
Wall time: 4.17 s
[4]:
user_id item_id weight datetime
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
[5]:
ratings["user_id"].nunique(), ratings["item_id"].nunique()
[5]:
(6040, 3706)
[6]:
ratings["weight"].value_counts()
[6]:
4    348971
3    261197
5    226310
2    107557
1     56174
Name: weight, dtype: int64
[7]:
ratings["datetime"] = pd.to_datetime(ratings["datetime"] * 10 ** 9)
print("Time period")
ratings["datetime"].min(), ratings["datetime"].max()
Time period
[7]:
(Timestamp('2000-04-25 23:05:32'), Timestamp('2003-02-28 17:49:50'))

Split interactions for CV

We’ll use last 3 periods of 2 weeks to validate our models.

[8]:
n_folds = 3
unit = "W"
n_units = 2

last_date = ratings[Columns.Datetime].max().normalize()
start_date=last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  # Start date of first test fold
periods=n_folds + 1
freq=f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# Init generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(ratings)}")
start_date: 2003-01-10 00:00:00
last_date: 2003-02-28 00:00:00
periods: 4
freq: 2W

Test fold borders: ['2003-01-12' '2003-01-26' '2003-02-09' '2003-02-23']
Real number of folds: 3

Train models

[9]:
# Take few simple models to compare
models = {
    "random": RandomModel(),
    "popular": PopularModel(),
    "most_raited": PopularModel(popularity="sum_weight"),
    "tfidf_k=5": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=5)),
    "tfidf_k=10": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=10)),
    "bm25_k=10_k1=0.05_b=0.1": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=5, K1=0.05, B=0.1)),
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    "recall": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

K_RECOS = 10
[10]:
%%time

# For each fold generate train and test part of dataset
# Then fit every model, generate recommendations and calculate metrics

results = []

fold_iterator = cv.split(ratings, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = ratings.iloc[train_ids].copy()
    dataset = Dataset.construct(df_train)

    df_test = ratings.iloc[test_ids][Columns.UserItem].copy()
    test_users = np.unique(df_test[Columns.User])

    # Catalog is set of items that we recommend.
    # Sometimes we recommend not all items from train.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        model.fit(dataset)
        recos = model.recommend(
            users=test_users,
            dataset=dataset,
            k=K_RECOS,
            filter_viewed=True,
        )
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
        res = {"fold": i_fold, "model": model_name}
        res.update(metric_values)
        results.append(res)

==================== Fold 0
{'End date': Timestamp('2003-01-26 00:00:00', freq='2W-SUN'),
 'Start date': Timestamp('2003-01-12 00:00:00', freq='2W-SUN'),
 'Test': 419,
 'Test items': 339,
 'Test users': 78,
 'Train': 997837,
 'Train items': 3706,
 'Train users': 6040}

==================== Fold 1
{'End date': Timestamp('2003-02-09 00:00:00', freq='2W-SUN'),
 'Start date': Timestamp('2003-01-26 00:00:00', freq='2W-SUN'),
 'Test': 1105,
 'Test items': 817,
 'Test users': 71,
 'Train': 998256,
 'Train items': 3706,
 'Train users': 6040}

==================== Fold 2
{'End date': Timestamp('2003-02-23 00:00:00', freq='2W-SUN'),
 'Start date': Timestamp('2003-02-09 00:00:00', freq='2W-SUN'),
 'Test': 535,
 'Test items': 443,
 'Test users': 68,
 'Train': 999361,
 'Train items': 3706,
 'Train users': 6040}
CPU times: user 14.3 s, sys: 671 ms, total: 15 s
Wall time: 15 s
[11]:
# Aggregate metrics by folds and compare models
pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"], sort=False).agg("mean")
pivot_results.round(5)
[11]:
prec@1 prec@10 recall novelty serendipity
model
random 0.00490 0.00429 0.00252 6.37545 0.00054
popular 0.04224 0.03695 0.03498 1.59682 0.00020
most_raited 0.03734 0.03691 0.02871 1.60839 0.00017
tfidf_k=5 0.05121 0.03477 0.02991 2.34470 0.00120
tfidf_k=10 0.03306 0.03580 0.02912 2.15511 0.00079
bm25_k=10_k1=0.05_b=0.1 0.04756 0.03964 0.04094 1.80611 0.00033