Example of model selection using cross-validation with RecTools

CV split
Training a variety of models
Measuring a variety of metrics

[2]:

from pprint import pprint

import numpy as np
import pandas as pd

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplit

Load data

[3]:

%%time
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

--2022-07-28 11:02:17--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5,6M) [application/zip]
Saving to: ‘ml-1m.zip.2’

ml-1m.zip.2         100%[===================>]   5,64M  3,38MB/s    in 1,7s

2022-07-28 11:02:19 (3,38 MB/s) - ‘ml-1m.zip.2’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat
  inflating: ml-1m/ratings.dat
  inflating: ml-1m/README
  inflating: ml-1m/users.dat
CPU times: user 46.9 ms, sys: 27.7 ms, total: 74.6 ms
Wall time: 2.67 s

[4]:

%%time
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",  # Because of 2-chars separators
    header=None,
    names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
)
print(ratings.shape)
ratings.head()

(1000209, 4)
CPU times: user 4.01 s, sys: 156 ms, total: 4.16 s
Wall time: 4.17 s

[4]:

	user_id	item_id	weight	datetime
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

[5]:

ratings["user_id"].nunique(), ratings["item_id"].nunique()

[5]:

(6040, 3706)

[6]:

ratings["weight"].value_counts()

[6]:

4    348971
3    261197
5    226310
2    107557
1     56174
Name: weight, dtype: int64

[7]:

ratings["datetime"] = pd.to_datetime(ratings["datetime"] * 10 ** 9)
print("Time period")
ratings["datetime"].min(), ratings["datetime"].max()

Time period

[7]:

(Timestamp('2000-04-25 23:05:32'), Timestamp('2003-02-28 17:49:50'))

Split interactions for CV

We’ll use last 3 periods of 2 weeks to validate our models.

[8]:

n_folds = 3
unit = "W"
n_units = 2

last_date = ratings[Columns.Datetime].max().normalize()
start_date=last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  # Start date of first test fold
periods=n_folds + 1
freq=f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# Init generator of folds
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(ratings)}")

start_date: 2003-01-10 00:00:00
last_date: 2003-02-28 00:00:00
periods: 4
freq: 2W

Test fold borders: ['2003-01-12' '2003-01-26' '2003-02-09' '2003-02-23']
Real number of folds: 3

Train models

[9]:

# Take few simple models to compare
models = {
    "random": RandomModel(),
    "popular": PopularModel(),
    "most_raited": PopularModel(popularity="sum_weight"),
    "tfidf_k=5": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=5)),
    "tfidf_k=10": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=10)),
    "bm25_k=10_k1=0.05_b=0.1": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=5, K1=0.05, B=0.1)),
}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    "recall": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

K_RECOS = 10

[10]:

%%time

# For each fold generate train and test part of dataset
# Then fit every model, generate recommendations and calculate metrics

results = []

fold_iterator = cv.split(ratings, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = ratings.iloc[train_ids].copy()
    dataset = Dataset.construct(df_train)

    df_test = ratings.iloc[test_ids][Columns.UserItem].copy()
    test_users = np.unique(df_test[Columns.User])

    # Catalog is set of items that we recommend.
    # Sometimes we recommend not all items from train.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        model.fit(dataset)
        recos = model.recommend(
            users=test_users,
            dataset=dataset,
            k=K_RECOS,
            filter_viewed=True,
        )
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
        res = {"fold": i_fold, "model": model_name}
        res.update(metric_values)
        results.append(res)


==================== Fold 0
{'End date': Timestamp('2003-01-26 00:00:00', freq='2W-SUN'),
 'Start date': Timestamp('2003-01-12 00:00:00', freq='2W-SUN'),
 'Test': 419,
 'Test items': 339,
 'Test users': 78,
 'Train': 997837,
 'Train items': 3706,
 'Train users': 6040}

==================== Fold 1
{'End date': Timestamp('2003-02-09 00:00:00', freq='2W-SUN'),
 'Start date': Timestamp('2003-01-26 00:00:00', freq='2W-SUN'),
 'Test': 1105,
 'Test items': 817,
 'Test users': 71,
 'Train': 998256,
 'Train items': 3706,
 'Train users': 6040}

==================== Fold 2
{'End date': Timestamp('2003-02-23 00:00:00', freq='2W-SUN'),
 'Start date': Timestamp('2003-02-09 00:00:00', freq='2W-SUN'),
 'Test': 535,
 'Test items': 443,
 'Test users': 68,
 'Train': 999361,
 'Train items': 3706,
 'Train users': 6040}
CPU times: user 14.3 s, sys: 671 ms, total: 15 s
Wall time: 15 s

[11]:

# Aggregate metrics by folds and compare models
pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"], sort=False).agg("mean")
pivot_results.round(5)

[11]:

	prec@1	prec@10	recall	novelty	serendipity
model
random	0.00490	0.00429	0.00252	6.37545	0.00054
popular	0.04224	0.03695	0.03498	1.59682	0.00020
most_raited	0.03734	0.03691	0.02871	1.60839	0.00017
tfidf_k=5	0.05121	0.03477	0.02991	2.34470	0.00120
tfidf_k=10	0.03306	0.03580	0.02912	2.15511	0.00079
bm25_k=10_k1=0.05_b=0.1	0.04756	0.03964	0.04094	1.80611	0.00033