py-grex/pygrex/evaluator/splitter.py

import sys
import random
import pandas as pd
import copy

from pygrex.data_reader.data_reader import DataReader


def fix_data_reader_mappings(source: DataReader, target: DataReader):
    """
    Carry the user/item bookkeeping of ``source`` over to ``target``.

    The user and item counts and the four ID-mapping attributes are assigned
    by reference (no copy is made), overwriting whatever ``target`` held
    before.
    :param source: DataReader whose counts and ID mappings are read.
    :param target: DataReader that receives them; modified in place.
    :returns the same ``target`` object
    """
    carried_over = (
        # catalogue sizes
        "_num_user",
        "_num_item",
        # original <-> new ID mappings
        "original_user_id",
        "original_item_id",
        "new_user_id",
        "new_item_id",
    )
    for attr_name in carried_over:
        setattr(target, attr_name, getattr(source, attr_name))
    return target


class Splitter:
    """
    Super Splitting Class.
    args:
        data: DataReader object, which contains in its dataset attribute 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
    """

    def __init__(self):
        pass

    @staticmethod
    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
        """
        Leave N latest interactions out train/test split.
        Ref:
        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.
        :param data:
        :param n_latest: int, number of latest interactions to be in the the test set.
        :returns train as DataReader, test as data.frames
        """

        # group items by suer id and rank them by timestamp
        rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
            method="first", ascending=False
        )

        # keep in test items that are ranked higher than n_latest
        test = data.dataset[rank_latest <= n_latest]
        # keep in train the rest
        train = DataReader(dataframe=data.dataset.copy())
        train.dataset = data.dataset[rank_latest > n_latest]

        train = fix_data_reader_mappings(data, train)

        return train, test

    @staticmethod
    def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
        """
        Leave N latest interactions out train/test split.
        Ref:
        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
        Boston, MA, 2011. 257-297.
        :param data:
        :param n int, number of interactions to be in the the test set.
        :param frac float, fraction.
        :returns dataframe train and test
        """
        min_nr_ratings_user = min(data.dataset["userId"].value_counts())

        if min_nr_ratings_user < n:
            sys.exit(
                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
                "in the test set)."
            )

        if frac is not None and frac > 1:
            sys.exit("f (i.e.) fraction should be smaller than 1.")

        # group items by user id and extraxt a random number of items per user
        grouped = data.dataset.groupby(["userId"])
        if frac is not None:
            test = grouped.sample(frac=frac)
        else:
            test = grouped.sample(n=n)

        test = test.reset_index(drop=True)
        train_pd = pd.merge(
            data.dataset,
            test,
            on=list(data.dataset.columns),
            how="outer",
            indicator=True,
        )
        train_pd = train_pd[train_pd["_merge"] == "left_only"]
        train_pd = train_pd.drop(columns="_merge")

        train = copy.deepcopy(data)
        train.dataset = train_pd
        train = fix_data_reader_mappings(data, train)
        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]

        return train, test

    def rel_plus_n(
        self,
        data,
        negative_sample_size: int = 99,
        splitting: str = "latest",
        n: int = 1,
    ):
        """
        RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
        rated items. Then  a set of random items with unknown relevance ($NR_u$), is extracted for each user $u$, where $u$
        had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests a ranking
        of the top-$N$ items from the set $ {i} cup NR_u$, on which the evaluation is performed. The evaluation metrics
        are averaged over all the items in $HR_u$ and later over all the users. In the following, all experiments have been
        conducted according to this protocol.
        Ref:
        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010.   Performance of Recommender Algorithms on Top-n
        Recommendation Tasks. InProceedings ofthe Fourth ACM Conference on Recommender Systems (RecSys ’10).
        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
        Filtering. In Proceedings of the 26th InternationalConference on World Wide Web (WWW ’17).
        :param data
        :param negative_sample_size how many negative items to compute
        :param splitting either latest for leave n latest out, or n for leave n out
        :param n how many to leave out

        """

        if splitting == "latest":
            train, test = self.split_leave_latest_out(data, n)
        elif splitting == "n":
            train, test = self.split_leave_n_out(data, n)
        else:
            sys.exit('splitting can be either "latest" or "n". ')

        neg_sample = self.sample_negative(data, negative_sample_size)

        return train, pd.concat([test, neg_sample], ignore_index=True)

    @staticmethod
    def sample_negative(data, negative_sample_size):
        """return all negative items"""

        item_catalogue = set(data.dataset["itemId"])

        interact_status = (
            data.dataset.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda x: item_catalogue - x
        )
        interact_status["negative_samples"] = interact_status["negative_items"].apply(
            lambda x: random.sample(x, negative_sample_size)
        )
        interact_status = interact_status[["userId", "negative_samples"]]

        userId = []
        itemId = []
        for row in interact_status.itertuples():
            for i in range(negative_sample_size):
                userId.append(int(row.userId))
                itemId.append(int(row.negative_samples[i]))

        return pd.DataFrame.from_dict({"userId": userId, "itemId": itemId})