public code v1

2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
@@ -0,0 +1,169 @@
+import sys
+import random
+import pandas as pd
+import copy
+
+from pygrex.data_reader.data_reader import DataReader
+
+
+def fix_data_reader_mappings(source: DataReader, target: DataReader):
+    target._num_user = source._num_user
+    target._num_item = source._num_item
+    #  Copy over the original ID mappings
+    target.original_user_id = source.original_user_id
+    target.original_item_id = source.original_item_id
+    target.new_user_id = source.new_user_id
+    target.new_item_id = source.new_item_id
+    return target
+
+
+class Splitter:
+    """
+    Super Splitting Class.
+    args:
+        data: DataReader object, which contains in its dataset attribute 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
+        """
+        Leave N latest interactions out train/test split.
+        Ref:
+        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
+        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.
+        :param data:
+        :param n_latest: int, number of latest interactions to be in the the test set.
+        :returns train as DataReader, test as data.frames
+        """
+
+        # group items by suer id and rank them by timestamp
+        rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
+            method="first", ascending=False
+        )
+
+        # keep in test items that are ranked higher than n_latest
+        test = data.dataset[rank_latest <= n_latest]
+        # keep in train the rest
+        train = DataReader(dataframe=data.dataset.copy())
+        train.dataset = data.dataset[rank_latest > n_latest]
+
+        train = fix_data_reader_mappings(data, train)
+
+        return train, test
+
+    @staticmethod
+    def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
+        """
+        Leave N latest interactions out train/test split.
+        Ref:
+        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
+        Boston, MA, 2011. 257-297.
+        :param data:
+        :param n int, number of interactions to be in the the test set.
+        :param frac float, fraction.
+        :returns dataframe train and test
+        """
+        min_nr_ratings_user = min(data.dataset["userId"].value_counts())
+
+        if min_nr_ratings_user < n:
+            sys.exit(
+                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
+                "in the test set)."
+            )
+
+        if frac is not None and frac > 1:
+            sys.exit("f (i.e.) fraction should be smaller than 1.")
+
+        # group items by user id and extraxt a random number of items per user
+        grouped = data.dataset.groupby(["userId"])
+        if frac is not None:
+            test = grouped.sample(frac=frac)
+        else:
+            test = grouped.sample(n=n)
+
+        test = test.reset_index(drop=True)
+        train_pd = pd.merge(
+            data.dataset,
+            test,
+            on=list(data.dataset.columns),
+            how="outer",
+            indicator=True,
+        )
+        train_pd = train_pd[train_pd["_merge"] == "left_only"]
+        train_pd = train_pd.drop(columns="_merge")
+
+        train = copy.deepcopy(data)
+        train.dataset = train_pd
+        train = fix_data_reader_mappings(data, train)
+        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]
+
+        return train, test
+
+    def rel_plus_n(
+        self,
+        data,
+        negative_sample_size: int = 99,
+        splitting: str = "latest",
+        n: int = 1,
+    ):
+        """
+        RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
+        rated items. Then  a set of random items with unknown relevance ($NR_u$), is extracted for each user $u$, where $u$
+        had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests a ranking
+        of the top-$N$ items from the set $ {i} cup NR_u$, on which the evaluation is performed. The evaluation metrics
+        are averaged over all the items in $HR_u$ and later over all the users. In the following, all experiments have been
+        conducted according to this protocol.
+        Ref:
+        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010.   Performance of Recommender Algorithms on Top-n
+        Recommendation Tasks. InProceedings ofthe Fourth ACM Conference on Recommender Systems (RecSys ’10).
+        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
+        Filtering. In Proceedings of the 26th InternationalConference on World Wide Web (WWW ’17).
+        :param data
+        :param negative_sample_size how many negative items to compute
+        :param splitting either latest for leave n latest out, or n for leave n out
+        :param n how many to leave out
+
+        """
+
+        if splitting == "latest":
+            train, test = self.split_leave_latest_out(data, n)
+        elif splitting == "n":
+            train, test = self.split_leave_n_out(data, n)
+        else:
+            sys.exit('splitting can be either "latest" or "n". ')
+
+        neg_sample = self.sample_negative(data, negative_sample_size)
+
+        return train, pd.concat([test, neg_sample], ignore_index=True)
+
+    @staticmethod
+    def sample_negative(data, negative_sample_size):
+        """return all negative items"""
+
+        item_catalogue = set(data.dataset["itemId"])
+
+        interact_status = (
+            data.dataset.groupby("userId")["itemId"]
+            .apply(set)
+            .reset_index()
+            .rename(columns={"itemId": "interacted_items"})
+        )
+        interact_status["negative_items"] = interact_status["interacted_items"].apply(
+            lambda x: item_catalogue - x
+        )
+        interact_status["negative_samples"] = interact_status["negative_items"].apply(
+            lambda x: random.sample(x, negative_sample_size)
+        )
+        interact_status = interact_status[["userId", "negative_samples"]]
+
+        userId = []
+        itemId = []
+        for row in interact_status.itertuples():
+            for i in range(negative_sample_size):
+                userId.append(int(row.userId))
+                itemId.append(int(row.negative_samples[i]))
+
+        return pd.DataFrame.from_dict({"userId": userId, "itemId": itemId})