public code v1

2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
@@ -0,0 +1,15 @@
+from .splitter import Splitter
+from .model_evaluator import ModelEvaluator
+from .explainer_evaluator import ExplanationEvaluator
+from .evaluation_pipelines import (
+    run_evaluation_with_proper_split,
+    run_leave_one_out_evaluation,
+)
+
+# Names exported by a star-import of this module; each one is brought into
+# scope by the relative imports above.
+__all__ = [
+    "Splitter",
+    "ModelEvaluator",
+    "ExplanationEvaluator",
+    "run_evaluation_with_proper_split",
+    "run_leave_one_out_evaluation",
+]
@@ -0,0 +1,251 @@
+import time
+from typing import Dict
+import pandas as pd
+import numpy as np
+from pygrex.data_reader.data_reader import DataReader
+from pygrex.evaluator import Splitter, ModelEvaluator
+
+
+def run_leave_one_out_evaluation(
+    data_reader: DataReader, model, top_n: int = 10
+) -> Dict:
+    """
+    Run a leave-one-out evaluation of ``model`` and report ranking metrics.
+
+    One interaction per user is held out with ``Splitter.split_leave_n_out``,
+    held-out rows whose user or item is absent from the training part are
+    dropped, ``model.fit`` is called on the training part, and Hit Ratio and
+    NDCG are computed on the generated top-``top_n`` lists. Progress and
+    timings are printed to stdout.
+
+    :param data_reader: DataReader holding the full interaction dataset.
+    :param model: recommender exposing ``fit``; it is fitted in place.
+    :param top_n: length of the recommendation list to evaluate.
+    :return: dict with the keys "Hit Ratio", "NDCG" and "evaluation_time"
+        (wall-clock seconds for the whole run).
+    """
+    print("Starting leave-one-out evaluation...")
+    start_time = time.time()
+
+    # Step 1: hold out a single interaction per user (n=1).
+    train_dr, test_df = Splitter.split_leave_n_out(data_reader, n=1)
+    print(f"Split completed: {len(test_df)} test interactions")
+
+    # Keep only held-out rows whose user and item still occur in the training part.
+    known_users = set(train_dr.dataset["userId"].unique())
+    known_items = set(train_dr.dataset["itemId"].unique())
+    original_test_len = len(test_df)
+    user_is_known = test_df["userId"].isin(known_users)
+    item_is_known = test_df["itemId"].isin(known_items)
+    test_df = test_df[user_is_known & item_is_known]
+    print(
+        f"Filtered test set: {len(test_df)} interactions remaining from {original_test_len}"
+    )
+
+    # Step 2: fit the model on the training part and time it.
+    print("Training model on reduced dataset...")
+    fit_started = time.time()
+    model.fit(train_dr)
+    train_time = time.time() - fit_started
+    print(f"Model training completed in {train_time:.2f} seconds")
+
+    # Step 3: build the ranked lists for every user left in the test set.
+    print("Generating recommendations...")
+    rec_started = time.time()
+    recommendations = generate_recommendations_batch(model, train_dr, test_df, top_n)
+    rec_time = time.time() - rec_started
+    print(f"Recommendations generated in {rec_time:.2f} seconds")
+
+    # Step 4: score the lists against the held-out interactions.
+    evaluator = ModelEvaluator(test_df, top_n=top_n)
+    results = {
+        "Hit Ratio": evaluator.cal_hit_ratio(recommendations),
+        "NDCG": evaluator.cal_ndcg(recommendations),  # standard NDCG, not eNDCG
+    }
+
+    total_time = time.time() - start_time
+    print(f"Total evaluation time: {total_time:.2f} seconds")
+
+    results["evaluation_time"] = total_time
+    return results
+
+
+def generate_recommendations_batch(
+    model,
+    train_dr: DataReader,
+    test_df: pd.DataFrame,
+    top_n: int,
+    max_candidates: int = 10000,
+) -> pd.DataFrame:
+    """
+    Build a top-N recommendation list for every user that occurs in ``test_df``.
+
+    For each user the candidates are all training items the user has not
+    interacted with in the training data. When there are more than
+    ``max_candidates`` of them, a random subset of that size is drawn with
+    ``np.random.choice`` (so results depend on NumPy's global random state).
+    Users whose recommendation step raises are reported on stdout and skipped.
+
+    :param model: recommender; ``recommend`` / ``predict_batch`` are used when
+        present, otherwise per-item ``predict`` calls.
+    :param train_dr: DataReader whose ``dataset`` holds the training interactions.
+    :param test_df: DataFrame with at least a 'userId' column.
+    :param top_n: number of items to keep per user.
+    :param max_candidates: upper bound on candidates scored per user.
+    :return: DataFrame with columns ['userId', 'itemId', 'rank', 'score'];
+        empty (but with those columns) when nothing could be recommended.
+    """
+    columns = ["userId", "itemId", "rank", "score"]
+    interactions = train_dr.dataset
+    all_items = set(interactions["itemId"].unique())
+
+    # Group the training interactions once instead of scanning the whole
+    # frame again for every test user.
+    seen_by_user = interactions.groupby("userId")["itemId"].apply(set).to_dict()
+
+    # The model's capabilities do not change between users, so pick the
+    # recommendation strategy a single time.
+    if hasattr(model, "predict_batch") or hasattr(model, "recommend"):
+        recommend_for_user = generate_recommendations_efficient
+    else:
+        # Fall back to individual predictions (slower)
+        recommend_for_user = generate_recommendations_individual
+
+    recommendations = []
+    test_users = test_df["userId"].unique()
+    print(f"Generating recommendations for {len(test_users)} users...")
+
+    for i, user_id in enumerate(test_users):
+        if i % 100 == 0:  # Progress indicator
+            print(f"Processing user {i}/{len(test_users)}")
+
+        # Candidate items are the ones the user has not seen in training;
+        # a user without training interactions keeps the full catalogue.
+        candidate_items = list(all_items - seen_by_user.get(user_id, set()))
+
+        # Cap the amount of scoring work per user.
+        if len(candidate_items) > max_candidates:
+            candidate_items = np.random.choice(
+                candidate_items, max_candidates, replace=False
+            ).tolist()
+
+        try:
+            recommendations.extend(
+                recommend_for_user(model, user_id, candidate_items, top_n)
+            )
+        except Exception as e:
+            # Best effort: one failing user must not abort the whole evaluation.
+            print(f"Error generating recommendations for user {user_id}: {e}")
+
+    # An empty list still yields a frame with the expected columns.
+    return pd.DataFrame(recommendations, columns=columns)
+
+
+def generate_recommendations_efficient(
+    model, user_id: int, candidate_items: list, top_n: int
+) -> list:
+    """
+    Produce a user's ranked list through the model's bulk interfaces.
+
+    ``model.recommend`` is preferred; ``model.predict_batch`` is used only when
+    ``recommend`` is missing. If the chosen call raises, or the model offers
+    neither method, the work is delegated to
+    ``generate_recommendations_individual``.
+
+    :return: list of ``(user_id, item_id, rank, score)`` tuples, rank starting at 1.
+    """
+    if hasattr(model, "recommend"):
+        # The model ranks the candidates itself; only the ranks are added here.
+        try:
+            ranked = model.recommend(user_id, candidate_items, top_n)
+            return [
+                (user_id, item_id, position, score)
+                for position, (item_id, score) in enumerate(ranked, 1)
+            ]
+        except Exception:
+            return generate_recommendations_individual(
+                model, user_id, candidate_items, top_n
+            )
+
+    if hasattr(model, "predict_batch"):
+        # Score every (user, item) pair in one call, then rank locally.
+        try:
+            pairs = [(user_id, item_id) for item_id in candidate_items]
+            scores = model.predict_batch(pairs)
+            best_first = sorted(
+                zip(candidate_items, scores), key=lambda pair: pair[1], reverse=True
+            )
+            return [
+                (user_id, item_id, position, score)
+                for position, (item_id, score) in enumerate(best_first[:top_n], 1)
+            ]
+        except Exception:
+            return generate_recommendations_individual(
+                model, user_id, candidate_items, top_n
+            )
+
+    # No bulk interface available at all.
+    return generate_recommendations_individual(
+        model, user_id, candidate_items, top_n
+    )
+
+
+def generate_recommendations_individual(
+    model, user_id: int, candidate_items: list, top_n: int
+) -> list:
+    """
+    Rank candidates by calling ``model.predict`` once per item.
+
+    This is the slow path that works with any model exposing
+    ``predict(user_id, item_id)``. Items whose prediction raises are reported
+    on stdout and left out of the ranking.
+
+    :param model: object with a ``predict(user_id, item_id)`` method.
+    :param user_id: user to recommend for.
+    :param candidate_items: items to score.
+    :param top_n: number of best-scored items to return.
+    :return: list of ``(user_id, item_id, rank, score)`` tuples, best score
+        first, rank starting at 1.
+    """
+    predictions = []
+
+    # The calls are inherently one-by-one, so the candidates are walked
+    # directly; chunking them first would not change how often predict runs.
+    for item_id in candidate_items:
+        try:
+            score = model.predict(user_id, item_id)
+        except Exception as e:
+            print(f"Prediction error for user {user_id}, item {item_id}: {e}")
+            # Skip items that cause prediction errors
+            continue
+        predictions.append((item_id, score))
+
+    # Stable sort: equally scored items keep their candidate order.
+    predictions.sort(key=lambda pair: pair[1], reverse=True)
+
+    return [
+        (user_id, item_id, rank, score)
+        for rank, (item_id, score) in enumerate(predictions[:top_n], 1)
+    ]
+
+
+def run_evaluation_with_proper_split(
+    data_reader: DataReader, model, test_size: float = 0.2, top_n: int = 10
+) -> Dict:
+    """
+    Evaluate ``model`` on a fractional hold-out instead of leave-one-out.
+
+    ``test_size`` is handed to ``Splitter.split_leave_n_out`` as ``frac``.
+    Held-out rows whose user or item is absent from the training part are
+    dropped before the model is fitted and scored. Progress is printed to stdout.
+
+    :param data_reader: DataReader holding the full interaction dataset.
+    :param model: recommender exposing ``fit``; it is fitted in place.
+    :param test_size: fraction passed to the splitter as ``frac``.
+    :param top_n: length of the recommendation list to evaluate.
+    :return: dict with the keys "Hit Ratio", "NDCG", "evaluation_time",
+        "test_interactions" and "total_recommendations".
+    """
+    print(f"Starting evaluation with {test_size * 100}% test split...")
+    start_time = time.time()
+
+    # Step 1: carve out the test interactions.
+    train_dr, test_df = Splitter.split_leave_n_out(data_reader, frac=test_size)
+    print(f"Split completed: {len(test_df)} test interactions")
+
+    # Step 2: keep only test rows whose user and item occur in the training part.
+    known_users = set(train_dr.dataset["userId"].unique())
+    known_items = set(train_dr.dataset["itemId"].unique())
+    original_test_len = len(test_df)
+    user_is_known = test_df["userId"].isin(known_users)
+    item_is_known = test_df["itemId"].isin(known_items)
+    test_df = test_df[user_is_known & item_is_known]
+    print(
+        f"Filtered test set: {len(test_df)} interactions remaining from {original_test_len}"
+    )
+
+    # Step 3: fit the model on the training part.
+    print("Training model...")
+    model.fit(train_dr)
+
+    # Step 4: build the ranked lists.
+    print("Generating recommendations...")
+    recommendations = generate_recommendations_batch(model, train_dr, test_df, top_n)
+
+    # Step 5: score them against the held-out interactions.
+    evaluator = ModelEvaluator(test_df, top_n=top_n)
+    results = {
+        "Hit Ratio": evaluator.cal_hit_ratio(recommendations),
+        "NDCG": evaluator.cal_ndcg(recommendations),
+    }
+
+    total_time = time.time() - start_time
+    print(f"Evaluation completed in {total_time:.2f} seconds")
+
+    results["evaluation_time"] = total_time
+    results["test_interactions"] = len(test_df)
+    results["total_recommendations"] = len(recommendations)
+    return results
@@ -0,0 +1,68 @@
+from typing import Dict, Any
+
+from pygrex.utils import calculate_gild_for_explanations
+
+
+class ExplanationEvaluator:
+    """
+    Computes a common set of quality metrics for explainer output.
+
+    Given the result dictionary produced by an explainer, it reports the
+    fidelity value stored in that dictionary and a GILD diversity score
+    computed from its 'details' entry.
+    """
+
+    def __init__(self):
+        """Create the evaluator; it keeps no state between calls."""
+
+    def evaluate(
+        self, explanation_results: Dict[str, Any], explainer_type: str
+    ) -> Dict[str, float]:
+        """
+        Score one explanation result.
+
+        Args:
+            explanation_results: The dictionary returned by an explainer's
+                                 `find_explanation` method.
+            explainer_type: A string identifier for the explainer used
+                            (e.g., "LORE4Groups", "EXPGRS").
+
+        Returns:
+            A dictionary with the keys "fidelity" and "gild"; both are 0.0
+            when `explanation_results` is empty or None.
+        """
+        scores = {"fidelity": 0.0, "gild": 0.0}
+        if explanation_results:
+            scores["fidelity"] = self._calculate_fidelity(explanation_results)
+            scores["gild"] = self._calculate_gild(explanation_results, explainer_type)
+        return scores
+
+    def _calculate_fidelity(self, explanation_results: Dict[str, Any]) -> float:
+        """
+        Read the fidelity score out of the explanation results.
+
+        The value is taken as-is from the "fidelity" key; 0.0 is returned when
+        the key is missing.
+        """
+        return explanation_results.get("fidelity", 0.0)
+
+    def _calculate_gild(
+        self, explanation_results: Dict[str, Any], explainer_type: str
+    ) -> float:
+        """
+        Compute the GILD score for the 'details' part of the explanation results.
+
+        Delegates to `calculate_gild_for_explanations`; returns 0.0 without
+        calling it when there are no details.
+        """
+        details = explanation_results.get("details", {})
+        if details:
+            return calculate_gild_for_explanations(details, explainer_type)
+        return 0.0
@@ -0,0 +1,179 @@
+import numpy as np
+import pandas as pd
+
+
+class ModelEvaluator:
+    """
+    Top-N ranking metrics (hit ratio, nDCG) against a held-out test set.
+
+    The test set is a dataframe with at least the columns ['userId', 'itemId'];
+    every row is one relevant (positive) item of a user. Recommendations are
+    dataframes with the columns ['userId', 'itemId', 'rank'], rank starting at 1.
+    """
+
+    # Supported rank discounts for DCG.
+    disc_functions = ["log", "linear"]
+
+    def __init__(self, test_set, top_n: int = 10, discount_function: str = "log"):
+        """
+        :param test_set: dataframe, columns = ['userId', 'itemId', ...]
+        :param top_n: ranks above this value are ignored by every metric.
+        :param discount_function: "log" or "linear".
+        """
+        self.test_set = test_set
+        self._top_n = top_n
+        assert discount_function in self.disc_functions, "Wrong Discount Function."
+        self._discount_function = discount_function
+        self.num_users = self.test_set.userId.nunique()
+
+    @property
+    def top_n(self):
+        return self._top_n
+
+    @top_n.setter
+    def top_n(self, top_n: int):
+        self._top_n = top_n
+
+    @property
+    def discount_function(self):
+        return self._discount_function
+
+    @discount_function.setter
+    def discount_function(self, discount_function: str):
+        assert discount_function in self.disc_functions, "Wrong Discount Function."
+        self._discount_function = discount_function
+
+    def cal_hit_ratio(self, recommendations):
+        """
+        Hit Ratio: for every test user, the share of their test items that
+        appear in their top-N recommendations, averaged over the test users.
+        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
+        :return: hit rate.
+        """
+        test_in_top_n = self.get_hits(recommendations)
+        # count hits per user
+        hits_per_user = self.count_positives(test_in_top_n)
+        # merge with the entire list of positive items for user; the right
+        # join keeps test users that have no hit at all
+        hits_per_user = hits_per_user.merge(
+            self.count_positives(self.test_set),
+            on="userId",
+            suffixes=("_true", ""),
+            how="right",
+        )
+        # users with 0 hits come out of the merge as NA
+        hits_per_user = hits_per_user.fillna(0)
+        # hit rate per user, then the average over users
+        hit_rate = hits_per_user.positive_true / hits_per_user.positive
+        return hit_rate.mean()
+
+    def get_hits(self, recommendations):
+        """
+        Find which items in the test set have a hit on the recommendations.
+        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
+        :return: dataframe, the top-N recommendation rows that are also in the test set.
+        """
+        top_n_recommendations = self.filter_to_top_n(recommendations)
+        # inner join: only (user, item) pairs present on both sides survive
+        return pd.merge(top_n_recommendations, self.test_set, on=["userId", "itemId"])
+
+    def filter_to_top_n(self, dataset):
+        """
+        if rank > top_n, we do not use it for evaluation
+        :param dataset: dataframe, columns = ['userId', 'itemId', 'rank']
+        :return: dataframe, columns = ['userId', 'itemId', 'rank']
+        """
+        return dataset[dataset["rank"] <= self.top_n]
+
+    def cal_ndcg(self, recommendations):
+        """
+        Normalized Discounted Cumulative Gain at N (nDCG@N).
+
+        Every hit has a gain of 1, discounted by its rank with the configured
+        discount function (see cal_dcg). The per-user DCG is divided by the
+        ideal DCG, i.e. the DCG obtained when all test items of the user are
+        ranked first, and the ratios are averaged over the test users.
+        Ref: Y. Wang, L. Wang, Y. Li, D. He, T.-Y. Liu, and W. Chen.
+            A theoretical analysis of NDCG type ranking measures.
+        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
+        :return: nDCG
+        """
+        hits = self.get_hits(recommendations)
+
+        DCG = self.cal_dcg(hits)
+        iDCG = self.cal_idcg()
+
+        # left join from the ideal side so test users without hits count as 0
+        nDCG = iDCG.merge(DCG, on="userId", how="left")
+        nDCG = nDCG.fillna(0)
+        # normalize
+        nDCG["ndcg"] = nDCG["dcg"] / nDCG["idcg"]
+
+        return nDCG["ndcg"].mean()
+
+    def cal_dcg(self, hits):
+        """
+        Discounted Cumulative Gain per user.
+
+        The gain of every row is the constant 1, discounted as
+        log(2) / log(rank + 1) for "log" and 1 / rank for "linear".
+        The input dataframe is not modified.
+        :param hits: dataframe, columns = ['userId', 'itemId', 'rank']
+        :return: dataframe, columns = ['userId', 'dcg']
+        """
+        # todo: the gain so far is set to a constant.
+        if self.discount_function == "log":
+            discounted_gain = np.log(2) / np.log(hits["rank"] + 1)
+        elif self.discount_function == "linear":
+            discounted_gain = 1 / hits["rank"]
+        else:
+            raise ValueError("Wrong Discount Function.")
+
+        # Grouping keeps each sum attached to its own userId, whatever the
+        # row order of `hits` is.
+        return (
+            discounted_gain.groupby(hits["userId"]).sum().rename("dcg").reset_index()
+        )
+
+    def cal_idcg(self):
+        """
+        the Ideal DCG, is the DCG for the best ranking possible (i.e. all true positives were recommended first).
+        :return: dataframe, columns = ['userId', 'idcg']
+        """
+        # create a fake ranking for test set items.
+        # We assume that the items in the test set are all on the Top-N list.
+        test_ideal_ranking = self.test_set.copy()
+        # number each user's test rows 1, 2, ... in order of appearance, so the
+        # ranks stay per user even when the rows of different users interleave
+        test_ideal_ranking["rank"] = (
+            test_ideal_ranking.groupby("userId").cumcount() + 1
+        ).to_numpy()
+        # Filter to have at most top-N items.
+        test_ideal_ranking = self.filter_to_top_n(test_ideal_ranking)
+        # get the dcg for the ideal ranking
+        idcg = self.cal_dcg(test_ideal_ranking)
+        return idcg.rename(columns={"dcg": "idcg"})
+
+    @staticmethod
+    def count_positives(dataset):
+        """
+        Returns the positives count.
+        :param dataset: dataframe, columns = ['userId', 'itemId', ...]
+        :return: dataframe, columns = ['userId', 'positive'], one row per user
+        """
+        # reset_index() turns the group key into the userId column, so every
+        # count stays next to the user it belongs to
+        return (
+            dataset.groupby("userId")["itemId"].count().rename("positive").reset_index()
+        )
+
+
+# if __name__ == '__main__':
+#    recoms = pd.DataFrame({
+#        'userId': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+#        'itemId': [1, 2, 3, 4, 1, 2, 2, 3, 4],
+#        'rank': [1, 2, 3, 1, 2, 3, 1, 2, 3]
+#    })
+
+#    test = pd.DataFrame({
+#        'userId': [1, 1, 2, 3],
+#        'itemId': [1, 4, 1, 5]
+#    })
+
+#    eval = ModelEvaluator(test_set=test, top_n=2)
+
+#    assert eval.num_users == 3, 'number of users'
+#    assert eval.top_n == 2, 'number of top n'
+#    eval.top_n = 3
+#    assert eval.top_n == 3, 'changing of top n'
+
+#   print(eval.cal_hit_ratio(recoms))
+#   print(eval.cal_ndcg(recoms))
@@ -0,0 +1,169 @@
+import sys
+import random
+import pandas as pd
+import copy
+
+from pygrex.data_reader.data_reader import DataReader
+
+
+def fix_data_reader_mappings(source: DataReader, target: DataReader):
+    """
+    Carry the user/item counts and the ID mappings of ``source`` over to ``target``.
+
+    The attributes are assigned by reference (no copies are made) and
+    ``target`` is modified in place.
+    :param source: DataReader to read the attributes from.
+    :param target: DataReader that receives them.
+    :returns: ``target``
+    """
+    copied_attributes = (
+        # dataset dimensions
+        "_num_user",
+        "_num_item",
+        # original ID mappings
+        "original_user_id",
+        "original_item_id",
+        "new_user_id",
+        "new_item_id",
+    )
+    for attribute in copied_attributes:
+        setattr(target, attribute, getattr(source, attribute))
+    return target
+
+
+class Splitter:
+    """
+    Train/test splitting strategies for interaction data.
+    args:
+        data: DataReader object, which contains in its dataset attribute 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
+        """
+        Leave N latest interactions out train/test split.
+        Ref:
+        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
+        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.
+        :param data: DataReader with the full dataset.
+        :param n_latest: int, number of latest interactions per user to be in the test set.
+        :returns train as DataReader, test as dataframe
+        """
+
+        # group items by user id and rank them by timestamp, newest first;
+        # method="first" breaks timestamp ties by row order
+        rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
+            method="first", ascending=False
+        )
+
+        # keep in test the n_latest newest interactions of every user
+        test = data.dataset[rank_latest <= n_latest]
+        # keep in train the rest
+        train = DataReader(dataframe=data.dataset.copy())
+        train.dataset = data.dataset[rank_latest > n_latest]
+
+        train = fix_data_reader_mappings(data, train)
+
+        return train, test
+
+    @staticmethod
+    def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
+        """
+        Leave N random interactions per user out train/test split.
+        Ref:
+        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
+        Boston, MA, 2011. 257-297.
+        :param data: DataReader with the full dataset.
+        :param n: int, number of interactions per user to be in the test set (used when frac is None).
+        :param frac: float, fraction of every user's interactions to be in the test set; takes precedence over n.
+        :returns train as DataReader, test as dataframe
+        :raises SystemExit: when a user has fewer than n interactions or frac > 1.
+        """
+        min_nr_ratings_user = min(data.dataset["userId"].value_counts())
+
+        if min_nr_ratings_user < n:
+            sys.exit(
+                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
+                "in the test set)."
+            )
+
+        if frac is not None and frac > 1:
+            sys.exit("f (i.e.) fraction should be smaller than 1.")
+
+        # Work on a positionally indexed frame: the sampled rows can then be
+        # removed by label, which stays exact when the dataset contains
+        # duplicate rows or a non-unique index and keeps the row order.
+        dataset = data.dataset.reset_index(drop=True)
+
+        # group items by user id and extract a random number of items per user
+        grouped = dataset.groupby(["userId"])
+        if frac is not None:
+            test = grouped.sample(frac=frac)
+        else:
+            test = grouped.sample(n=n)
+
+        train_pd = dataset.drop(index=test.index)
+        test = test.reset_index(drop=True)
+
+        train = copy.deepcopy(data)
+        train.dataset = train_pd
+        train = fix_data_reader_mappings(data, train)
+        # every interaction must end up in exactly one of the two parts
+        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]
+
+        return train, test
+
+    def rel_plus_n(
+        self,
+        data,
+        negative_sample_size: int = 99,
+        splitting: str = "latest",
+        n: int = 1,
+    ):
+        """
+        RelPlusN: We build the users test set by extracting one relevant random item (HR_u) from the entire set of
+        rated items. Then a set of random items with unknown relevance (NR_u) is extracted for each user u, where u
+        had no previous interaction with these items. Finally, for each item i in HR_u, the algorithm requests a ranking
+        of the top-N items from the set {i} ∪ NR_u, on which the evaluation is performed. The evaluation metrics
+        are averaged over all the items in HR_u and later over all the users.
+        Ref:
+        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010. Performance of Recommender Algorithms on Top-n
+        Recommendation Tasks. In Proceedings of the Fourth ACM Conference on Recommender Systems (RecSys ’10).
+        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
+        Filtering. In Proceedings of the 26th International Conference on World Wide Web (WWW ’17).
+        :param data: DataReader with the full dataset.
+        :param negative_sample_size: how many negative items to sample per user
+        :param splitting: either "latest" for leave n latest out, or "n" for leave n out
+        :param n: how many to leave out
+        :returns train as DataReader, and the test rows concatenated with the sampled negatives as one dataframe
+        :raises SystemExit: when splitting is neither "latest" nor "n".
+        """
+
+        if splitting == "latest":
+            train, test = self.split_leave_latest_out(data, n)
+        elif splitting == "n":
+            train, test = self.split_leave_n_out(data, n)
+        else:
+            sys.exit('splitting can be either "latest" or "n". ')
+
+        # negatives are drawn against the full dataset, not only the train part
+        neg_sample = self.sample_negative(data, negative_sample_size)
+
+        return train, pd.concat([test, neg_sample], ignore_index=True)
+
+    @staticmethod
+    def sample_negative(data, negative_sample_size):
+        """
+        Sample, for every user, items the user has never interacted with.
+        :param data: object whose dataset attribute has the columns 'userId' and 'itemId'.
+        :param negative_sample_size: number of negative items to draw per user.
+        :returns dataframe, columns = ['userId', 'itemId'], negative_sample_size rows per user
+        :raises ValueError: (from random.sample) when a user has fewer than
+            negative_sample_size items left to draw from.
+        """
+        item_catalogue = set(data.dataset["itemId"])
+        interacted_items = data.dataset.groupby("userId")["itemId"].apply(set)
+
+        userId = []
+        itemId = []
+        for user, seen_items in interacted_items.items():
+            # random.sample() needs a sequence: handing it a set raises
+            # TypeError on Python 3.11 and later.
+            negative_samples = random.sample(
+                tuple(item_catalogue - seen_items), negative_sample_size
+            )
+            userId.extend([int(user)] * negative_sample_size)
+            itemId.extend(int(item) for item in negative_samples)
+
+        return pd.DataFrame.from_dict({"userId": userId, "itemId": itemId})