public code v1

This commit is contained in:
2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
+15
View File
@@ -0,0 +1,15 @@
from .splitter import Splitter
from .model_evaluator import ModelEvaluator
from .explainer_evaluator import ExplanationEvaluator
from .evaluation_pipelines import (
run_evaluation_with_proper_split,
run_leave_one_out_evaluation,
)
# Public API of the evaluator package: the names re-exported from the
# submodules imported above.
__all__ = [
"Splitter",
"ModelEvaluator",
"ExplanationEvaluator",
"run_evaluation_with_proper_split",
"run_leave_one_out_evaluation",
]
+251
View File
@@ -0,0 +1,251 @@
import time
from typing import Dict
import pandas as pd
import numpy as np
from pygrex.data_reader.data_reader import DataReader
from pygrex.evaluator import Splitter, ModelEvaluator
def run_leave_one_out_evaluation(
    data_reader: DataReader, model, top_n: int = 10
) -> Dict:
    """Evaluate ``model`` with a leave-one-out protocol.

    One interaction per user is sampled into the test set through
    ``Splitter.split_leave_n_out(data_reader, n=1)``. Held-out rows whose
    user or item no longer occurs in the training data are discarded, the
    model is fitted on the training reader, top-N lists are generated for
    the remaining test users and scored with ``ModelEvaluator``.

    Progress and timings are reported with ``print``.

    Args:
        data_reader: Reader holding the full interaction dataset.
        model: Object exposing ``fit(train_reader)`` plus whatever
            prediction method ``generate_recommendations_batch`` relies on.
        top_n: Length of the recommendation lists that are evaluated.

    Returns:
        Dict with the keys ``"Hit Ratio"``, ``"NDCG"`` and
        ``"evaluation_time"`` (wall-clock seconds for the whole run).
    """
    print("Starting leave-one-out evaluation...")
    start_time = time.time()

    # Step 1: hold out exactly one interaction per user.
    train_dr, test_df = Splitter.split_leave_n_out(data_reader, n=1)
    print(f"Split completed: {len(test_df)} test interactions")

    # Drop held-out rows the model cannot score (user or item unseen in train).
    known_users = set(train_dr.dataset["userId"].unique())
    known_items = set(train_dr.dataset["itemId"].unique())
    original_test_len = len(test_df)
    user_is_known = test_df["userId"].isin(known_users)
    item_is_known = test_df["itemId"].isin(known_items)
    test_df = test_df[user_is_known & item_is_known]
    print(
        f"Filtered test set: {len(test_df)} interactions remaining from {original_test_len}"
    )

    # Step 2: fit on the reduced training data.
    print("Training model on reduced dataset...")
    fit_started = time.time()
    model.fit(train_dr)
    train_time = time.time() - fit_started
    print(f"Model training completed in {train_time:.2f} seconds")

    # Step 3: build the top-N lists for every remaining test user.
    print("Generating recommendations...")
    rec_started = time.time()
    recommendations = generate_recommendations_batch(model, train_dr, test_df, top_n)
    rec_time = time.time() - rec_started
    print(f"Recommendations generated in {rec_time:.2f} seconds")

    # Step 4: score the lists against the held-out interactions.
    evaluator = ModelEvaluator(test_df, top_n=top_n)
    results = {
        "Hit Ratio": evaluator.cal_hit_ratio(recommendations),
        "NDCG": evaluator.cal_ndcg(recommendations),  # standard NDCG, not eNDCG
    }

    total_time = time.time() - start_time
    print(f"Total evaluation time: {total_time:.2f} seconds")
    results["evaluation_time"] = total_time
    return results
def generate_recommendations_batch(
    model,
    train_dr: DataReader,
    test_df: pd.DataFrame,
    top_n: int,
    max_candidates: int = 10000,
) -> pd.DataFrame:
    """Generate top-N recommendations for every user in ``test_df``.

    For each test user the candidate set is every training item the user
    has not interacted with in the training data. Candidates are scored
    through ``generate_recommendations_efficient`` when the model exposes
    ``predict_batch`` or ``recommend``, otherwise through
    ``generate_recommendations_individual``. A failure for one user is
    printed and that user is skipped, so the run is best-effort.

    Args:
        model: Fitted model used for scoring.
        train_dr: Reader whose ``dataset`` frame has ``userId`` and
            ``itemId`` columns.
        test_df: Held-out interactions; only its ``userId`` column is read.
        top_n: Number of recommendations to keep per user.
        max_candidates: Upper bound on the candidates scored per user.
            Larger candidate sets are randomly subsampled without
            replacement to this size.

    Returns:
        DataFrame with columns ``['userId', 'itemId', 'rank', 'score']``
        (empty, with those columns, when nothing could be generated).
    """
    columns = ["userId", "itemId", "rank", "score"]
    train = train_dr.dataset
    all_items = set(train["itemId"].unique())
    test_users = test_df["userId"].unique()

    # Build the user -> seen-items lookup once instead of scanning the whole
    # training frame for every user inside the loop.
    train_of_test_users = train[train["userId"].isin(test_users)]
    seen_by_user = {
        uid: set(items)
        for uid, items in train_of_test_users.groupby("userId")["itemId"]
    }

    recommendations = []
    print(f"Generating recommendations for {len(test_users)} users...")
    for i, user_id in enumerate(test_users):
        if i % 100 == 0:  # Progress indicator
            print(f"Processing user {i}/{len(test_users)}")

        # Candidate items are the ones the user has not seen in training.
        candidate_items = list(all_items - seen_by_user.get(user_id, set()))

        # NOTE(review): the random subsample can drop the user's held-out
        # items, which makes a hit impossible for that user and the result
        # non-deterministic. Confirm whether this is intended.
        if len(candidate_items) > max_candidates:
            candidate_items = np.random.choice(
                candidate_items, max_candidates, replace=False
            ).tolist()

        try:
            if hasattr(model, "predict_batch") or hasattr(model, "recommend"):
                user_recs = generate_recommendations_efficient(
                    model, user_id, candidate_items, top_n
                )
            else:
                # Fall back to one prediction per candidate (slower).
                user_recs = generate_recommendations_individual(
                    model, user_id, candidate_items, top_n
                )
            recommendations.extend(user_recs)
        except Exception as e:
            # Deliberately best-effort: report and move on to the next user.
            print(f"Error generating recommendations for user {user_id}: {e}")
            continue

    # An empty list still yields a frame carrying the expected columns.
    return pd.DataFrame(recommendations, columns=columns)
def generate_recommendations_efficient(
    model, user_id: int, candidate_items: list, top_n: int
) -> list:
    """Rank candidates through the model's bulk API when it has one.

    ``model.recommend(user_id, candidate_items, top_n)`` is preferred and is
    expected to yield ``(item_id, score)`` pairs already in ranked order.
    When the model has no ``recommend`` attribute but has ``predict_batch``,
    all ``(user_id, item_id)`` pairs are scored in one call and the ``top_n``
    highest scores are kept. If the chosen method raises, or the model has
    neither method, the work is delegated to
    ``generate_recommendations_individual``.

    Returns:
        List of ``(user_id, item_id, rank, score)`` tuples, rank starting at 1.
    """
    if hasattr(model, "recommend"):
        try:
            ranked = model.recommend(user_id, candidate_items, top_n)
            return [
                (user_id, item_id, position, score)
                for position, (item_id, score) in enumerate(ranked, 1)
            ]
        except Exception:
            # Fall through to the per-item fallback below.
            pass
    elif hasattr(model, "predict_batch"):
        try:
            pairs = [(user_id, item_id) for item_id in candidate_items]
            scores = model.predict_batch(pairs)
            # Highest score first; ties keep the candidate order.
            by_score = sorted(
                zip(candidate_items, scores),
                key=lambda pair: pair[1],
                reverse=True,
            )
            return [
                (user_id, item_id, position, score)
                for position, (item_id, score) in enumerate(by_score[:top_n], 1)
            ]
        except Exception:
            # Fall through to the per-item fallback below.
            pass

    return generate_recommendations_individual(
        model, user_id, candidate_items, top_n
    )
def generate_recommendations_individual(
    model, user_id: int, candidate_items: list, top_n: int
) -> list:
    """Rank candidates with one ``model.predict(user_id, item_id)`` call each.

    Works with any model exposing ``predict``. A candidate whose prediction
    raises is reported with ``print`` and left out of the ranking.

    Returns:
        List of ``(user_id, item_id, rank, score)`` tuples for the ``top_n``
        highest-scoring candidates, rank starting at 1. Ties keep the order
        of ``candidate_items``.
    """
    scored = []
    for item_id in candidate_items:
        try:
            scored.append((item_id, model.predict(user_id, item_id)))
        except Exception as e:
            # Skip candidates the model cannot score.
            print(f"Prediction error for user {user_id}, item {item_id}: {e}")

    best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]
    return [
        (user_id, item_id, position, score)
        for position, (item_id, score) in enumerate(best, 1)
    ]
def run_evaluation_with_proper_split(
    data_reader: DataReader, model, test_size: float = 0.2, top_n: int = 10
) -> Dict:
    """Evaluate ``model`` on a fractional per-user train/test split.

    Alternative to leave-one-out: the split comes from
    ``Splitter.split_leave_n_out(data_reader, frac=test_size)``. Test rows
    whose user or item is missing from the training data are dropped, the
    model is fitted on the training reader, and the generated top-N lists
    are scored with ``ModelEvaluator``.

    Progress is reported with ``print``.

    Args:
        data_reader: Reader holding the full interaction dataset.
        model: Object exposing ``fit(train_reader)`` plus whatever
            prediction method ``generate_recommendations_batch`` relies on.
        test_size: Fraction forwarded to the splitter as ``frac``.
        top_n: Length of the recommendation lists that are evaluated.

    Returns:
        Dict with the keys ``"Hit Ratio"``, ``"NDCG"``,
        ``"evaluation_time"``, ``"test_interactions"`` and
        ``"total_recommendations"``.
    """
    print(f"Starting evaluation with {test_size * 100}% test split...")
    start_time = time.time()

    # Step 1: split the data into train and test.
    train_dr, test_df = Splitter.split_leave_n_out(data_reader, frac=test_size)
    print(f"Split completed: {len(test_df)} test interactions")

    # Step 2: keep only test rows whose user and item both occur in training.
    known_users = set(train_dr.dataset["userId"].unique())
    known_items = set(train_dr.dataset["itemId"].unique())
    original_test_len = len(test_df)
    keep = test_df["userId"].isin(known_users) & test_df["itemId"].isin(known_items)
    test_df = test_df[keep]
    print(
        f"Filtered test set: {len(test_df)} interactions remaining from {original_test_len}"
    )

    # Step 3: train the model.
    print("Training model...")
    model.fit(train_dr)

    # Step 4: generate the recommendation lists.
    print("Generating recommendations...")
    recommendations = generate_recommendations_batch(model, train_dr, test_df, top_n)

    # Step 5: compute the ranking metrics.
    evaluator = ModelEvaluator(test_df, top_n=top_n)
    hit_ratio = evaluator.cal_hit_ratio(recommendations)
    ndcg = evaluator.cal_ndcg(recommendations)

    total_time = time.time() - start_time
    print(f"Evaluation completed in {total_time:.2f} seconds")

    summary = {"Hit Ratio": hit_ratio, "NDCG": ndcg}
    summary["evaluation_time"] = total_time
    summary["test_interactions"] = len(test_df)
    summary["total_recommendations"] = len(recommendations)
    return summary
+68
View File
@@ -0,0 +1,68 @@
from typing import Dict, Any
from pygrex.utils import calculate_gild_for_explanations
class ExplanationEvaluator:
    """
    Single entry point for scoring the output of different explainers.

    Given the result dictionary produced by an explainer, it reports a fixed
    set of quality metrics: fidelity and diversity (GILD).
    """

    def __init__(self):
        """Create the evaluator; it keeps no state between calls."""

    def evaluate(
        self, explanation_results: Dict[str, Any], explainer_type: str
    ) -> Dict[str, float]:
        """
        Compute every supported metric for one explanation result.

        Args:
            explanation_results: The dictionary returned by an explainer's
                `find_explanation` method.
            explainer_type: String identifier of the explainer that produced
                the result (e.g., "LORE4Groups", "EXPGRS").

        Returns:
            A dictionary with the keys "fidelity" and "gild". Both are 0.0
            when `explanation_results` is empty or None.
        """
        if explanation_results:
            return {
                "fidelity": self._calculate_fidelity(explanation_results),
                "gild": self._calculate_gild(explanation_results, explainer_type),
            }
        return {"fidelity": 0.0, "gild": 0.0}

    def _calculate_fidelity(self, explanation_results: Dict[str, Any]) -> float:
        """
        Read the fidelity score reported by the explainer.

        The value is not recomputed here: it is taken from the "fidelity"
        key of the result, defaulting to 0.0 when the key is missing.
        """
        return explanation_results.get("fidelity", 0.0)

    def _calculate_gild(
        self, explanation_results: Dict[str, Any], explainer_type: str
    ) -> float:
        """
        Compute the GILD diversity score of the explanations.

        Delegates to `calculate_gild_for_explanations`, passing the "details"
        entry of the result. Returns 0.0 when that entry is missing or empty.
        """
        details = explanation_results.get("details", {})
        if details:
            return calculate_gild_for_explanations(details, explainer_type)
        return 0.0
+179
View File
@@ -0,0 +1,179 @@
import numpy as np
import pandas as pd
class ModelEvaluator:
    """
    Ranking metrics (hit ratio and nDCG) for top-N recommendation lists.

    The evaluator is built around a test set of held-out positive
    interactions and compares it with recommendation frames that carry a
    1-based ``rank`` column.

    Attributes:
        test_set: dataframe with at least the columns ['userId', 'itemId'].
        num_users: number of distinct users in the test set.
    """

    # Supported rank discounts for DCG.
    disc_functions = ["log", "linear"]

    def __init__(self, test_set, top_n: int = 10, discount_function: str = "log"):
        """
        :param test_set: dataframe, columns include ['userId', 'itemId']
        :param top_n: only recommendations with rank <= top_n are evaluated
        :param discount_function: one of ``disc_functions``
        """
        self.test_set = test_set
        self._top_n = top_n
        # Kept as an assert so existing callers see the same exception type.
        assert discount_function in self.disc_functions, "Wrong Discount Function."
        self._discount_function = discount_function
        self.num_users = self.test_set.userId.nunique()

    @property
    def top_n(self):
        """Cut-off rank used by every metric."""
        return self._top_n

    @top_n.setter
    def top_n(self, top_n: int):
        self._top_n = top_n

    @property
    def discount_function(self):
        """Name of the rank discount used for DCG ('log' or 'linear')."""
        return self._discount_function

    @discount_function.setter
    def discount_function(self, discount_function: str):
        assert discount_function in self.disc_functions, "Wrong Discount Function."
        self._discount_function = discount_function

    def cal_hit_ratio(self, recommendations):
        """
        Hit Ratio: for each test user, the share of their test items that
        appear in the top-N recommendations, averaged over all test users.

        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: hit rate.
        """
        test_in_top_n = self.get_hits(recommendations)
        # Hits per user, right-joined onto the positives per user so that
        # users without any hit are kept (their hit count becomes NA).
        hits_per_user = self.count_positives(test_in_top_n).merge(
            self.count_positives(self.test_set),
            on="userId",
            suffixes=("_true", ""),
            how="right",
        )
        hits_per_user = hits_per_user.fillna(0)
        # positive_true = hits, positive = size of the user's test set.
        hit_rate = hits_per_user.positive_true / hits_per_user.positive
        return hit_rate.mean()

    def get_hits(self, recommendations):
        """
        Find which recommended items (within the top-N) are in the test set.

        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe with only the recommendation rows that match a
            test-set (userId, itemId) pair.
        """
        top_n_recommendations = self.filter_to_top_n(recommendations)
        # The inner join keeps only recommendations that are test positives.
        return pd.merge(
            top_n_recommendations, self.test_set, on=["userId", "itemId"]
        )

    def filter_to_top_n(self, dataset):
        """
        Drop rows with rank > top_n; they are not used for evaluation.

        :param dataset: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe, columns = ['userId', 'itemId', 'rank']
        """
        return dataset[dataset["rank"] <= self.top_n]

    def cal_ndcg(self, recommendations):
        r"""
        Normalized Discounted Cumulative Gain at N (nDCG@N), averaged over
        the test users. Every hit has a constant gain of 1, so with the
        'log' discount

            DCG_u = \sum_{i \in hits_u} \frac{1}{\log_2(rank_i + 1)}

        and with the 'linear' discount each hit contributes 1 / rank_i.
        IDCG_u is the same sum for the ideal ranking, in which the user's
        test items occupy ranks 1, 2, ... (cut at top-N), and

            nDCG_u = \frac{DCG_u}{IDCG_u}

        Users without any hit contribute 0.

        Ref: Y. Wang, L. Wang, Y. Li, D. He, T.-Y. Liu, and W. Chen.
        A theoretical analysis of NDCG type ranking measures.

        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: nDCG
        """
        hits = self.get_hits(recommendations)
        dcg = self.cal_dcg(hits)
        idcg = self.cal_idcg()
        # A left join from iDCG keeps test users that have no hits.
        ndcg = idcg.merge(dcg, on="userId", how="left")
        ndcg = ndcg.fillna(0)
        ndcg["ndcg"] = ndcg["dcg"] / ndcg["idcg"]
        return ndcg["ndcg"].mean()

    def cal_dcg(self, hits):
        """
        Discounted Cumulative Gain per user.

        The input frame is not modified.

        :param hits: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe, columns = ['userId', 'dcg']
        """
        # todo: the gain so far is set to a constant.
        if self.discount_function == "log":
            discounted_gain = np.log(2) / np.log(hits["rank"] + 1)
        else:  # "linear" - the constructor and setter allow nothing else
            discounted_gain = 1 / hits["rank"]
        # assign() works on a copy, so the caller's frame is left untouched.
        scored = hits.assign(discounted_gain=discounted_gain)
        dcg = scored.groupby("userId")["discounted_gain"].sum()
        # reset_index() keeps every userId next to its own sum. Pairing the
        # sums with hits["userId"].unique() would misalign them whenever the
        # rows are not sorted by userId.
        return dcg.rename("dcg").reset_index()

    def cal_idcg(self):
        """
        The ideal DCG: the DCG of the best possible ranking, i.e. all of a
        user's test items recommended first.

        :return: dataframe, columns = ['userId', 'idcg']
        """
        test_ideal_ranking = self.test_set.copy()
        # Rank each user's test items 1..k with a per-user running count, so
        # the result does not depend on the row order of the test set.
        test_ideal_ranking["rank"] = (
            test_ideal_ranking.groupby("userId").cumcount() + 1
        )
        # Keep at most top-N items per user.
        test_ideal_ranking = self.filter_to_top_n(test_ideal_ranking)
        idcg = self.cal_dcg(test_ideal_ranking)
        return idcg.rename(columns={"dcg": "idcg"})

    @staticmethod
    def count_positives(dataset):
        """
        Count the rows (positives) per user.

        :param dataset: dataframe, columns include ['userId', 'itemId']
        :return: dataframe, columns = ['userId', 'positive'], one row per user
        """
        # reset_index() keeps each count attached to its own userId even when
        # the input is not sorted by user.
        return (
            dataset.groupby("userId")["itemId"]
            .count()
            .rename("positive")
            .reset_index()
        )
# if __name__ == '__main__':
# recoms = pd.DataFrame({
# 'userId': [1, 1, 1, 2, 2, 2, 3, 3, 3],
# 'itemId': [1, 2, 3, 4, 1, 2, 2, 3, 4],
# 'rank': [1, 2, 3, 1, 2, 3, 1, 2, 3]
# })
# test = pd.DataFrame({
# 'userId': [1, 1, 2, 3],
# 'itemId': [1, 4, 1, 5]
# })
# eval = ModelEvaluator(test_set=test, top_n=2)
# assert eval.num_users == 3, 'number of users'
# assert eval.top_n == 2, 'number of top n'
# eval.top_n = 3
# assert eval.top_n == 3, 'changing of top n'
# print(eval.cal_hit_ratio(recoms))
# print(eval.cal_ndcg(recoms))
+169
View File
@@ -0,0 +1,169 @@
import sys
import random
import pandas as pd
import copy
from pygrex.data_reader.data_reader import DataReader
def fix_data_reader_mappings(source: DataReader, target: DataReader):
    """Copy the user/item counts and ID mappings from ``source`` to ``target``.

    The attributes are assigned by reference (no copies are made), so both
    readers share the same mapping objects afterwards.

    :param source: reader whose counts and mappings are read
    :param target: reader that receives them; modified in place
    :returns the modified ``target``
    """
    shared_attributes = (
        # user / item counts
        "_num_user",
        "_num_item",
        # original <-> new ID mappings
        "original_user_id",
        "original_item_id",
        "new_user_id",
        "new_item_id",
    )
    for attribute in shared_attributes:
        setattr(target, attribute, getattr(source, attribute))
    return target
class Splitter:
    """
    Train/test splitting strategies for recommender evaluation.

    Every method takes a DataReader whose ``dataset`` attribute is a dataframe
    with the 4 columns ['userId', 'itemId', 'rating', 'timestamp'].
    """

    def __init__(self):
        pass

    @staticmethod
    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
        """
        Leave N latest interactions out train/test split.

        Ref:
        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.

        :param data: DataReader with the full dataset.
        :param n_latest: int, number of latest interactions per user to put in the test set.
        :returns train as DataReader, test as dataframe
        """
        # Rank each user's interactions by timestamp, newest first;
        # method="first" breaks timestamp ties by row order.
        rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
            method="first", ascending=False
        )
        # The n_latest newest interactions of every user form the test set.
        test = data.dataset[rank_latest <= n_latest]
        # Everything else stays in train.
        train = DataReader(dataframe=data.dataset.copy())
        train.dataset = data.dataset[rank_latest > n_latest]
        train = fix_data_reader_mappings(data, train)
        return train, test

    @staticmethod
    def split_leave_n_out(
        data: DataReader,
        n: int = 1,
        frac: float | None = None,
        random_state: int | None = None,
    ):
        """
        Leave N random interactions per user out train/test split.

        Ref:
        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
        Boston, MA, 2011. 257-297.

        :param data: DataReader with the full dataset.
        :param n: int, number of interactions per user to put in the test set (used when frac is None).
        :param frac: float, fraction of each user's interactions to put in the test set; overrides n when given.
        :param random_state: optional seed forwarded to the sampler to make the split reproducible.
        :returns train as DataReader, test as dataframe

        Exits through ``sys.exit`` when a user has fewer than ``n`` ratings
        or when ``frac`` is greater than 1.
        """
        min_nr_ratings_user = min(data.dataset["userId"].value_counts())
        if min_nr_ratings_user < n:
            sys.exit(
                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
                "in the test set)."
            )
        if frac is not None and frac > 1:
            sys.exit(
                "split_leave_n_out: frac (fraction of interactions in the test set) "
                "must not be greater than 1."
            )

        # Sample the test interactions independently for every user.
        grouped = data.dataset.groupby(["userId"])
        if frac is not None:
            test = grouped.sample(frac=frac, random_state=random_state)
        else:
            test = grouped.sample(n=n, random_state=random_state)
        test = test.reset_index(drop=True)

        # Anti-join: train is every row of the dataset that was not sampled.
        train_pd = pd.merge(
            data.dataset,
            test,
            on=list(data.dataset.columns),
            how="outer",
            indicator=True,
        )
        train_pd = train_pd[train_pd["_merge"] == "left_only"]
        train_pd = train_pd.drop(columns="_merge")

        train = copy.deepcopy(data)
        train.dataset = train_pd
        train = fix_data_reader_mappings(data, train)

        # Sanity check: train and test partition the dataset.
        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]
        return train, test

    def rel_plus_n(
        self,
        data,
        negative_sample_size: int = 99,
        splitting: str = "latest",
        n: int = 1,
    ):
        """
        RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
        rated items. Then a set of random items with unknown relevance ($NR_u$), is extracted for each user $u$, where $u$
        had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests a ranking
        of the top-$N$ items from the set $\\{i\\} \\cup NR_u$, on which the evaluation is performed. The evaluation metrics
        are averaged over all the items in $HR_u$ and later over all the users.

        Ref:
        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010. Performance of Recommender Algorithms on Top-n
        Recommendation Tasks. In Proceedings of the Fourth ACM Conference on Recommender Systems (RecSys 10).
        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
        Filtering. In Proceedings of the 26th International Conference on World Wide Web (WWW 17).

        :param data: DataReader with the full dataset.
        :param negative_sample_size: how many negative items to sample per user
        :param splitting: either "latest" for leave n latest out, or "n" for leave n out
        :param n: how many interactions to leave out per user
        :returns train as DataReader, and the test interactions concatenated with the sampled negatives as dataframe
        """
        if splitting == "latest":
            train, test = self.split_leave_latest_out(data, n)
        elif splitting == "n":
            train, test = self.split_leave_n_out(data, n)
        else:
            sys.exit('splitting can be either "latest" or "n". ')
        neg_sample = self.sample_negative(data, negative_sample_size)
        return train, pd.concat([test, neg_sample], ignore_index=True)

    @staticmethod
    def sample_negative(data, negative_sample_size):
        """
        Sample, for every user, items the user has never interacted with.

        :param data: DataReader with the full dataset.
        :param negative_sample_size: number of negative items to draw per user
        :returns dataframe, columns = ['userId', 'itemId'], with
            negative_sample_size rows per user

        Raises ValueError (from ``random.sample``) when a user has fewer
        than ``negative_sample_size`` items they have not interacted with.
        """
        item_catalogue = set(data.dataset["itemId"])
        interacted_items = data.dataset.groupby("userId")["itemId"].apply(set)

        user_ids = []
        item_ids = []
        for user_id, seen_items in interacted_items.items():
            # random.sample() needs a sequence: passing a set was deprecated
            # in Python 3.9 and raises TypeError from 3.11 on.
            negative_items = tuple(item_catalogue - seen_items)
            sampled = random.sample(negative_items, negative_sample_size)
            user_ids.extend([int(user_id)] * negative_sample_size)
            item_ids.extend(int(item_id) for item_id in sampled)
        return pd.DataFrame.from_dict({"userId": user_ids, "itemId": item_ids})