public code v1

This commit is contained in:
2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
+169
View File
@@ -0,0 +1,169 @@
import sys
import random
import pandas as pd
import copy
from pygrex.data_reader.data_reader import DataReader
def fix_data_reader_mappings(source: DataReader, target: DataReader):
    """
    Copy the user/item bookkeeping of ``source`` onto ``target``.

    The user and item counts as well as the original/new ID mapping
    attributes are re-bound on ``target`` to the very same objects that
    ``source`` holds (no copies are made).

    :param source: reader whose counts and ID mappings are read.
    :param target: reader that receives them; it is modified in place.
    :returns: the same ``target`` object that was passed in.
    """
    # Order matters only for parity with a field-by-field assignment:
    # counts first, then the original-ID mappings, then the new-ID mappings.
    carried_attributes = (
        "_num_user",
        "_num_item",
        "original_user_id",
        "original_item_id",
        "new_user_id",
        "new_item_id",
    )
    for attribute in carried_attributes:
        setattr(target, attribute, getattr(source, attribute))
    return target
class Splitter:
    """
    Collection of train/test splitting strategies.

    Every method receives a DataReader-like object whose ``dataset``
    attribute is a DataFrame with the columns
    ['userId', 'itemId', 'rating', 'timestamp'].
    """

    def __init__(self):
        pass

    @staticmethod
    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
        """
        Leave the N latest interactions of every user out of the train set.

        Ref:
        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.

        :param data: reader holding the full interaction dataset.
        :param n_latest: int, number of latest interactions per user to put in the test set.
        :returns: tuple ``(train, test)``; train is a DataReader, test is a DataFrame.
        """
        # Rank each user's interactions by timestamp, newest first (rank 1 is
        # the most recent); ties are broken by order of appearance.
        rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
            method="first", ascending=False
        )
        # The n_latest newest interactions of every user form the test set.
        test = data.dataset[rank_latest <= n_latest]
        # Everything older goes to train.
        train = DataReader(dataframe=data.dataset.copy())
        train.dataset = data.dataset[rank_latest > n_latest]
        # Carry the counts and ID mappings of the full dataset over to train.
        train = fix_data_reader_mappings(data, train)
        return train, test

    @staticmethod
    def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
        """
        Leave N randomly chosen interactions of every user out of the train set.

        Ref:
        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
        Boston, MA, 2011. 257-297.

        :param data: reader holding the full interaction dataset.
        :param n: int, number of interactions per user to put in the test set
            (used when ``frac`` is None).
        :param frac: float, fraction of each user's interactions to put in the
            test set; takes precedence over ``n`` when given.
        :returns: tuple ``(train, test)``; train is a deep copy of ``data``
            holding the remaining rows, test is a DataFrame with a fresh index.
        :raises SystemExit: if some user has fewer than ``n`` ratings or if
            ``frac`` is larger than 1.
        """
        min_nr_ratings_user = min(data.dataset["userId"].value_counts())
        if min_nr_ratings_user < n:
            sys.exit(
                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
                "in the test set)."
            )
        if frac is not None and frac > 1:
            sys.exit("f (i.e.) fraction should be smaller than 1.")

        # Re-index positionally so that sampled rows can be removed by label
        # even when the caller's index contains duplicates. Removing by
        # position (instead of an anti-join on all columns) also keeps the
        # split correct when the dataset contains fully duplicated rows.
        frame = data.dataset.reset_index(drop=True)

        # Draw a random subset of interactions for every user.
        grouped = frame.groupby(["userId"])
        if frac is not None:
            sampled = grouped.sample(frac=frac)
        else:
            sampled = grouped.sample(n=n)

        # Train is every row that was not sampled, in the original row order.
        train_pd = frame.drop(index=sampled.index)
        test = sampled.reset_index(drop=True)

        train = copy.deepcopy(data)
        train.dataset = train_pd
        train = fix_data_reader_mappings(data, train)
        # Internal sanity check: the split must partition the dataset.
        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]
        return train, test

    def rel_plus_n(
        self,
        data,
        negative_sample_size: int = 99,
        splitting: str = "latest",
        n: int = 1,
    ):
        """
        RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
        rated items. Then a set of random items with unknown relevance ($NR_u$) is extracted for each user $u$, where
        $u$ had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests
        a ranking of the top-$N$ items from the set {i} ∪ $NR_u$, on which the evaluation is performed. The evaluation
        metrics are averaged over all the items in $HR_u$ and later over all the users.

        Ref:
        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010. Performance of Recommender Algorithms on Top-n
        Recommendation Tasks. In Proceedings of the Fourth ACM Conference on Recommender Systems (RecSys 10).
        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
        Filtering. In Proceedings of the 26th International Conference on World Wide Web (WWW 17).

        :param data: reader holding the full interaction dataset.
        :param negative_sample_size: how many negative items to sample per user.
        :param splitting: either "latest" for leave-n-latest-out, or "n" for leave-n-out.
        :param n: how many interactions per user to leave out.
        :returns: tuple ``(train, test)``; test is the held-out rows followed
            by the sampled negative (userId, itemId) rows, which carry no
            values in the remaining columns.
        :raises SystemExit: if ``splitting`` is neither "latest" nor "n".
        """
        if splitting == "latest":
            train, test = self.split_leave_latest_out(data, n)
        elif splitting == "n":
            train, test = self.split_leave_n_out(data, n)
        else:
            sys.exit('splitting can be either "latest" or "n". ')
        # Negatives are drawn against the full dataset, so an item a user
        # interacted with in either train or test is never a negative.
        neg_sample = self.sample_negative(data, negative_sample_size)
        return train, pd.concat([test, neg_sample], ignore_index=True)

    @staticmethod
    def sample_negative(data, negative_sample_size):
        """
        Sample, for every user, items the user has never interacted with.

        :param data: object whose ``dataset`` DataFrame has 'userId' and 'itemId' columns.
        :param negative_sample_size: number of negative items to draw per user.
        :returns: DataFrame with columns ['userId', 'itemId'] containing
            ``negative_sample_size`` rows per user, users in sorted order.
        :raises ValueError: if a user has fewer than ``negative_sample_size``
            non-interacted items (or the size is negative).
        """
        # random.sample() no longer accepts sets (TypeError since Python 3.11),
        # so candidates are kept as lists. Sorting the catalogue once makes the
        # draw reproducible under random.seed().
        item_catalogue = sorted(set(data.dataset["itemId"]))
        interacted_by_user = data.dataset.groupby("userId")["itemId"].apply(set)

        user_ids = []
        item_ids = []
        for user, interacted_items in interacted_by_user.items():
            negative_items = [
                item for item in item_catalogue if item not in interacted_items
            ]
            if negative_sample_size > len(negative_items):
                raise ValueError(
                    f"sample_negative: user {user} has only {len(negative_items)} "
                    f"non-interacted items, cannot sample {negative_sample_size}."
                )
            negative_samples = random.sample(negative_items, negative_sample_size)
            user_ids.extend([int(user)] * negative_sample_size)
            item_ids.extend(int(item) for item in negative_samples)
        return pd.DataFrame.from_dict({"userId": user_ids, "itemId": item_ids})