170 lines
6.5 KiB
Python
170 lines
6.5 KiB
Python
import sys
|
||
import random
|
||
import pandas as pd
|
||
import copy
|
||
|
||
from pygrex.data_reader.data_reader import DataReader
|
||
|
||
|
||
def fix_data_reader_mappings(source: DataReader, target: DataReader):
    """
    Mirror the size counters and ID mappings of ``source`` onto ``target``.

    The attributes are assigned by reference (no copy is made), so after the
    call both readers share the same mapping objects.

    :param source: reader whose counters and mappings are read.
    :param target: reader that is updated in place.
    :returns: ``target``, the same object that was passed in.
    """
    mirrored_attributes = (
        # user / item counters
        "_num_user",
        "_num_item",
        # original <-> new ID mappings
        "original_user_id",
        "original_item_id",
        "new_user_id",
        "new_item_id",
    )
    for attribute in mirrored_attributes:
        setattr(target, attribute, getattr(source, attribute))
    return target
|
||
|
||
|
||
class Splitter:
    """
    Collection of train/test splitting strategies for recommender data.

    The methods take a DataReader object, which contains in its ``dataset``
    attribute 4 columns = ['userId', 'itemId', 'rating', 'timestamp'].
    """

    def __init__(self):
        pass

    @staticmethod
    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
        """
        Leave N latest interactions out train/test split.

        Ref:
        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.

        :param data: DataReader whose ``dataset`` holds the interactions.
        :param n_latest: int, number of latest interactions per user to be in the test set.
        :returns: tuple ``(train, test)``; train is a DataReader, test is a DataFrame.
        """
        # Group interactions by user id and rank them by timestamp, newest
        # first. method="first" breaks timestamp ties by row order, so every
        # user gets the distinct ranks 1, 2, 3, ...
        rank_latest = data.dataset.groupby("userId")["timestamp"].rank(
            method="first", ascending=False
        )

        # The n_latest newest interactions of each user form the test set ...
        test = data.dataset[rank_latest <= n_latest]
        # ... and everything older stays in train.
        train = DataReader(dataframe=data.dataset.copy())
        train.dataset = data.dataset[rank_latest > n_latest]

        # Re-apply the source reader's counters and ID mappings to the new reader.
        train = fix_data_reader_mappings(data, train)

        return train, test

    @staticmethod
    def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
        """
        Leave N random interactions out train/test split.

        For every user either ``n`` interactions or a fraction ``frac`` of the
        user's interactions is drawn at random (without replacement) into the
        test set; all remaining interactions form the train set.

        Ref:
        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
        Boston, MA, 2011. 257-297.

        :param data: DataReader whose ``dataset`` holds the interactions.
        :param n: int, number of interactions per user to be in the test set (used when ``frac`` is None).
        :param frac: float, fraction of each user's interactions to be in the test set; takes precedence over ``n``.
        :returns: tuple ``(train, test)``; train is a deep copy of ``data`` with its dataset replaced,
            test is a DataFrame with a fresh 0..k-1 index.
        :raises SystemExit: if some user has fewer than ``n`` interactions or if ``frac`` is greater than 1.
        """
        min_nr_ratings_user = min(data.dataset["userId"].value_counts())

        if min_nr_ratings_user < n:
            sys.exit(
                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
                "in the test set)."
            )

        if frac is not None and frac > 1:
            sys.exit("f (i.e.) fraction should be smaller than 1.")

        interactions = data.dataset
        # Sample on a positionally indexed view: the index of the sampled rows
        # is then the row position in the dataset, which identifies each drawn
        # row unambiguously even when rows (or index labels) are duplicated.
        grouped = interactions.reset_index(drop=True).groupby("userId")
        if frac is not None:
            sampled = grouped.sample(frac=frac)
        else:
            sampled = grouped.sample(n=n)

        # Boolean mask over row positions: True where the row went to test.
        held_out = pd.RangeIndex(len(interactions)).isin(sampled.index)
        train_pd = interactions[~held_out]
        test = sampled.reset_index(drop=True)

        train = copy.deepcopy(data)
        train.dataset = train_pd
        train = fix_data_reader_mappings(data, train)
        # Sanity check: every interaction is in exactly one of the two sets.
        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]

        return train, test

    def rel_plus_n(
        self,
        data,
        negative_sample_size: int = 99,
        splitting: str = "latest",
        n: int = 1,
    ):
        r"""
        RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
        rated items. Then a set of random items with unknown relevance ($NR_u$), is extracted for each user $u$, where $u$
        had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests a ranking
        of the top-$N$ items from the set $\{i\} \cup NR_u$, on which the evaluation is performed. The evaluation metrics
        are averaged over all the items in $HR_u$ and later over all the users. In the following, all experiments have been
        conducted according to this protocol.

        Ref:
        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010. Performance of Recommender Algorithms on Top-n
        Recommendation Tasks. In Proceedings of the Fourth ACM Conference on Recommender Systems (RecSys '10).
        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
        Filtering. In Proceedings of the 26th International Conference on World Wide Web (WWW '17).

        :param data: DataReader whose ``dataset`` holds the interactions.
        :param negative_sample_size: how many negative items to sample per user.
        :param splitting: either "latest" for leave n latest out, or "n" for leave n (random) out.
        :param n: how many interactions per user to leave out.
        :returns: tuple ``(train, test)``; test is the held-out rows followed by the sampled negative rows.
            The negative rows carry only 'userId' and 'itemId', so any other column is NaN for them.
        :raises SystemExit: if ``splitting`` is neither "latest" nor "n".
        """
        if splitting == "latest":
            train, test = self.split_leave_latest_out(data, n)
        elif splitting == "n":
            train, test = self.split_leave_n_out(data, n)
        else:
            sys.exit('splitting can be either "latest" or "n". ')

        # Negatives are drawn against the full dataset, so held-out test items
        # of a user are never sampled as negatives for that user.
        neg_sample = self.sample_negative(data, negative_sample_size)

        return train, pd.concat([test, neg_sample], ignore_index=True)

    @staticmethod
    def sample_negative(data, negative_sample_size):
        """
        Sample, for every user, items the user has not interacted with.

        :param data: object whose ``dataset`` DataFrame has 'userId' and 'itemId' columns.
        :param negative_sample_size: number of negative items to draw per user (without replacement).
        :returns: DataFrame with columns 'userId' and 'itemId', holding
            ``negative_sample_size`` rows per user, users in ascending order.
        :raises ValueError: if a user has fewer than ``negative_sample_size`` non-interacted items.
        """
        item_catalogue = set(data.dataset["itemId"])
        interacted_by_user = data.dataset.groupby("userId")["itemId"].apply(set)

        user_ids = []
        item_ids = []
        for user, interacted_items in interacted_by_user.items():
            # random.sample() needs a sequence (sets are rejected since
            # Python 3.11); sorting also makes a seeded run reproducible.
            candidates = sorted(item_catalogue - interacted_items)
            if len(candidates) < negative_sample_size:
                raise ValueError(
                    f"sample_negative: user {user} has only {len(candidates)} "
                    f"non-interacted items, cannot sample {negative_sample_size}."
                )
            for item in random.sample(candidates, negative_sample_size):
                user_ids.append(int(user))
                item_ids.append(int(item))

        return pd.DataFrame.from_dict({"userId": user_ids, "itemId": item_ids})
|