public code v1
This commit is contained in:
@@ -0,0 +1,169 @@
|
||||
import sys
|
||||
import random
|
||||
import pandas as pd
|
||||
import copy
|
||||
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
|
||||
|
||||
def fix_data_reader_mappings(source: DataReader, target: DataReader):
    """
    Mirror the user/item bookkeeping of ``source`` onto ``target``.

    The user and item counts and the four ID-mapping attributes are assigned
    by reference (nothing is copied). ``target`` is modified in place.

    :param source: reader whose counts and ID mappings are read.
    :param target: reader whose counts and ID mappings are overwritten.
    :returns: the same ``target`` object that was passed in.
    """
    mirrored_attributes = (
        # user / item counts
        "_num_user",
        "_num_item",
        # original <-> new ID mappings
        "original_user_id",
        "original_item_id",
        "new_user_id",
        "new_item_id",
    )
    for attribute in mirrored_attributes:
        setattr(target, attribute, getattr(source, attribute))
    return target
|
||||
|
||||
|
||||
class Splitter:
    """
    Super Splitting Class: a collection of train/test splitting strategies.

    args:
        data: DataReader object, which contains in its dataset attribute 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
    """

    def __init__(self):
        pass

    @staticmethod
    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
        """
        Leave the N latest interactions of every user out as the test set.

        Ref:
        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.

        :param data: DataReader whose ``dataset`` holds the interactions.
        :param n_latest: int, number of latest interactions per user to be in the test set.
        :returns: train as DataReader, test as DataFrame
        """

        # Rank each user's interactions by timestamp, most recent first.
        # method="first" breaks timestamp ties by row order, so ranks within
        # a user are unique and exactly n_latest rows land in the test set.
        rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
            method="first", ascending=False
        )

        # the n_latest most recent interactions of every user form the test set
        test = data.dataset[rank_latest <= n_latest]

        # Everything else is the train set. The reader is built from the full
        # data and then narrowed; the counts and ID mappings of the original
        # reader are restored afterwards.
        train = DataReader(dataframe=data.dataset.copy())
        train.dataset = data.dataset[rank_latest > n_latest]
        train = fix_data_reader_mappings(data, train)

        return train, test

    @staticmethod
    def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
        """
        Leave N randomly chosen interactions of every user out as the test set.

        Ref:
        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
        Boston, MA, 2011. 257-297.

        :param data: DataReader whose ``dataset`` holds the interactions.
        :param n: int, number of interactions per user to be in the test set.
        :param frac: float, fraction of each user's interactions to be in the
            test set; when given it is used instead of ``n`` for sampling.
        :returns: train as DataReader, test as DataFrame
        :raises SystemExit: if some user has fewer than ``n`` interactions or
            ``frac`` is larger than 1.
        """
        min_nr_ratings_user = min(data.dataset["userId"].value_counts())

        if min_nr_ratings_user < n:
            sys.exit(
                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
                "in the test set)."
            )

        if frac is not None and frac > 1:
            sys.exit("frac (i.e. fraction) should not be larger than 1.")

        # group interactions by user id and draw a random subset per user
        grouped = data.dataset.groupby(["userId"])
        if frac is not None:
            test = grouped.sample(frac=frac)
        else:
            test = grouped.sample(n=n)

        test = test.reset_index(drop=True)

        # Anti-join: an outer merge on all columns with an indicator marks the
        # rows that only exist in the full data, i.e. were not sampled.
        train_pd = pd.merge(
            data.dataset,
            test,
            on=list(data.dataset.columns),
            how="outer",
            indicator=True,
        )
        train_pd = train_pd[train_pd["_merge"] == "left_only"]
        train_pd = train_pd.drop(columns="_merge")

        train = copy.deepcopy(data)
        train.dataset = train_pd
        train = fix_data_reader_mappings(data, train)

        # sanity check: train and test together partition the original data
        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]

        return train, test

    def rel_plus_n(
        self,
        data,
        negative_sample_size: int = 99,
        splitting: str = "latest",
        n: int = 1,
    ):
        """
        RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
        rated items. Then a set of random items with unknown relevance ($NR_u$), is extracted for each user $u$, where $u$
        had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests a ranking
        of the top-$N$ items from the set ${i} ∪ NR_u$, on which the evaluation is performed. The evaluation metrics
        are averaged over all the items in $HR_u$ and later over all the users. In the following, all experiments have been
        conducted according to this protocol.

        Ref:
        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010. Performance of Recommender Algorithms on Top-n
        Recommendation Tasks. In Proceedings of the Fourth ACM Conference on Recommender Systems (RecSys ’10).
        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
        Filtering. In Proceedings of the 26th International Conference on World Wide Web (WWW ’17).

        :param data: DataReader whose ``dataset`` holds the interactions.
        :param negative_sample_size: how many negative items to sample per user
        :param splitting: either "latest" for leave n latest out, or "n" for leave n out
        :param n: how many interactions per user to leave out
        :returns: train as DataReader, and a DataFrame with the held-out
            interactions followed by the sampled negative (userId, itemId) pairs
        :raises SystemExit: if ``splitting`` is neither "latest" nor "n".
        """

        if splitting == "latest":
            train, test = self.split_leave_latest_out(data, n)
        elif splitting == "n":
            train, test = self.split_leave_n_out(data, n)
        else:
            sys.exit('splitting can be either "latest" or "n". ')

        # negatives are drawn against the full data, so they exclude every
        # item the user interacted with in train or test
        neg_sample = self.sample_negative(data, negative_sample_size)

        return train, pd.concat([test, neg_sample], ignore_index=True)

    @staticmethod
    def sample_negative(data, negative_sample_size):
        """
        Sample, for every user, items the user has not interacted with.

        :param data: object whose ``dataset`` DataFrame has 'userId' and 'itemId' columns.
        :param negative_sample_size: number of negative items to draw per user.
        :returns: DataFrame with columns 'userId' and 'itemId' holding
            ``negative_sample_size`` rows per user.
        :raises ValueError: (from ``random.sample``) if a user has fewer than
            ``negative_sample_size`` items they have not interacted with.
        """

        # Sort the catalogue once: random.sample() needs a sequence (sets are
        # rejected since Python 3.11), and a fixed order keeps the draw
        # reproducible under random.seed().
        item_catalogue = sorted(set(data.dataset["itemId"]))

        user_ids = []
        item_ids = []
        for user, items in data.dataset.groupby("userId")["itemId"]:
            interacted_items = set(items)
            negative_items = [
                item for item in item_catalogue if item not in interacted_items
            ]
            for item in random.sample(negative_items, negative_sample_size):
                user_ids.append(int(user))
                item_ids.append(int(item))

        return pd.DataFrame.from_dict({"userId": user_ids, "itemId": item_ids})
|
||||
Reference in New Issue
Block a user