public code v1
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
# Re-export the recommender models so callers can import them directly
# from the package.
from .als_model import ALS
from .bpr_model import BPR
from .gmf_model import GMFModel
from .emf_model import EMFModel
from .autoencoder_model import ExplAutoencoderTorch
from .mlp_model import MLPModel
# PyTorchModel lives in py_torch_model; import it from its defining module
# instead of relying on emf_model happening to import it.
from .py_torch_model import PyTorchModel
from .knn_basic_model import KNNBasic
from .svd_model import SVD
from .recommender_model import RecommenderModel

# Names exported by a star-import of this package.
__all__ = [
    "ALS",
    "BPR",
    "GMFModel",
    "EMFModel",
    "PyTorchModel",
    "MLPModel",
    "ExplAutoencoderTorch",
    "KNNBasic",
    "SVD",
    "RecommenderModel",
]
|
||||
@@ -0,0 +1,31 @@
|
||||
import implicit
|
||||
|
||||
from .mf_implicit_model import MFImplicitModel
|
||||
|
||||
|
||||
class ALS(MFImplicitModel):
    """Alternating-least-squares factorisation backed by the ``implicit`` library.

    The hyper-parameters are handed to ``MFImplicitModel`` and the wrapped
    ``implicit.als.AlternatingLeastSquares`` estimator is stored on
    ``self.model``.
    """

    def __init__(
        self,
        latent_dim,
        reg_term,
        epochs,
        random_state=42,
        num_users=None,
        num_items=None,
        **kwargs,
    ):
        """Configure the base model and build the ALS estimator.

        Args:
            latent_dim: Number of latent factors.
            reg_term: Regularisation strength.
            epochs: Number of ALS iterations.
            random_state: Seed passed to the estimator.
            num_users: Forwarded unchanged to ``MFImplicitModel``.
            num_items: Forwarded unchanged to ``MFImplicitModel``.
            **kwargs: Accepted and ignored.
        """
        # No learning rate is supplied for ALS, so the base class receives None.
        base_settings = {
            "latent_dim": latent_dim,
            "reg_term": reg_term,
            "epochs": epochs,
            "learning_rate": None,
            "num_users": num_users,
            "num_items": num_items,
        }
        super().__init__(**base_settings)

        # Read the values back from the instance, as stored by the base class.
        estimator_cls = implicit.als.AlternatingLeastSquares
        self.model = estimator_cls(
            factors=self.latent_dim,
            regularization=self.reg_term,
            iterations=self.epochs,
            random_state=random_state,
        )
|
||||
@@ -0,0 +1,223 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim
|
||||
from scipy import sparse
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from typing import Optional, Union, List
|
||||
|
||||
from pygrex.utils.torch_utils import use_cuda, use_optimizer
|
||||
from pygrex.data_reader import UserItemDict, DataReader
|
||||
from .recommender_model import RecommenderModel
|
||||
|
||||
|
||||
class ExplAutoencoderTorch(RecommenderModel, nn.Module):
    """Single-hidden-layer autoencoder recommender with neighbour-based explainability.

    ``fit`` sizes an encoder/decoder pair of linear layers from the data,
    builds a user-by-item explainability matrix from the ratings of each
    user's nearest neighbours, and trains the network with an MSE
    reconstruction loss on the tensors served by ``UserItemDict``.
    """

    def __init__(
        self,
        hidden_layer_features: int,
        learning_rate: float,
        positive_threshold: float,
        weight_decay: float,
        epochs: int,
        knn: int,
        cuda: bool,
        optimizer_name: str,
        expl: bool,
        device_id: Optional[int] = None,
    ):
        """Store the hyper-parameters; the layers themselves are built in ``fit``.

        Args:
            hidden_layer_features: Width of the hidden (code) layer.
            learning_rate: Learning rate handed to ``use_optimizer``.
            positive_threshold: Minimum rating for a neighbour's rating to
                count towards the explainability matrix.
            weight_decay: Weight decay handed to ``use_optimizer``.
            epochs: Number of training epochs.
            knn: Number of nearest neighbours kept per user.
            cuda: Whether tensors and layers are moved to the GPU.
            optimizer_name: One of ``"sgd"``, ``"adam"`` or ``"rmsprop"``.
            expl: Flag forwarded to ``UserItemDict``.
            device_id: CUDA device index passed to ``use_cuda``; 0 when None.

        Raises:
            ValueError: If ``optimizer_name`` is not a supported name.
        """
        super().__init__()
        if optimizer_name not in ["sgd", "adam", "rmsprop"]:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Wrong optimizer.")
        if cuda:
            use_cuda(True, device_id if device_id is not None else 0)

        self.positive_threshold = positive_threshold
        self.weight_decay = weight_decay
        self.knn = knn
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.use_gpu = cuda
        self.optimizer_name = optimizer_name
        self.hidden_layer_features = hidden_layer_features
        self.expl = expl

        self.dataset = None
        self.data = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer: Optional[torch.optim.Optimizer] = None
        # Initialised here so that predict() before fit() hits its
        # "not fitted" assertion instead of an AttributeError.
        self.user_item_dict = None

        self.explainability_matrix = None
        self.sim_users = {}

        self.criterion = nn.MSELoss()

    def fit(self, data: DataReader):
        """Build the network for ``data`` and train it for ``self.epochs`` epochs.

        Args:
            data: Source of ``dataset``, ``num_user`` and ``num_item``.
        """
        self.data = data
        self.dataset = data.dataset
        num_items = self.data.num_item

        self.encoder_hidden_layer = nn.Linear(
            in_features=num_items, out_features=self.hidden_layer_features
        )
        self.decoder_output_layer = nn.Linear(
            in_features=self.hidden_layer_features, out_features=num_items
        )
        if self.use_gpu:
            # Inputs are moved to the GPU in train_single_user/predict, so the
            # freshly created layers must live there too. Done before the
            # optimizer is constructed, as recommended by torch.optim.
            self.cuda()

        self.compute_explainability()
        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )

        assert isinstance(optimizer, torch.optim.Optimizer)
        self.optimizer = optimizer

        with tqdm(total=self.epochs) as progress:
            train_loader = self.instance_a_train_loader()
            for _ in range(self.epochs):
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def compute_explainability(self):
        """Fill ``self.sim_users`` and ``self.explainability_matrix``.

        For every user the ``knn`` most cosine-similar other users are found;
        the explainability score of an item is the sum of those neighbours'
        ratings that reach ``positive_threshold``. The matrix is min-max
        scaled and stored as a torch tensor.
        """
        assert self.dataset is not None
        assert self.data is not None
        ratings_table = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_table))
        # Strictly below every similarity, so a user never ranks as their own neighbour.
        min_val = sim_matrix.min() - 1

        # Start from a clean mapping so a refit cannot keep stale neighbours.
        self.sim_users = {}
        for i in range(self.data.num_user):
            sim_matrix[i, i] = min_val
            self.sim_users[i] = (-sim_matrix[i, :]).argsort()[: self.knn]

        explainability = np.zeros((self.data.num_user, self.data.num_item))
        positive = self.dataset[self.dataset["rating"] >= self.positive_threshold]

        for i in range(self.data.num_user):
            # NOTE(review): neighbour indices are row positions of the pivot
            # table; this assumes userId values equal those positions - confirm.
            by_neighbours = positive[positive["userId"].isin(self.sim_users[i])]
            sim_scores = by_neighbours.groupby(by="itemId")["rating"].sum().reset_index()
            # Cast to int so the ids are valid array indices even when the
            # column is stored as float (same as the EMF models do).
            explainability[i, sim_scores.itemId.astype(int)] = (
                sim_scores.rating.to_list()
            )

        explainability = MinMaxScaler().fit_transform(explainability)
        self.explainability_matrix = torch.from_numpy(explainability)

    def instance_a_train_loader(self):
        """Build the ``UserItemDict`` dataset and return a shuffling ``DataLoader`` over it."""
        assert self.dataset is not None
        assert self.explainability_matrix is not None
        self.user_item_dict = UserItemDict(
            self.dataset, self.explainability_matrix, self.expl
        )
        return DataLoader(self.user_item_dict, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Returns 0.0 when the loader yields no batches.
        """
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.Tensor)
            rating = batch[0].float()
            total_loss += self.train_single_user(rating)
            cnt += 1
        if cnt == 0:
            # Nothing to average; avoid a ZeroDivisionError on empty data.
            return 0.0
        return total_loss / cnt

    def train_single_user(self, ratings):
        """Take one optimisation step on ``ratings`` and return the loss as a float."""
        if self.use_gpu:
            ratings = ratings.cuda()

        assert self.optimizer is not None
        self.optimizer.zero_grad()
        ratings_pred = self(ratings)
        loss = self.criterion(ratings_pred, ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def forward(self, user_adjusted_ratings):
        """Encode and decode a ratings tensor, applying ReLU after each layer."""
        code = torch.relu(self.encoder_hidden_layer(user_adjusted_ratings))
        reconstructed_ratings = torch.relu(self.decoder_output_layer(code))
        return reconstructed_ratings

    def predict(
        self, user_id: Union[int, List[int], str], item_id: Union[int, List[int], str]
    ) -> Union[float, list]:
        """Return reconstructed ratings for the given user(s) and item(s).

        Args:
            user_id: A user id, a numeric string, or a list of ids.
            item_id: An item id, a numeric string, or a list of ids.

        Returns:
            A single value for one user and one item, a flat list for one
            user and several items, and a list of per-user lists otherwise.

        Raises:
            ValueError: If an id cannot be converted to ``int``.
        """
        try:
            if isinstance(user_id, str):
                user_id = int(user_id)
            elif isinstance(user_id, list):
                user_id = [int(u) for u in user_id]
            if isinstance(item_id, str):
                item_id = int(item_id)
            elif isinstance(item_id, list):
                item_id = [int(i) for i in item_id]
        except (ValueError, TypeError):
            raise ValueError(
                "User and item IDs must be integers or strings that can be converted to integers."
            )

        single_user = isinstance(user_id, int)
        single_item = isinstance(item_id, int)

        if single_user:
            user_id = [user_id]
        if single_item:
            item_id = [item_id]

        with torch.no_grad():
            assert self.user_item_dict is not None, "The model has not been fitted yet."

            # One input row per requested user.
            rating = torch.stack([self.user_item_dict[uid] for uid in user_id])
            rating = rating.float()
            if self.use_gpu:
                rating = rating.cuda()
            pred = self.forward(rating).cpu()
            predictions = pred[:, item_id].tolist()

        # Unwrap the nesting that the caller did not ask for.
        if single_user and single_item:
            return (
                predictions[0][0]
                if isinstance(predictions[0], list)
                else predictions[0]
            )
        elif single_user:
            return predictions[0]
        return predictions
|
||||
@@ -0,0 +1,25 @@
|
||||
import implicit
|
||||
|
||||
from .mf_implicit_model import MFImplicitModel
|
||||
|
||||
|
||||
class BPR(MFImplicitModel):
    """Bayesian Personalized Ranking factorisation backed by the ``implicit`` library.

    The hyper-parameters are handed to ``MFImplicitModel`` and the wrapped
    ``implicit.bpr.BayesianPersonalizedRanking`` estimator is stored on
    ``self.model``.
    """

    def __init__(self, latent_dim, reg_term, learning_rate, epochs, **kwargs):
        """Configure the base model and build the BPR estimator.

        Args:
            latent_dim: Number of latent factors.
            reg_term: Regularisation strength.
            learning_rate: Learning rate of the estimator.
            epochs: Number of training iterations.
            **kwargs: Accepted and ignored.
        """
        super().__init__(
            latent_dim=latent_dim,
            reg_term=reg_term,
            learning_rate=learning_rate,
            epochs=epochs,
        )

        # Read the values back from the instance, as stored by the base class.
        estimator_cls = implicit.bpr.BayesianPersonalizedRanking
        self.model = estimator_cls(
            factors=self.latent_dim,
            learning_rate=self.learning_rate,
            regularization=self.reg_term,
            iterations=self.epochs,
        )
|
||||
@@ -0,0 +1,391 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from scipy import sparse
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from typing import Union
|
||||
|
||||
from pygrex.data_reader import UserItemRatingDataset, DataReader
|
||||
from pygrex.utils import EMFLoss
|
||||
from .py_torch_model import PyTorchModel
|
||||
from .recommender_model import RecommenderModel
|
||||
|
||||
|
||||
class EMFModel(RecommenderModel):
    """Explainable matrix factorisation trained with per-rating NumPy updates.

    User and item factors are NumPy arrays. Each observed rating triggers one
    update that combines the prediction error, an L2 penalty and a term
    weighted by the neighbour-based explainability matrix.
    """

    def __init__(
        self,
        learning_rate: float,
        reg_term: float,
        expl_reg_term: float,
        positive_threshold: float,
        latent_dim: int,
        epochs: int,
        knn: int,
    ):
        """Store the hyper-parameters; factors are created in ``fit``.

        Args:
            learning_rate: Step size of every update.
            reg_term: Weight of the L2 penalty on the factors.
            expl_reg_term: Weight of the explainability term.
            positive_threshold: Minimum rating for a neighbour's rating to
                count towards the explainability matrix.
            latent_dim: Number of latent factors.
            epochs: Number of passes over the ratings.
            knn: Number of nearest neighbours kept per user.
        """
        # Initialise the base class like the sibling models do.
        super().__init__()

        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs

        self.dataset = None
        self.data = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer = None

        self.reg_term = reg_term
        self.expl_reg_term = expl_reg_term
        self.positive_threshold = positive_threshold
        self.knn = knn

        self.explainability_matrix = None
        self.sim_users = {}

        # Not used by the NumPy training loop in this class; kept so the
        # attributes remain available to existing callers.
        self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)
        self.criterion = EMFLoss()

    def fit(self, data: DataReader) -> None:
        """Initialise the factors and run ``self.epochs`` passes over the ratings.

        Args:
            data: Source of ``dataset``, ``num_user`` and ``num_item``.
        """
        self.data = data
        self.dataset = data.dataset

        assert self.data is not None
        num_users = self.data.num_user
        num_items = self.data.num_item

        init_high = 0.5 / self.latent_dim
        self.embedding_user = np.random.uniform(
            low=0, high=init_high, size=(num_users, self.latent_dim)
        )
        self.embedding_item = np.random.uniform(
            low=0, high=init_high, size=(num_items, self.latent_dim)
        )

        self.compute_explainability()

        assert self.dataset is not None
        assert self.embedding_user is not None
        assert self.embedding_item is not None
        assert self.explainability_matrix is not None

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                # Visit the ratings in a fresh random order every epoch.
                self.dataset = self.dataset.sample(frac=1)
                squared_errors = []
                # itertuples yields the same values as iterrows without
                # building a Series per row.
                for row in self.dataset.itertuples(index=False):
                    user_id = int(row.userId)
                    item_id = int(row.itemId)

                    e_ui = row.rating - self.predict(user_id, item_id)
                    squared_errors.append(e_ui**2)

                    expl_weight = (
                        self.expl_reg_term
                        * self.explainability_matrix[user_id, item_id]
                    )

                    # NOTE(review): the explainability term enters both
                    # deltas as -sign(other - own); confirm this is the
                    # intended direction for the EMF objective.
                    delta_u = 2 * e_ui * self.embedding_item[item_id, :]
                    delta_u -= self.reg_term * self.embedding_user[user_id, :]
                    delta_u -= expl_weight * np.sign(
                        self.embedding_item[item_id, :]
                        - self.embedding_user[user_id, :]
                    )

                    delta_v = 2 * e_ui * self.embedding_user[user_id, :]
                    delta_v -= self.reg_term * self.embedding_item[item_id, :]
                    delta_v -= expl_weight * np.sign(
                        self.embedding_user[user_id, :]
                        - self.embedding_item[item_id, :]
                    )

                    # Both deltas are computed before either factor is changed.
                    self.embedding_user[user_id, :] += self.learning_rate * delta_u
                    self.embedding_item[item_id, :] += self.learning_rate * delta_v

                progress.update(1)
                if squared_errors:
                    # Skip the report on an empty dataset instead of dividing by zero.
                    progress.set_postfix(
                        {"MSE": sum(squared_errors) / len(squared_errors)}
                    )

    def compute_explainability(self):
        """Fill ``self.sim_users`` and ``self.explainability_matrix``.

        For every user the ``knn`` most cosine-similar other users are found;
        the explainability score of an item is the sum of those neighbours'
        ratings that reach ``positive_threshold``. The matrix is min-max
        scaled and kept as a NumPy array.
        """
        assert self.dataset is not None
        assert self.data is not None
        ratings_table = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_table))
        # Strictly below every similarity, so a user never ranks as their own neighbour.
        min_val = sim_matrix.min() - 1

        # Start from a clean mapping so a refit cannot keep stale neighbours.
        self.sim_users = {}
        for i in range(self.data.num_user):
            sim_matrix[i, i] = min_val
            self.sim_users[i] = (-sim_matrix[i, :]).argsort()[: self.knn]

        explainability = np.zeros((self.data.num_user, self.data.num_item))
        positive = self.dataset[self.dataset["rating"] >= self.positive_threshold]

        for i in range(self.data.num_user):
            # NOTE(review): neighbour indices are row positions of the pivot
            # table; this assumes userId values equal those positions - confirm.
            by_neighbours = positive[positive["userId"].isin(self.sim_users[i])]
            sim_scores = by_neighbours.groupby(by="itemId")["rating"].sum().reset_index()
            explainability[i, sim_scores.itemId.astype(int)] = (
                sim_scores.rating.to_list()
            )

        self.explainability_matrix = MinMaxScaler().fit_transform(explainability)

    def predict(
        self,
        user_id: Union[int, str, list, np.ndarray],
        item_id: Union[int, str, list, np.ndarray],
    ) -> Union[float, list]:
        """Return the dot product of user and item factors.

        Args:
            user_id: One id or a list/array of ids (anything ``int()`` accepts).
            item_id: One id or a list/array of ids (anything ``int()`` accepts).

        Returns:
            A scalar when both arguments are single ids. Otherwise a list:
            flat when either side has exactly one id, nested per user when
            both sides have several.
        """
        user_id_processed = user_id
        item_id_processed = item_id

        if isinstance(user_id_processed, np.ndarray):
            user_id_processed = user_id_processed.tolist()
        if isinstance(item_id_processed, np.ndarray):
            item_id_processed = item_id_processed.tolist()

        assert self.embedding_user is not None
        assert self.embedding_item is not None

        users_are_list = isinstance(user_id_processed, list)
        items_are_list = isinstance(item_id_processed, list)

        if not (users_are_list or items_are_list):
            return np.dot(
                self.embedding_user[int(user_id), :],
                self.embedding_item[int(item_id), :],
            )

        user_id_list = user_id_processed if users_are_list else [user_id_processed]
        item_id_list = item_id_processed if items_are_list else [item_id_processed]

        predictions_np = np.array(
            [
                [
                    np.dot(
                        self.embedding_user[int(u), :], self.embedding_item[int(i), :]
                    )
                    for i in item_id_list
                ]
                for u in user_id_list
            ]
        )

        if len(user_id_list) == 1 or len(item_id_list) == 1:
            predictions_np = predictions_np.flatten()

        return predictions_np.tolist()

    def user_embedding(self):
        """Return the user factor matrix (``None`` before ``fit``)."""
        return self.embedding_user

    def item_embedding(self):
        """Return the item factor matrix (``None`` before ``fit``)."""
        return self.embedding_item
|
||||
|
||||
|
||||
class EMFTorchModel(PyTorchModel):
    """Explainable matrix factorisation trained with PyTorch mini-batches.

    User and item factors are ``nn.Embedding`` layers created in ``fit``. The
    loss is ``EMFLoss``, which receives the predictions, the factors and the
    neighbour-based explainability scores of the batch.
    """

    def __init__(
        self,
        learning_rate: float,
        reg_term: float,
        expl_reg_term: float,
        positive_threshold: float,
        momentum: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        knn: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        """Store the hyper-parameters; the embeddings are created in ``fit``.

        Args:
            learning_rate: Learning rate of the SGD optimizer.
            reg_term: Regularisation weight passed to the loss.
            expl_reg_term: Explainability weight passed to the loss.
            positive_threshold: Minimum rating for a neighbour's rating to
                count towards the explainability matrix.
            momentum: Momentum of the SGD optimizer.
            weight_decay: Weight decay of the SGD optimizer.
            latent_dim: Number of latent factors.
            epochs: Number of training epochs.
            batch_size: Mini-batch size of the training loader.
            knn: Number of nearest neighbours kept per user.
            cuda: Forwarded to ``PyTorchModel``.
            optimizer_name: Forwarded to ``PyTorchModel``; ``fit`` in this
                class always builds an SGD optimizer.
            device_id: Forwarded to ``PyTorchModel``.
        """
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )

        self.reg_term = reg_term
        self.expl_reg_term = expl_reg_term
        self.positive_threshold = positive_threshold
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.knn = knn

        self.explainability_matrix = None
        self.sim_users = {}

        self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)

        self.criterion = EMFLoss()

    def fit(self, data: DataReader) -> None:
        """Create the embeddings for ``data`` and train for ``self.epochs`` epochs.

        Args:
            data: Source of ``dataset``, ``num_user`` and ``num_item``.
        """
        self.data = data
        self.dataset = data.dataset

        assert self.data is not None
        num_users = self.data.num_user
        num_items = self.data.num_item

        self.embedding_user = nn.Embedding(
            num_embeddings=num_users, embedding_dim=self.latent_dim
        )
        self.embedding_item = nn.Embedding(
            num_embeddings=num_items, embedding_dim=self.latent_dim
        )

        self.compute_explainability()

        # Built after the embeddings exist so their parameters are optimised.
        self.optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
        )

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                train_loader = self.instance_a_train_loader(self.batch_size)
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def compute_explainability(self):
        """Fill ``self.sim_users`` and ``self.explainability_matrix``.

        For every user the ``knn`` most cosine-similar other users are found;
        the explainability score of an item is the sum of those neighbours'
        ratings that reach ``positive_threshold``. The matrix is min-max
        scaled and stored as a torch tensor.
        """
        assert self.dataset is not None
        assert self.data is not None
        ratings_table = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_table))
        # Strictly below every similarity, so a user never ranks as their own neighbour.
        min_val = sim_matrix.min() - 1

        # Start from a clean mapping so a refit cannot keep stale neighbours.
        self.sim_users = {}
        for i in range(self.data.num_user):
            sim_matrix[i, i] = min_val
            self.sim_users[i] = (-sim_matrix[i, :]).argsort()[: self.knn]

        explainability = np.zeros((self.data.num_user, self.data.num_item))
        positive = self.dataset[self.dataset["rating"] >= self.positive_threshold]

        for i in range(self.data.num_user):
            # NOTE(review): neighbour indices are row positions of the pivot
            # table; this assumes userId values equal those positions - confirm.
            by_neighbours = positive[positive["userId"].isin(self.sim_users[i])]
            sim_scores = by_neighbours.groupby(by="itemId")["rating"].sum().reset_index()
            explainability[i, sim_scores.itemId.astype(int)] = (
                sim_scores.rating.to_list()
            )

        explainability = MinMaxScaler().fit_transform(explainability)
        self.explainability_matrix = torch.from_numpy(explainability)

    def instance_a_train_loader(self, batch_size):
        """Wrap the stored ratings in a shuffling ``DataLoader`` of (user, item, rating)."""
        assert self.dataset is not None
        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(self.dataset.userId.values),
            item_tensor=torch.LongTensor(self.dataset.itemId.values),
            target_tensor=torch.FloatTensor(self.dataset.rating.values),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Returns 0.0 when the loader yields no batches.
        """
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            total_loss += self.train_single_batch(user, item, rating.float())
            cnt += 1
        if cnt == 0:
            # Nothing to average; avoid a ZeroDivisionError on empty data.
            return 0.0
        return total_loss / cnt

    def train_single_batch(self, users, items, ratings):
        """Take one optimisation step on a batch and return the loss as a float."""
        # NOTE(review): presumably PyTorchModel stores the ``cuda`` flag as
        # an attribute of the same name - confirm.
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()

        assert self.optimizer is not None
        self.optimizer.zero_grad()

        ratings_pred = self(users, items)

        assert self.embedding_user is not None
        user_embeddings = self.embedding_user(users)
        assert self.embedding_item is not None
        item_embeddings = self.embedding_item(items)

        assert self.explainability_matrix is not None
        loss = self.criterion(
            ratings_pred=ratings_pred,
            ratings=ratings,
            u=user_embeddings,
            v=item_embeddings,
            reg_term=self.reg_term,
            expl=self.explainability_matrix[users, items],
            expl_reg_term=self.expl_reg_term,
        )
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def forward(self, user_indices, item_indices):
        """Score user/item pairs: element-wise product of factors, then a linear layer."""
        assert self.embedding_user is not None
        user_embeddings = self.embedding_user(user_indices)
        assert self.embedding_item is not None
        item_embeddings = self.embedding_item(item_indices)
        element_product = torch.mul(user_embeddings, item_embeddings)
        rating = self.affine_output(element_product)
        return rating
|
||||
@@ -0,0 +1,165 @@
|
||||
import random
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.optim import Optimizer
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from pygrex.data_reader import DataReader, UserItemRatingDataset
|
||||
from pygrex.utils.torch_utils import use_optimizer
|
||||
from .py_torch_model import PyTorchModel
|
||||
|
||||
|
||||
class GMFModel(PyTorchModel):
    """Generalised matrix factorisation trained with sampled negative items.

    Each observed rating is paired with ``num_negative`` items the user has
    not interacted with (target 0). The score is a sigmoid over a linear
    layer applied to the element-wise product of the user and item
    embeddings, trained with binary cross-entropy.
    """

    def __init__(
        self,
        learning_rate: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        num_negative: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        """Store the hyper-parameters; the embeddings are created in ``fit``.

        Args:
            learning_rate: Learning rate handed to ``use_optimizer``.
            weight_decay: Weight decay handed to ``use_optimizer``.
            latent_dim: Number of latent factors.
            epochs: Number of training epochs.
            num_negative: Negative items sampled per observed rating.
            batch_size: Mini-batch size of the training loader.
            cuda: Forwarded to ``PyTorchModel``.
            optimizer_name: Optimizer name handed to ``use_optimizer``.
            device_id: Forwarded to ``PyTorchModel``.
        """
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )

        self.negative_sample_size = num_negative
        self.weight_decay = weight_decay
        self.optimizer: Optimizer | None = None

        self.affine_output = torch.nn.Linear(
            in_features=self.latent_dim, out_features=1
        )
        self.logistic = torch.nn.Sigmoid()

        self.criterion = nn.BCELoss()

    def fit(self, data: DataReader):
        """Create the embeddings for ``data`` and train for ``self.epochs`` epochs.

        Args:
            data: Source of ``dataset``, ``num_user`` and ``num_item``.

        Raises:
            TypeError: If ``use_optimizer`` does not return an ``Optimizer``.
        """
        dataset = data.dataset

        num_users = data.num_user
        num_items = data.num_item

        self.embedding_user = torch.nn.Embedding(
            num_embeddings=num_users, embedding_dim=self.latent_dim
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=num_items, embedding_dim=self.latent_dim
        )

        # The optimizer must be built AFTER the embeddings exist: it captures
        # the network's parameters when it is constructed, so building it
        # first would leave the embeddings out of every update.
        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )
        if not isinstance(optimizer, Optimizer):
            raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
        self.optimizer = optimizer

        self.negatives = self._sample_negative(dataset)

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                # Negatives are re-sampled for every epoch.
                train_loader = self.instance_a_train_loader(
                    dataset, self.negative_sample_size, self.batch_size
                )
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def instance_a_train_loader(self, dataset, num_negatives, batch_size):
        """Build the training loader for one epoch.

        Every rating row contributes itself plus up to ``num_negatives``
        randomly chosen items the user has not interacted with, each with
        target 0. A user with fewer unseen items contributes all of them.

        Args:
            dataset: Ratings frame with ``userId``, ``itemId`` and ``rating``.
            num_negatives: Negative items to draw per rating row.
            batch_size: Mini-batch size of the returned loader.

        Returns:
            A shuffling ``DataLoader`` over (user, item, target) tensors.
        """
        train_ratings = pd.merge(
            dataset, self.negatives[["userId", "negative_items"]], on="userId"
        )

        users, items, ratings = [], [], []
        for user, item, rating, pool in zip(
            train_ratings["userId"].tolist(),
            train_ratings["itemId"].tolist(),
            train_ratings["rating"].tolist(),
            train_ratings["negative_items"].tolist(),
        ):
            users.append(user)
            items.append(item)
            ratings.append(rating)

            candidates = list(pool)
            # random.sample raises ValueError when asked for more elements
            # than the population holds, so cap the request.
            sampled = random.sample(candidates, min(num_negatives, len(candidates)))
            users.extend([user] * len(sampled))
            items.extend(sampled)
            ratings.extend([0.0] * len(sampled))  # negative samples get 0 rating

        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Returns 0.0 when the loader yields no batches.
        """
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            total_loss += self.train_single_batch(user, item, rating.float())
            cnt += 1
        if cnt == 0:
            # Nothing to average; avoid a ZeroDivisionError on empty data.
            return 0.0
        return total_loss / cnt

    def train_single_batch(self, users, items, ratings):
        """Take one optimisation step on a batch and return the loss as a float.

        Raises:
            RuntimeError: If no optimizer has been created yet.
        """
        # NOTE(review): presumably PyTorchModel stores the ``cuda`` flag as
        # an attribute of the same name - confirm.
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()

        if self.optimizer is None:
            raise RuntimeError(
                "Optimizer is not initialized. Call fit() before training."
            )
        self.optimizer.zero_grad()
        ratings_pred = self(users, items)
        loss = self.criterion(ratings_pred.view(-1), ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _sample_negative(self, ratings):
        """Return, per user, the set of catalogue items the user has not interacted with.

        Also stores the set of all item ids on ``self.item_catalogue``.

        Returns:
            A frame with columns ``userId`` and ``negative_items``.
        """
        interact_status = (
            ratings.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        self.item_catalogue = set(ratings.itemId)
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda x: self.item_catalogue - x
        )
        return interact_status[["userId", "negative_items"]]

    def forward(self, user_indices, item_indices):
        """Score user/item pairs with a sigmoid over the weighted factor product."""
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_product = torch.mul(user_embedding, item_embedding)
        dot = self.affine_output(element_product)
        rating = self.logistic(dot)
        return rating
|
||||
@@ -0,0 +1,22 @@
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class Item2Vec(nn.Module):
    """Item embedding table followed by a linear layer back onto the item space."""

    def __init__(self, config):
        """Build the layers from ``config['num_items']`` and ``config['latent_dim']``."""
        super().__init__()
        n_items = config['num_items']
        dim = config['latent_dim']
        self.num_items = n_items
        self.latent_dim = dim
        # Embedding first, then the linear layer, so parameters are created
        # in the same order as before.
        self.embedding = nn.Embedding(n_items, dim)
        self.fc = nn.Linear(dim, n_items)

    def forward(self, input_data):
        """Look up the embeddings of ``input_data`` and project them to one score per item."""
        looked_up = self.embedding(input_data)
        scores = self.fc(looked_up)
        return scores

    def item_embedding(self):
        """Return the embedding weights detached from the autograd graph."""
        weights = self.embedding.weight
        return weights.detach()
|
||||
@@ -0,0 +1,240 @@
|
||||
from typing import Optional, Union
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from .recommender_model import RecommenderModel
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
class KNNBasic(RecommenderModel):
|
||||
"""
|
||||
An improved K-Nearest Neighbors collaborative filtering model.
|
||||
|
||||
This version uses Pearson correlation similarity and improved neighbor selection
|
||||
for better performance on sparse datasets like MovieLens.
|
||||
|
||||
Args:
|
||||
k (int): Number of neighbors to consider. Default 50.
|
||||
min_k (int): Minimum number of neighbors required for prediction. Default 3.
|
||||
sim_options (dict): Similarity options. Default pearson, user-based.
|
||||
"""
|
||||
|
||||
def __init__(self, k: int = 50, min_k: int = 3, sim_options: Optional[dict] = None):
    """Validate the similarity options and declare the fitted-state attributes.

    Raises:
        NotImplementedError: If ``sim_options`` sets ``user_based`` to
            False, or names a similarity other than cosine or pearson.
    """
    super().__init__()
    self.k = k
    self.min_k = min_k
    self.sim_options = {} if sim_options is None else sim_options

    # Only an explicit False disables the user-based approach.
    user_based = self.sim_options.get("user_based", True)
    if user_based is False:
        raise NotImplementedError("Only the user-based approach is implemented.")

    # The similarity name is matched case-insensitively; pearson is the default.
    requested = self.sim_options.get("name", "pearson").lower()
    if requested not in ("cosine", "pearson"):
        raise NotImplementedError(
            "Only cosine and pearson similarity are implemented."
        )

    # State filled in by fit().
    self.trainset: Optional[sp.csr_matrix] = None
    self.global_mean: float = 0
    self.user_biases: Optional[np.ndarray] = None
    self.item_biases: Optional[np.ndarray] = None
    self.num_users: Optional[int] = None
    self.num_items: Optional[int] = None

    # Per-user mean rating, stored by fit().
    self.user_means: Optional[np.ndarray] = None
|
||||
|
||||
def fit(self, data: DataReader) -> None:
    """Build the sparse ratings matrix and derive the global mean and biases.

    Stores the user-by-item CSR matrix, the mean of all stored ratings, the
    per-user and per-item deviations from that mean, and the per-user mean
    ratings. Users or items without ratings get the global mean and a zero
    bias.
    """
    print("Fitting the improved KNNBasic model...")
    frame = data.dataset
    self.num_users = data.num_user
    self.num_items = data.num_item

    print(
        f"Building ratings matrix for {self.num_users} users and {self.num_items} items..."
    )

    # Rows are users, columns are items.
    coordinates = (frame["userId"].values, frame["itemId"].values)
    self.trainset = sp.csr_matrix(
        (frame["rating"].values, coordinates),
        shape=(self.num_users, self.num_items),
    )
    matrix = self.trainset

    print("Computing biases...")
    overall = matrix.data.mean()
    self.global_mean = overall

    def mean_and_bias(totals, counts):
        # Average where ratings exist, otherwise fall back to the global
        # mean; the bias is the distance from the global mean.
        rated = counts > 0
        with np.errstate(divide="ignore", invalid="ignore"):
            means = np.where(rated, totals / counts, overall)
        return means, np.where(rated, means - overall, 0)

    # Stored entries per row come straight from the CSR index pointer.
    user_totals = np.array(matrix.sum(axis=1)).flatten()
    user_avg_ratings, self.user_biases = mean_and_bias(
        user_totals, np.diff(matrix.indptr)
    )

    # Column counts need the CSC layout.
    item_totals = np.array(matrix.sum(axis=0)).flatten()
    _, self.item_biases = mean_and_bias(
        item_totals, np.diff(matrix.tocsc().indptr)
    )

    self.user_means = user_avg_ratings

    print("Model fitting complete.")
|
||||
|
||||
def _compute_user_similarity(self, user1_id: int, user2_id: int) -> float:
|
||||
"""
|
||||
Compute Pearson correlation similarity between two users.
|
||||
This works better than cosine similarity for collaborative filtering.
|
||||
"""
|
||||
assert self.trainset is not None
|
||||
# Get rating vectors for both users
|
||||
user1_ratings = self.trainset[user1_id].toarray().flatten()
|
||||
user2_ratings = self.trainset[user2_id].toarray().flatten()
|
||||
|
||||
# Find commonly rated items
|
||||
mask = (user1_ratings > 0) & (user2_ratings > 0)
|
||||
n_common = np.sum(mask)
|
||||
|
||||
# Need at least 2 common ratings for correlation
|
||||
if n_common < 2:
|
||||
return 0.0
|
||||
|
||||
# Extract ratings for commonly rated items
|
||||
u1_common = user1_ratings[mask]
|
||||
u2_common = user2_ratings[mask]
|
||||
|
||||
# Mean-center the ratings
|
||||
u1_mean = np.mean(u1_common)
|
||||
u2_mean = np.mean(u2_common)
|
||||
|
||||
u1_centered = u1_common - u1_mean
|
||||
u2_centered = u2_common - u2_mean
|
||||
|
||||
# Compute Pearson correlation
|
||||
numerator = np.sum(u1_centered * u2_centered)
|
||||
denom1 = np.sqrt(np.sum(u1_centered**2))
|
||||
denom2 = np.sqrt(np.sum(u2_centered**2))
|
||||
|
||||
if denom1 == 0 or denom2 == 0:
|
||||
return 0.0
|
||||
|
||||
correlation = numerator / (denom1 * denom2)
|
||||
|
||||
# Apply significance weighting based on number of common items
|
||||
# More common items = more reliable similarity
|
||||
significance_weight = min(n_common / 50.0, 1.0) # Cap at 50 common items
|
||||
|
||||
return correlation * significance_weight
|
||||
|
||||
def _get_neighbors_for_item(self, user_id: int, item_id: int):
|
||||
"""
|
||||
Get the top-k most similar users who have rated the given item.
|
||||
"""
|
||||
# Find users who rated this item
|
||||
assert self.trainset is not None
|
||||
item_col = self.trainset[:, item_id] # type: ignore
|
||||
neighbor_candidates, _ = item_col.nonzero()
|
||||
|
||||
# Remove the target user if they're in the candidates
|
||||
neighbor_candidates = neighbor_candidates[neighbor_candidates != user_id]
|
||||
|
||||
if len(neighbor_candidates) == 0:
|
||||
return np.array([]), np.array([]), np.array([])
|
||||
|
||||
# Compute similarities
|
||||
similarities = []
|
||||
for neighbor_id in neighbor_candidates:
|
||||
sim = self._compute_user_similarity(user_id, neighbor_id)
|
||||
similarities.append((sim, neighbor_id))
|
||||
|
||||
# Sort by similarity and take top-k
|
||||
similarities.sort(key=lambda x: x[0], reverse=True)
|
||||
top_k = similarities[: min(self.k, len(similarities))]
|
||||
|
||||
if len(top_k) < self.min_k:
|
||||
return np.array([]), np.array([]), np.array([])
|
||||
|
||||
# Extract data
|
||||
neighbor_sims = np.array([sim for sim, _ in top_k])
|
||||
neighbor_ids = np.array([nid for _, nid in top_k])
|
||||
neighbor_ratings = np.array(
|
||||
[self.trainset[nid, item_id] for nid in neighbor_ids]
|
||||
)
|
||||
|
||||
return neighbor_sims, neighbor_ids, neighbor_ratings
|
||||
|
||||
def predict(self, user_id: Union[int, str], item_id: Union[int, str]) -> float:
    """
    Predict the rating of ``item_id`` by ``user_id`` with baseline-corrected KNN.

    Args:
        user_id: User index, or anything ``int()`` accepts.
        item_id: Item index, or anything ``int()`` accepts.

    Returns:
        The global mean when either id lies outside the fitted matrix
        (negative ids included), the baseline estimate
        ``global_mean + user_bias + item_bias`` when no usable neighbor
        exists, and otherwise the baseline plus the similarity-weighted
        mean neighbor deviation, clipped to ``[1.0, 5.0]``.

    Raises:
        RuntimeError: If the model has not been fitted.
    """
    if (
        self.trainset is None
        or self.num_users is None
        or self.num_items is None
        or self.user_biases is None
        or self.item_biases is None
    ):
        raise RuntimeError("Model must be trained first using fit() method.")

    user_id = int(user_id)
    item_id = int(item_id)

    # Unknown ids get the global mean. Negative ids are rejected as well:
    # numpy would otherwise wrap them around to the last users/items.
    if not (0 <= user_id < self.num_users and 0 <= item_id < self.num_items):
        return float(self.global_mean)

    # 1. Baseline estimate
    # NOTE(review): the baseline fallbacks below are returned unclipped, only
    # the neighbor-based prediction is clipped to [1, 5] - confirm intended.
    baseline = (
        self.global_mean + self.user_biases[user_id] + self.item_biases[item_id]
    )

    # 2. Neighbors who rated this item
    neighbor_sims, neighbor_ids, neighbor_ratings = self._get_neighbors_for_item(
        user_id, item_id
    )
    if len(neighbor_ids) == 0:
        return float(baseline)

    # 3. Deviation of every neighbor's rating from that neighbor's own baseline
    neighbor_baselines = (
        self.global_mean + self.user_biases[neighbor_ids] + self.item_biases[item_id]
    )
    deviations = neighbor_ratings - neighbor_baselines

    # Only positively correlated neighbors contribute.
    positive_mask = neighbor_sims > 0
    if not np.any(positive_mask):
        return float(baseline)

    neighbor_sims = neighbor_sims[positive_mask]
    deviations = deviations[positive_mask]

    denominator = np.sum(np.abs(neighbor_sims))
    if denominator == 0:
        return float(baseline)

    prediction = baseline + np.sum(neighbor_sims * deviations) / denominator

    # Clip to valid rating range
    return float(np.clip(prediction, 1.0, 5.0))
|
||||
@@ -0,0 +1,136 @@
|
||||
import numpy as np
|
||||
import scipy
|
||||
from typing import Union, Protocol, runtime_checkable
|
||||
|
||||
from implicit.recommender_base import RecommenderBase
|
||||
from .recommender_model import RecommenderModel
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
@runtime_checkable
class FittableImplicitModel(Protocol):
    """
    Structural type for an estimator that can be fitted and exposes factors.

    Any object with ``user_factors`` and ``item_factors`` attributes and a
    ``fit`` method satisfies it. Because the protocol is runtime-checkable,
    ``isinstance`` only verifies that these members exist, not their values.
    """

    # Learned latent vectors; annotated as arrays.
    user_factors: np.ndarray
    item_factors: np.ndarray

    def fit(self, item_user_data) -> None:
        ...
|
||||
|
||||
|
||||
class MFImplicitModel(RecommenderModel):
    """
    Shared fit/predict plumbing for matrix-factorisation models on implicit feedback.

    ``self.model`` starts out as ``None``; something else (the error message
    in ``fit`` names subclasses such as ALS or BPR) must assign an estimator
    to it. This class builds the sparse interaction matrix, fits the
    estimator and scores user/item pairs from its learned factors.
    """

    def __init__(
        self,
        latent_dim,
        reg_term,
        learning_rate,
        epochs,
        num_users=None,
        num_items=None,
    ):
        """Store the hyper-parameters; no estimator is created here."""
        self.latent_dim = latent_dim
        self.reg_term = reg_term
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.model: Union[RecommenderBase, FittableImplicitModel, None] = None
        self.total_users = num_users
        self.total_items = num_items

    def _trained_model(self) -> FittableImplicitModel:
        """
        Return ``self.model`` once it actually holds learned factors.

        The protocol ``isinstance`` check alone only proves that the factor
        attributes exist, so an estimator whose factors are still ``None``
        would pass it; the explicit ``None`` checks close that gap.

        Raises:
            RuntimeError: If no estimator is set or its factors are missing.
        """
        model = self.model
        if (
            not isinstance(model, FittableImplicitModel)
            or model.user_factors is None
            or model.item_factors is None
        ):
            raise RuntimeError(
                "The model has not been trained yet. Please call fit() first."
            )
        return model

    def fit(self, data: DataReader) -> None:
        """
        Fit the wrapped estimator on the interactions in ``data.dataset``.

        Raises:
            RuntimeError: If ``self.model`` has not been assigned.
        """
        if self.model is None:
            raise RuntimeError(
                "The model has not been initialized. Please use a specific subclass like ALS or BPR."
            )

        # The matrix shape comes from the largest ids present in the data and
        # overwrites whatever num_users / num_items the constructor received.
        num_user_for_shape = data.dataset["userId"].max() + 1
        num_item_for_shape = data.dataset["itemId"].max() + 1
        self.total_users = num_user_for_shape
        self.total_items = num_item_for_shape

        # NOTE(review): the matrix is transposed to item x user before fitting;
        # confirm that this orientation matches what the installed estimator's
        # fit() expects, since predict() indexes user_factors by user id.
        item_user_data = self.rearrange_dataset(
            ds=data.dataset,
            num_user=num_user_for_shape,
            num_item=num_item_for_shape,
        ).T.tocsr()

        self.model.fit(item_user_data)

    @staticmethod
    def rearrange_dataset(ds, num_user: int, num_item: int) -> scipy.sparse.csr_matrix:
        """
        Convert the interaction table into a sparse user x item matrix.

        Args:
            ds: Dataset containing ``userId`` and ``itemId`` columns.
            num_user: Number of rows of the resulting matrix.
            num_item: Number of columns of the resulting matrix.

        Returns:
            CSR matrix of shape ``(num_user, num_item)`` holding a 1 for each
            row of ``ds``.
        """
        interactions = np.ones(len(ds))  # one unit of feedback per row
        rows = ds["userId"].values
        cols = ds["itemId"].values

        return scipy.sparse.csr_matrix(
            (interactions, (rows, cols)), shape=(num_user, num_item)
        )

    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int, list, np.ndarray]
    ) -> Union[float, list]:
        """
        Score one or more items for a user as factor dot products.

        Args:
            user_id: User identifier (converted with ``int()``).
            item_id: A single item identifier, or a list/tuple/array of them.

        Returns:
            A single ``float`` for a scalar ``item_id``; otherwise a ``list``
            of scores in the order of the given items.

        Raises:
            RuntimeError: If the model has not been trained.
            ValueError: If the user id or any item id is out of bounds.
        """
        model = self._trained_model()
        user_id = int(user_id)

        # 1. Validate user_id
        if not (0 <= user_id < model.user_factors.shape[0]):
            raise ValueError(f"user_id {user_id} is out of bounds")

        # 2. Normalise the items to a 1-D integer array
        is_single_item = not isinstance(item_id, (list, tuple, np.ndarray))
        item_ids_arr = np.array(item_id, ndmin=1).astype(int)

        # 3. One vectorised bounds check for all items
        max_item_id = model.item_factors.shape[0]
        out_of_bounds = (item_ids_arr < 0) | (item_ids_arr >= max_item_id)
        if np.any(out_of_bounds):
            raise ValueError(
                f"item_id {item_ids_arr[out_of_bounds][0]} is out of bounds"
            )

        # 4. Gather the vectors and score everything with one dot product
        item_vectors = model.item_factors[item_ids_arr]
        user_vector = model.user_factors[user_id]
        scores = user_vector.dot(item_vectors.T)

        # 5. Scalar in, scalar out; otherwise a plain list
        return scores[0].item() if is_single_item else scores.tolist()

    def user_embedding(self) -> np.ndarray:
        """Return the learned user factors; raises RuntimeError if untrained."""
        return self._trained_model().user_factors

    def item_embedding(self) -> np.ndarray:
        """Return the learned item factors; raises RuntimeError if untrained."""
        return self._trained_model().item_factors
|
||||
@@ -0,0 +1,179 @@
|
||||
import random
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.optim import Optimizer
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from pygrex.data_reader import DataReader, UserItemRatingDataset
|
||||
from pygrex.utils.torch_utils import use_optimizer
|
||||
from .py_torch_model import PyTorchModel
|
||||
|
||||
|
||||
class MLPModel(PyTorchModel):
    """
    Embedding-concatenation recommender trained with negative sampling.

    User and item embeddings are concatenated, passed through one linear
    layer and squashed with a sigmoid; training minimises binary
    cross-entropy over observed interactions plus sampled negatives.
    """

    def __init__(
        self,
        learning_rate: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        num_negative: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        """Store hyper-parameters and create the output layer and loss."""
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )

        self.negative_sample_size = num_negative
        self.weight_decay = weight_decay

        # layer dim is 2*self.latent_dim since the embeddings will be concatenated
        self.affine_output = torch.nn.Linear(
            in_features=2 * self.latent_dim, out_features=1
        )
        self.logistic = torch.nn.Sigmoid()

        # NOTE(review): BCELoss expects targets in [0, 1]; presumably the
        # dataset's "rating" column is already binary - confirm with callers.
        self.criterion = nn.BCELoss()
        self.optimizer: Optimizer | None = None

    def fit(self, data: DataReader):
        """
        Create the embedding tables for ``data`` and train for ``self.epochs`` epochs.

        Raises:
            TypeError: If ``use_optimizer`` does not return a torch ``Optimizer``.
        """
        dataset = data.dataset

        # The embedding layers must exist before the optimizer is built.
        # NOTE(review): this assumes use_optimizer collects the network's
        # parameters at call time (its body is not visible here); if so,
        # building it first would leave the embeddings untrained.
        self.embedding_user = torch.nn.Embedding(
            num_embeddings=data.num_user, embedding_dim=self.latent_dim
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=data.num_item, embedding_dim=self.latent_dim
        )

        # Batches are moved to the GPU when the flag is set, so the layers
        # have to live there as well.
        if self._cuda:
            self.cuda()

        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )
        if not isinstance(optimizer, Optimizer):
            raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
        self.optimizer = optimizer

        self.negatives = self._sample_negative(dataset)

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                # Negatives are re-drawn every epoch.
                train_loader = self.instance_a_train_loader(
                    dataset, self.negative_sample_size, self.batch_size
                )
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def instance_a_train_loader(self, dataset, num_negatives, batch_size):
        """
        Build the shuffled training loader for one epoch.

        Every observed row of ``dataset`` is followed by up to
        ``num_negatives`` items the same user never interacted with, each
        labelled 0.
        """
        train_ratings = pd.merge(
            dataset, self.negatives[["userId", "negative_items"]], on="userId"
        )
        # Users with fewer unseen items than requested contribute all of
        # them; asking random.sample for more than exist would raise.
        train_ratings["negatives"] = train_ratings["negative_items"].apply(
            lambda pool: random.sample(list(pool), min(num_negatives, len(pool)))
        )

        users, items, ratings = [], [], []
        for user, item, rating, negatives in zip(
            train_ratings["userId"].tolist(),
            train_ratings["itemId"].tolist(),
            train_ratings["rating"].tolist(),
            train_ratings["negatives"].tolist(),
        ):
            users.append(user)
            items.append(item)
            ratings.append(rating)
            for neg_item in negatives:
                users.append(user)
                items.append(neg_item)
                ratings.append(float(0))  # negative samples get 0 rating

        samples = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(samples, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader``; return the mean batch loss (0.0 if empty)."""
        self.train()
        batch_count = 0
        total_loss = 0
        for batch in train_loader:
            user, item, rating = batch[0], batch[1], batch[2]
            total_loss += self.train_single_batch(user, item, rating.float())
            batch_count += 1
        return total_loss / batch_count if batch_count else 0.0

    def train_single_batch(self, users, items, ratings):
        """
        Perform one optimisation step and return the batch loss as a float.

        Raises:
            RuntimeError: If called before ``fit`` created the optimizer.
        """
        # ``self.cuda`` is nn.Module's method; the boolean flag is ``_cuda``.
        if self._cuda:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        if self.optimizer is None:
            raise RuntimeError(
                "Optimizer is not initialized. Call fit() before training."
            )

        self.optimizer.zero_grad()
        ratings_pred = self(users, items)
        loss = self.criterion(ratings_pred.view(-1), ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _sample_negative(self, ratings):
        """
        Return, per user, the set of catalogue items the user never interacted with.

        Also stores the set of all item ids seen in ``ratings`` as
        ``self.item_catalogue``. The result has the columns ``userId`` and
        ``negative_items``.
        """
        interact_status = (
            ratings.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        self.item_catalogue = set(ratings.itemId)
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda seen: self.item_catalogue - seen
        )
        return interact_status[["userId", "negative_items"]]

    def forward(self, user_indices, item_indices):
        """Return the sigmoid score, one row per user/item pair."""
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)

        # Ensure embeddings are 2D [batch_size, embedding_dim]
        if user_embedding.dim() == 3:
            user_embedding = user_embedding.squeeze(1)
        if item_embedding.dim() == 3:
            item_embedding = item_embedding.squeeze(1)

        # torch.cat does not broadcast, so a single user (or item) is
        # repeated to match the other side's batch size.
        if user_embedding.size(0) == 1 and item_embedding.size(0) > 1:
            user_embedding = user_embedding.repeat(item_embedding.size(0), 1)
        elif item_embedding.size(0) == 1 and user_embedding.size(0) > 1:
            item_embedding = item_embedding.repeat(user_embedding.size(0), 1)

        element_concat = torch.cat((user_embedding, item_embedding), 1)
        return self.logistic(self.affine_output(element_concat))
|
||||
@@ -0,0 +1,69 @@
|
||||
import itertools
|
||||
from typing import Union
|
||||
import torch
|
||||
|
||||
from pygrex.utils.torch_utils import use_cuda
|
||||
from .recommender_model import RecommenderModel
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
class PyTorchModel(RecommenderModel, torch.nn.Module):
    """
    Common base for recommenders implemented as ``torch.nn.Module``.

    It stores the shared hyper-parameters and provides ``predict`` and the
    embedding accessors. Subclasses are expected to provide ``forward``,
    to create ``embedding_user`` / ``embedding_item`` and to override
    ``fit``, which is a no-op here.
    """

    def __init__(
        self,
        learning_rate: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id: Union[int, None] = None,
    ):
        """
        Validate the optimizer name and store the hyper-parameters.

        Raises:
            ValueError: If ``optimizer_name`` is not one of
                ``"sgd"``, ``"adam"`` or ``"rmsprop"``.
        """
        if optimizer_name not in ("sgd", "adam", "rmsprop"):
            raise ValueError("Wrong optimizer.")

        # nn.Module must be initialised before attributes are assigned on it.
        super().__init__()

        if cuda is True and device_id is not None:
            use_cuda(True, device_id)

        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        # Stored as ``_cuda`` so that nn.Module.cuda() is not shadowed.
        self._cuda = cuda
        self.optimizer_name = optimizer_name

        self.dataset = None
        self.dataset_metadata = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer = None

    def fit(self, data: DataReader):
        """Do nothing; concrete models override this with their training loop."""
        pass

    @staticmethod
    def _as_index_tensor(ids) -> torch.Tensor:
        """
        Turn a scalar id or a sequence of ids into a 1-D ``long`` tensor.

        Numeric strings are converted with ``int()``; scalars of any integer
        type become one-element tensors.
        """
        if isinstance(ids, str):
            ids = int(ids)
        return torch.as_tensor(ids, dtype=torch.long).reshape(-1)

    def predict(self, user_id, item_id) -> list:
        """
        Score user/item pairs with ``forward`` under ``torch.no_grad()``.

        Args:
            user_id: A single user id or a sequence of ids.
            item_id: A single item id or a sequence of ids.

        Returns:
            Flat list of scores; ``forward`` is expected to return one row
            per pair, and the rows are chained together.
        """
        user_id = self._as_index_tensor(user_id)
        item_id = self._as_index_tensor(item_id)
        with torch.no_grad():
            if self._cuda:
                user_id = user_id.cuda()
                item_id = item_id.cuda()
            pred = self.forward(user_id, item_id).cpu().tolist()
        return list(itertools.chain.from_iterable(pred))

    def user_embedding(self):
        """Return the user embedding weights as a numpy array."""
        return self.state_dict()["embedding_user.weight"].cpu().numpy()

    def item_embedding(self):
        """Return the item embedding weights as a numpy array."""
        return self.state_dict()["embedding_item.weight"].cpu().numpy()
|
||||
@@ -0,0 +1,35 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Union
|
||||
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
|
||||
|
||||
class RecommenderModel(ABC):
    """
    Interface every recommendation model has to implement.

    Concrete models derive from this class and supply both ``fit`` and
    ``predict``; the class itself cannot be instantiated.
    """

    @abstractmethod
    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int]
    ) -> Union[float, list]:
        """
        Predict the rating/score of an item for a user.

        Args:
            user_id: The ID of the user.
            item_id: The ID of the item to score.

        Returns:
            A predicted rating/score as a float, or a list of them,
            depending on the implementation.
        """

    @abstractmethod
    def fit(self, data: DataReader):
        """
        Train the model on ``data``.

        What exactly is learned depends on the implementation.
        """
|
||||
@@ -0,0 +1,169 @@
|
||||
from math import sqrt
|
||||
import numpy as np
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
from pygrex.models.recommender_model import RecommenderModel
|
||||
|
||||
|
||||
class SVD(RecommenderModel):
    """
    Biased matrix factorisation trained with stochastic gradient descent.

    A rating is modelled as
    ``global_mean + user_bias + item_bias + user_factors . item_factors``.
    """

    def __init__(
        self,
        n_factors=50,
        n_epochs=25,
        lr=0.007,
        reg=0.1,
        init_mean=0.0,
        init_std=0.1,
        random_state=42,
        early_stopping=True,
    ):
        """
        Store the hyper-parameters; parameters are created in ``fit``.

        Args:
            n_factors: Number of latent factors.
            n_epochs: Maximum number of passes over the training data.
            lr: SGD learning rate.
            reg: L2 regularisation strength for biases and factors.
            init_mean: Mean of the normal factor initialisation.
            init_std: Stored, but ``fit`` draws the factors with standard
                deviation ``1 / sqrt(n_factors)`` instead.
            random_state: Seed for initialisation and shuffling.
            early_stopping: Enable validation-based early stopping.
        """
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg = reg
        self.init_mean = init_mean
        self.init_std = init_std
        self.random_state = random_state
        self.early_stopping = early_stopping

        # Model parameters
        self.user_factors = None
        self.item_factors = None
        self.user_biases = None
        self.item_biases = None
        self.global_mean = None

        # Training history
        self.training_rmse = []

    def fit(self, data: DataReader, validation_data=None):
        """
        Learn biases and factors from ``data.dataset``.

        Args:
            data: Reader whose ``dataset`` has the columns ``userId``,
                ``itemId`` and ``rating``.
            validation_data: Optional iterable of ``(user, item, rating)``
                triples used for early stopping.

        Raises:
            ValueError: If the reader does not know the number of users/items.
        """
        df = data.dataset
        if data._num_user is None or data._num_item is None:
            raise ValueError("The number of users and items cannot be None.")
        num_users, num_items = data._num_user, data._num_item

        # Initialize random number generator
        rng = np.random.RandomState(self.random_state)

        # Factors are drawn with a spread that shrinks with the factor count.
        scale = 1.0 / sqrt(self.n_factors)
        self.user_factors = rng.normal(
            self.init_mean, scale, (num_users, self.n_factors)
        )  # type: ignore
        self.item_factors = rng.normal(
            self.init_mean, scale, (num_items, self.n_factors)
        )  # type: ignore
        self.user_biases = np.zeros(num_users)
        self.item_biases = np.zeros(num_items)
        self.global_mean = df["rating"].mean()

        # Every fit starts from fresh parameters, so the history restarts too.
        self.training_rmse = []

        # Convert to list of tuples for faster iteration
        ratings_tuple = list(
            df[["userId", "itemId", "rating"]].itertuples(index=False, name=None)
        )

        # Early-stopping state
        best_rmse = float("inf")
        patience = 3
        patience_counter = 0

        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}...")

            # Shuffle training data
            rng.shuffle(ratings_tuple)

            for user, item, rating in ratings_tuple:
                dot_product = np.dot(self.user_factors[user], self.item_factors[item])
                prediction = (
                    self.global_mean
                    + self.user_biases[user]
                    + self.item_biases[item]
                    + dot_product
                )
                error = rating - prediction

                # Update biases
                self.user_biases[user] += self.lr * (
                    error - self.reg * self.user_biases[user]
                )
                self.item_biases[item] += self.lr * (
                    error - self.reg * self.item_biases[item]
                )

                # Update factors; the item update must use the user vector
                # from before this step, hence the copy.
                uf_temp = self.user_factors[user].copy()
                self.user_factors[user] += self.lr * (
                    error * self.item_factors[item] - self.reg * self.user_factors[user]
                )
                self.item_factors[item] += self.lr * (
                    error * uf_temp - self.reg * self.item_factors[item]
                )

            # Evaluate every fifth epoch and on the last one.
            if epoch % 5 == 0 or epoch == self.n_epochs - 1:
                train_rmse = self.calculate_rmse(ratings_tuple)
                self.training_rmse.append(train_rmse)
                print(f" Training RMSE: {train_rmse:.4f}")

                # NOTE(review): the early-stopping check is placed inside the
                # periodic evaluation; confirm that it is not meant to run
                # after every epoch.
                if self.early_stopping and validation_data is not None:
                    val_rmse = self.calculate_rmse(validation_data)
                    print(f" Validation RMSE: {val_rmse:.4f}")

                    if val_rmse < best_rmse:
                        best_rmse = val_rmse
                        patience_counter = 0
                    else:
                        patience_counter += 1

                    if patience_counter >= patience:
                        print(f"Early stopping at epoch {epoch + 1}")
                        break

        print("Fit complete.")

    def calculate_rmse(self, ratings_data):
        """Return the RMSE of ``predict`` over ``(user, item, rating)`` triples (0 if empty)."""
        total_error = 0
        count = 0

        for user, item, rating in ratings_data:
            total_error += (rating - self.predict(user, item)) ** 2
            count += 1

        return sqrt(total_error / count) if count > 0 else 0

    def predict(self, user_id: int | str, item_id: int | str) -> float:
        """
        Predict a rating, clipped to the range ``[1, 5]``.

        Ids that cannot be converted with ``int()`` or that lie outside the
        fitted parameter tables (negative ids included) yield the global
        mean rating.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        # Check that all model components are initialized
        if (
            self.user_factors is None
            or self.item_factors is None
            or self.user_biases is None
            or self.item_biases is None
            or self.global_mean is None
        ):
            raise RuntimeError("The model has not been trained yet.")

        try:
            user_id = int(user_id)
            item_id = int(item_id)
        except (ValueError, TypeError):
            # If conversion fails, return the global mean rating
            return float(self.global_mean)

        # Unseen ids have no learned parameters; negative ones would wrap
        # around to the end of the arrays, so both get the global mean.
        if not (
            0 <= user_id < self.user_factors.shape[0]
            and 0 <= item_id < self.item_factors.shape[0]
        ):
            return float(self.global_mean)

        dot_product = np.dot(self.user_factors[user_id], self.item_factors[item_id])
        prediction = (
            self.global_mean
            + self.user_biases[user_id]
            + self.item_biases[item_id]
            + dot_product
        )

        # Clip to valid rating range
        return float(np.clip(prediction, 1, 5))
|
||||
Reference in New Issue
Block a user