public code v1

This commit is contained in:
2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
+23
View File
@@ -0,0 +1,23 @@
"""Recommender models exposed as the public API of this package."""

from .als_model import ALS
from .bpr_model import BPR
from .gmf_model import GMFModel
from .emf_model import EMFModel
from .autoencoder_model import ExplAutoencoderTorch
from .mlp_model import MLPModel

# PyTorchModel is defined in ``py_torch_model``; ``emf_model`` merely imports
# it for its own use, so importing it from there would break as soon as that
# module changed its imports.
from .py_torch_model import PyTorchModel
from .knn_basic_model import KNNBasic
from .svd_model import SVD
from .recommender_model import RecommenderModel

# Names re-exported by ``from <package> import *``.
__all__ = [
    "ALS",
    "BPR",
    "GMFModel",
    "EMFModel",
    "PyTorchModel",
    "MLPModel",
    "ExplAutoencoderTorch",
    "KNNBasic",
    "SVD",
    "RecommenderModel",
]
+31
View File
@@ -0,0 +1,31 @@
import implicit
from .mf_implicit_model import MFImplicitModel
class ALS(MFImplicitModel):
    """Alternating Least Squares recommender backed by the ``implicit`` library.

    The hyper-parameters are first stored by the ``MFImplicitModel``
    constructor; the ``implicit`` estimator is then built from the stored
    values.

    Args:
        latent_dim: Number of latent factors.
        reg_term: Regularization strength.
        epochs: Number of ALS iterations.
        random_state: Seed forwarded to the ``implicit`` estimator.
        num_users: Optional user count forwarded to the base class.
        num_items: Optional item count forwarded to the base class.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        latent_dim,
        reg_term,
        epochs,
        random_state=42,
        num_users=None,
        num_items=None,
        **kwargs,
    ):
        # ALS has no learning rate, so the base class receives ``None`` for it.
        base_settings = dict(
            latent_dim=latent_dim,
            reg_term=reg_term,
            epochs=epochs,
            learning_rate=None,
            num_users=num_users,
            num_items=num_items,
        )
        super().__init__(**base_settings)

        estimator_settings = dict(
            factors=self.latent_dim,
            regularization=self.reg_term,
            iterations=self.epochs,
            random_state=random_state,
        )
        self.model = implicit.als.AlternatingLeastSquares(**estimator_settings)
+223
View File
@@ -0,0 +1,223 @@
import numpy as np
import torch
import torch.nn as nn
import torch.optim
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from typing import Optional, Union, List
from pygrex.utils.torch_utils import use_cuda, use_optimizer
from pygrex.data_reader import UserItemDict, DataReader
from .recommender_model import RecommenderModel
class ExplAutoencoderTorch(RecommenderModel, nn.Module):
    """Autoencoder recommender with a neighbourhood-based explainability matrix.

    The network is one linear encoder layer and one linear decoder layer, each
    followed by a ReLU, trained with an MSE reconstruction loss.  ``fit`` also
    builds ``explainability_matrix``: for every user, the summed positive
    ratings that the user's ``knn`` most similar users gave to each item,
    min-max scaled.

    Args:
        hidden_layer_features: Width of the hidden (code) layer.
        learning_rate: Learning rate handed to ``use_optimizer``.
        positive_threshold: Minimum rating for a neighbour's rating to count
            towards explainability.
        weight_decay: Weight decay handed to ``use_optimizer``.
        epochs: Number of training epochs.
        knn: Number of nearest neighbours used for explainability.
        cuda: Whether tensors are moved to the GPU.
        optimizer_name: One of ``"sgd"``, ``"adam"`` or ``"rmsprop"``.
        expl: Flag forwarded to ``UserItemDict``.
        device_id: CUDA device index passed to ``use_cuda``; ``None`` means 0.

    Raises:
        ValueError: If ``optimizer_name`` is not one of the supported names.
    """

    def __init__(
        self,
        hidden_layer_features: int,
        learning_rate: float,
        positive_threshold: float,
        weight_decay: float,
        epochs: int,
        knn: int,
        cuda: bool,
        optimizer_name: str,
        expl: bool,
        device_id: Optional[int] = None,
    ):
        super().__init__()

        if optimizer_name not in ["sgd", "adam", "rmsprop"]:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Wrong optimizer.")

        if cuda:
            use_cuda(True, device_id if device_id is not None else 0)

        self.positive_threshold = positive_threshold
        self.weight_decay = weight_decay
        self.knn = knn
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.use_gpu = cuda
        self.optimizer_name = optimizer_name
        self.hidden_layer_features = hidden_layer_features
        self.expl = expl

        # Populated by fit().
        self.dataset = None
        self.data = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer: Optional[torch.optim.Optimizer] = None
        self.explainability_matrix = None
        # Defined up front so that predict() can report "not fitted" instead of
        # failing with an AttributeError.
        self.user_item_dict = None
        self.sim_users = {}

        self.criterion = nn.MSELoss()

    def fit(self, data: DataReader):
        """Build the network for ``data`` and train it for ``self.epochs`` epochs.

        Args:
            data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.
        """
        self.data = data
        self.dataset = data.dataset
        num_items = self.data.num_item

        self.encoder_hidden_layer = nn.Linear(
            in_features=num_items, out_features=self.hidden_layer_features
        )
        self.decoder_output_layer = nn.Linear(
            in_features=self.hidden_layer_features, out_features=num_items
        )
        if self.use_gpu:
            # Inputs are moved to the GPU in train_single_user()/predict(), so
            # the layers have to live there too, and they must be moved before
            # the optimizer captures the parameters.
            # NOTE(review): assumes use_cuda() only selects the device; if it
            # already places new modules on the GPU this call is a no-op.
            self.cuda()

        self.compute_explainability()

        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )
        assert isinstance(optimizer, torch.optim.Optimizer)
        self.optimizer = optimizer

        with tqdm(total=self.epochs) as progress:
            train_loader = self.instance_a_train_loader()
            for _epoch in range(self.epochs):
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def compute_explainability(self):
        """Fill ``sim_users`` and ``explainability_matrix`` from the dataset.

        Users are compared by cosine similarity of their rating rows.  Entry
        ``[u, i]`` of the matrix is the sum of ratings at or above
        ``positive_threshold`` that the neighbours of ``u`` gave to item ``i``;
        the matrix is then min-max scaled and converted to a tensor.

        NOTE(review): neighbour indices are row positions of the pivoted
        rating table but are matched against ``userId`` values, which presumes
        user ids are exactly ``0..num_user-1`` - confirm against DataReader.
        """
        assert self.dataset is not None
        assert self.data is not None

        ratings_wide = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_wide))
        # One below the smallest similarity, so a user sorts last among the
        # candidate neighbours of themselves.
        min_val = sim_matrix.min() - 1

        num_users = self.data.num_user
        for user in range(num_users):
            sim_matrix[user, user] = min_val
            self.sim_users[user] = (-sim_matrix[user, :]).argsort()[: self.knn]

        self.explainability_matrix = np.zeros((num_users, self.data.num_item))
        positive_ratings = self.dataset[
            self.dataset["rating"] >= self.positive_threshold
        ]

        for user in range(num_users):
            by_neighbours = positive_ratings[
                positive_ratings["userId"].isin(self.sim_users[user])
            ]
            sim_scores = by_neighbours.groupby(by="itemId")["rating"].sum().reset_index()
            # Cast to int so a float-typed itemId column is still a valid index
            # (the EMF models in this package do the same).
            self.explainability_matrix[user, sim_scores.itemId.astype(int)] = (
                sim_scores.rating.to_list()
            )

        self.explainability_matrix = MinMaxScaler().fit_transform(
            self.explainability_matrix
        )
        self.explainability_matrix = torch.from_numpy(self.explainability_matrix)

    def instance_a_train_loader(self):
        """Create the shuffled training loader and store its ``UserItemDict``."""
        assert self.dataset is not None
        assert self.explainability_matrix is not None
        self.user_item_dict = UserItemDict(
            self.dataset, self.explainability_matrix, self.expl
        )
        return DataLoader(self.user_item_dict, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader`` and return the mean batch loss."""
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.Tensor)
            total_loss += self.train_single_user(batch[0].float())
            cnt += 1
        return total_loss / cnt

    def train_single_user(self, ratings):
        """Take one optimizer step on ``ratings`` and return the loss as a float."""
        if self.use_gpu:
            ratings = ratings.cuda()
        assert self.optimizer is not None
        self.optimizer.zero_grad()
        ratings_pred = self(ratings)
        loss = self.criterion(ratings_pred, ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def forward(self, user_adjusted_ratings):
        """Encode and decode a batch of rating rows, applying ReLU after each layer."""
        code = torch.relu(self.encoder_hidden_layer(user_adjusted_ratings))
        return torch.relu(self.decoder_output_layer(code))

    def predict(
        self, user_id: Union[int, List[int], str], item_id: Union[int, List[int], str]
    ) -> Union[float, list]:
        """Return reconstructed ratings for the given users and items.

        Args:
            user_id: One user id (int or numeric string) or a list of ids.
            item_id: One item id (int or numeric string) or a list of ids.

        Returns:
            A single score when both arguments are scalars, a list of scores
            (one per item) for a single user, otherwise one such list per user.

        Raises:
            ValueError: If an id cannot be converted to ``int``.
            RuntimeError: If the model has not been fitted.
        """
        try:
            if isinstance(user_id, str):
                user_id = int(user_id)
            elif isinstance(user_id, list):
                user_id = [int(u) for u in user_id]

            if isinstance(item_id, str):
                item_id = int(item_id)
            elif isinstance(item_id, list):
                item_id = [int(i) for i in item_id]
        except (ValueError, TypeError) as err:
            raise ValueError(
                "User and item IDs must be integers or strings that can be converted to integers."
            ) from err

        single_user = isinstance(user_id, int)
        single_item = isinstance(item_id, int)
        if single_user:
            user_id = [user_id]
        if single_item:
            item_id = [item_id]

        if self.user_item_dict is None:
            raise RuntimeError("The model has not been fitted yet.")

        with torch.no_grad():
            # One input row per requested user, looked up by scalar user id.
            rating = torch.stack([self.user_item_dict[uid] for uid in user_id])
            rating = rating.float()
            if self.use_gpu:
                rating = rating.cuda()
            pred = self.forward(rating).cpu()
            # Always a nested list: one inner list of item scores per user.
            predictions = pred[:, item_id].tolist()

        if single_user and single_item:
            return predictions[0][0]
        if single_user:
            return predictions[0]
        return predictions
+25
View File
@@ -0,0 +1,25 @@
import implicit
from .mf_implicit_model import MFImplicitModel
class BPR(MFImplicitModel):
    """Bayesian Personalized Ranking recommender backed by the ``implicit`` library.

    The hyper-parameters are first stored by the ``MFImplicitModel``
    constructor; the ``implicit`` estimator is then built from the stored
    values.

    Args:
        latent_dim: Number of latent factors.
        reg_term: Regularization strength.
        learning_rate: Learning rate of the estimator.
        epochs: Number of training iterations.
        **kwargs: Accepted and ignored.
    """

    def __init__(self, latent_dim, reg_term, learning_rate, epochs, **kwargs):
        super().__init__(
            latent_dim=latent_dim,
            reg_term=reg_term,
            learning_rate=learning_rate,
            epochs=epochs,
        )

        estimator_settings = dict(
            factors=self.latent_dim,
            learning_rate=self.learning_rate,
            regularization=self.reg_term,
            iterations=self.epochs,
        )
        self.model = implicit.bpr.BayesianPersonalizedRanking(**estimator_settings)
+391
View File
@@ -0,0 +1,391 @@
import numpy as np
import torch
import torch.nn as nn
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from typing import Union
from pygrex.data_reader import UserItemRatingDataset, DataReader
from pygrex.utils import EMFLoss
from .py_torch_model import PyTorchModel
from .recommender_model import RecommenderModel
class EMFModel(RecommenderModel):
    """NumPy matrix factorization with an explainability regularization term.

    User and item embeddings are trained with per-rating stochastic updates.
    Besides the squared-error and L2 terms, each update contains a term
    weighted by ``explainability_matrix[user, item]``, which is derived from
    the positive ratings of the user's nearest neighbours.

    Args:
        learning_rate: Step size of the embedding updates.
        reg_term: Weight of the L2 term.
        expl_reg_term: Weight of the explainability term.
        positive_threshold: Minimum rating for a neighbour's rating to count
            towards explainability.
        latent_dim: Embedding size.
        epochs: Number of passes over the dataset.
        knn: Number of nearest neighbours used for explainability.
    """

    def __init__(
        self,
        learning_rate: float,
        reg_term: float,
        expl_reg_term: float,
        positive_threshold: float,
        latent_dim: int,
        epochs: int,
        knn: int,
    ):
        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs

        # Populated by fit().
        self.dataset = None
        self.data = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer = None

        self.reg_term = reg_term
        self.expl_reg_term = expl_reg_term
        self.positive_threshold = positive_threshold
        self.knn = knn

        self.explainability_matrix = None
        self.sim_users = {}

        self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)
        self.criterion = EMFLoss()

    def fit(self, data: DataReader) -> None:
        """Initialise the embeddings and train them on ``data``.

        Args:
            data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.
        """
        self.data = data
        self.dataset = data.dataset
        assert self.data is not None

        # Both embedding tables start uniformly in [0, 0.5 / latent_dim).
        init_high = 0.5 / self.latent_dim
        self.embedding_user = np.random.uniform(
            low=0, high=init_high, size=(self.data.num_user, self.latent_dim)
        )
        self.embedding_item = np.random.uniform(
            low=0, high=init_high, size=(self.data.num_item, self.latent_dim)
        )

        self.compute_explainability()

        with tqdm(total=self.epochs) as progress:
            assert self.dataset is not None
            for _epoch in range(self.epochs):
                # Visit the ratings in a fresh random order every epoch.
                self.dataset = self.dataset.sample(frac=1)
                squared_errors = []
                for _, row in self.dataset.iterrows():
                    user = int(row.userId)
                    item = int(row.itemId)

                    error = row.rating - self.predict(user, item)
                    squared_errors.append(error**2)

                    assert self.embedding_item is not None
                    assert self.embedding_user is not None
                    assert self.explainability_matrix is not None
                    user_vec = self.embedding_user[user, :]
                    item_vec = self.embedding_item[item, :]
                    expl_weight = (
                        self.expl_reg_term * self.explainability_matrix[user, item]
                    )

                    # Both steps are computed from the current embeddings
                    # before either embedding is modified.
                    # NOTE(review): the explainability term subtracts
                    # sign(item - user) from the user step (and the mirror
                    # image from the item step); confirm this sign against the
                    # intended EMF update rule.
                    step_user = (
                        2 * error * item_vec
                        - self.reg_term * user_vec
                        - np.sign(item_vec - user_vec) * expl_weight
                    )
                    step_item = (
                        2 * error * user_vec
                        - self.reg_term * item_vec
                        - np.sign(user_vec - item_vec) * expl_weight
                    )

                    self.embedding_user[user, :] += self.learning_rate * step_user
                    self.embedding_item[item, :] += self.learning_rate * step_item

                progress.update(1)
                progress.set_postfix({"MSE": sum(squared_errors) / len(squared_errors)})

    def compute_explainability(self):
        """Fill ``sim_users`` and ``explainability_matrix`` from the dataset.

        Users are compared by cosine similarity of their rating rows.  Entry
        ``[u, i]`` of the matrix is the sum of ratings at or above
        ``positive_threshold`` that the neighbours of ``u`` gave to item ``i``;
        the matrix is finally min-max scaled.
        """
        assert self.dataset is not None
        ratings_wide = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_wide))
        # One below the smallest similarity, so a user sorts last among the
        # candidate neighbours of themselves.
        below_min = sim_matrix.min() - 1

        assert self.data is not None
        num_users = self.data.num_user
        for user in range(num_users):
            sim_matrix[user, user] = below_min
            self.sim_users[user] = (-sim_matrix[user, :]).argsort()[: self.knn]

        self.explainability_matrix = np.zeros((num_users, self.data.num_item))
        positive_ratings = self.dataset[
            self.dataset["rating"] >= self.positive_threshold
        ]

        for user in range(num_users):
            by_neighbours = positive_ratings[
                positive_ratings["userId"].isin(self.sim_users[user])
            ]
            item_totals = by_neighbours.groupby(by="itemId")["rating"].sum().reset_index()
            self.explainability_matrix[user, item_totals.itemId.astype(int)] = (
                item_totals.rating.to_list()
            )

        self.explainability_matrix = MinMaxScaler().fit_transform(
            self.explainability_matrix
        )

    def predict(
        self, user_id: Union[int, str], item_id: Union[int, str]
    ) -> Union[float, list]:
        """Score user/item pairs as the dot product of their embeddings.

        Args:
            user_id: A single id, a list of ids or a NumPy array of ids.
            item_id: A single id, a list of ids or a NumPy array of ids.

        Returns:
            The dot product for a scalar pair.  If either argument is a list,
            a list with one row of item scores per user, flattened when there
            is only one user or only one item.
        """
        users = user_id.tolist() if isinstance(user_id, np.ndarray) else user_id
        items = item_id.tolist() if isinstance(item_id, np.ndarray) else item_id

        # Scalar pair: a single dot product.
        if not (isinstance(users, list) or isinstance(items, list)):
            assert self.embedding_user is not None
            assert self.embedding_item is not None
            return np.dot(
                self.embedding_user[int(user_id), :],
                self.embedding_item[int(item_id), :],
            )

        user_list = users if isinstance(users, list) else [users]
        item_list = items if isinstance(items, list) else [items]

        score_rows = []
        for u in user_list:
            assert self.embedding_user is not None
            assert self.embedding_item is not None
            score_rows.append(
                [
                    np.dot(
                        self.embedding_user[int(u), :], self.embedding_item[int(i), :]
                    )
                    for i in item_list
                ]
            )

        scores = np.array(score_rows)
        if len(user_list) == 1 or len(item_list) == 1:
            scores = scores.flatten()
        return scores.tolist()

    def user_embedding(self):
        """Return the user embedding matrix (``None`` before ``fit``)."""
        return self.embedding_user

    def item_embedding(self):
        """Return the item embedding matrix (``None`` before ``fit``)."""
        return self.embedding_item
class EMFTorchModel(PyTorchModel):
    """PyTorch matrix factorization trained with ``EMFLoss``.

    Ratings are predicted by a linear layer applied to the element-wise
    product of user and item embeddings.  The loss additionally receives the
    explainability weights of the batch, taken from ``explainability_matrix``.

    Args:
        learning_rate: Learning rate of the SGD optimizer.
        reg_term: Regularization weight passed to the loss.
        expl_reg_term: Explainability weight passed to the loss.
        positive_threshold: Minimum rating for a neighbour's rating to count
            towards explainability.
        momentum: Momentum of the SGD optimizer.
        weight_decay: Weight decay of the SGD optimizer.
        latent_dim: Embedding size.
        epochs: Number of training epochs.
        batch_size: Batch size of the training loader.
        knn: Number of nearest neighbours used for explainability.
        cuda: Forwarded to ``PyTorchModel``.
        optimizer_name: Forwarded to ``PyTorchModel``.
        device_id: Forwarded to ``PyTorchModel``.
    """

    def __init__(
        self,
        learning_rate: float,
        reg_term: float,
        expl_reg_term: float,
        positive_threshold: float,
        momentum: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        knn: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        base_settings = dict(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )
        super().__init__(**base_settings)

        self.reg_term = reg_term
        self.expl_reg_term = expl_reg_term
        self.positive_threshold = positive_threshold
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.knn = knn

        self.explainability_matrix = None
        self.sim_users = {}

        self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)
        self.criterion = EMFLoss()

    def fit(self, data: DataReader) -> None:
        """Create the embeddings, build the explainability matrix and train.

        Args:
            data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.
        """
        self.data = data
        self.dataset = data.dataset
        assert self.data is not None

        self.embedding_user = nn.Embedding(
            num_embeddings=self.data.num_user, embedding_dim=self.latent_dim
        )
        self.embedding_item = nn.Embedding(
            num_embeddings=self.data.num_item, embedding_dim=self.latent_dim
        )

        self.compute_explainability()

        # The optimizer is created only after the embeddings exist, so that
        # their parameters are part of ``self.parameters()``.
        self.optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
        )

        with tqdm(total=self.epochs) as progress:
            for _epoch in range(self.epochs):
                epoch_loader = self.instance_a_train_loader(self.batch_size)
                epoch_loss = self.train_an_epoch(epoch_loader)
                progress.update(1)
                progress.set_postfix({"loss": epoch_loss})

    def compute_explainability(self):
        """Fill ``sim_users`` and ``explainability_matrix`` from the dataset.

        Users are compared by cosine similarity of their rating rows.  Entry
        ``[u, i]`` of the matrix is the sum of ratings at or above
        ``positive_threshold`` that the neighbours of ``u`` gave to item ``i``;
        the matrix is min-max scaled and converted to a tensor.
        """
        assert self.dataset is not None
        ratings_wide = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_wide))
        # One below the smallest similarity, so a user sorts last among the
        # candidate neighbours of themselves.
        below_min = sim_matrix.min() - 1

        assert self.data is not None
        num_users = self.data.num_user
        for user in range(num_users):
            sim_matrix[user, user] = below_min
            self.sim_users[user] = (-sim_matrix[user, :]).argsort()[: self.knn]

        self.explainability_matrix = np.zeros((num_users, self.data.num_item))
        positive_ratings = self.dataset[
            self.dataset["rating"] >= self.positive_threshold
        ]

        for user in range(num_users):
            by_neighbours = positive_ratings[
                positive_ratings["userId"].isin(self.sim_users[user])
            ]
            item_totals = by_neighbours.groupby(by="itemId")["rating"].sum().reset_index()
            self.explainability_matrix[user, item_totals.itemId.astype(int)] = (
                item_totals.rating.to_list()
            )

        scaled = MinMaxScaler().fit_transform(self.explainability_matrix)
        self.explainability_matrix = torch.from_numpy(scaled)

    def instance_a_train_loader(self, batch_size):
        """Wrap the dataset's user, item and rating columns in a shuffled loader."""
        assert self.dataset is not None
        frame = self.dataset
        triples = UserItemRatingDataset(
            user_tensor=torch.LongTensor(frame.userId.values),
            item_tensor=torch.LongTensor(frame.itemId.values),
            target_tensor=torch.FloatTensor(frame.rating.values),
        )
        return DataLoader(triples, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Train on every batch of ``train_loader`` and return the mean batch loss."""
        self.train()
        batch_losses = []
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            users, items, ratings = batch[0], batch[1], batch[2]
            batch_losses.append(self.train_single_batch(users, items, ratings.float()))
        return sum(batch_losses) / len(batch_losses)

    def train_single_batch(self, users, items, ratings):
        """Take one optimizer step on a batch and return the loss as a float."""
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()

        assert self.optimizer is not None
        self.optimizer.zero_grad()
        predicted = self(users, items)

        assert self.embedding_user is not None
        user_vectors = self.embedding_user(users)
        assert self.embedding_item is not None
        item_vectors = self.embedding_item(items)
        assert self.explainability_matrix is not None

        batch_loss = self.criterion(
            ratings_pred=predicted,
            ratings=ratings,
            u=user_vectors,
            v=item_vectors,
            reg_term=self.reg_term,
            expl=self.explainability_matrix[users, items],
            expl_reg_term=self.expl_reg_term,
        )
        batch_loss.backward()
        self.optimizer.step()
        return batch_loss.item()

    def forward(self, user_indices, item_indices):
        """Return the linear output of the element-wise embedding product."""
        assert self.embedding_user is not None
        user_vectors = self.embedding_user(user_indices)
        assert self.embedding_item is not None
        item_vectors = self.embedding_item(item_indices)
        return self.affine_output(torch.mul(user_vectors, item_vectors))
+165
View File
@@ -0,0 +1,165 @@
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer
from tqdm.auto import tqdm
from pygrex.data_reader import DataReader, UserItemRatingDataset
from pygrex.utils.torch_utils import use_optimizer
from .py_torch_model import PyTorchModel
class GMFModel(PyTorchModel):
    """Generalized Matrix Factorization trained with sampled negatives.

    A score is the sigmoid of a linear layer applied to the element-wise
    product of user and item embeddings.  Every observed rating is paired with
    ``num_negative`` items the user has not interacted with, which get a
    target of 0; the model is trained with binary cross-entropy.

    Args:
        learning_rate: Learning rate handed to ``use_optimizer``.
        weight_decay: Weight decay handed to ``use_optimizer``.
        latent_dim: Embedding size.
        epochs: Number of training epochs.
        num_negative: Negative items drawn per observed rating.
        batch_size: Batch size of the training loader.
        cuda: Forwarded to ``PyTorchModel``.
        optimizer_name: Forwarded to ``PyTorchModel``.
        device_id: Forwarded to ``PyTorchModel``.
    """

    def __init__(
        self,
        learning_rate: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        num_negative: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )

        self.negative_sample_size = num_negative
        self.weight_decay = weight_decay
        self.optimizer: Optimizer | None = None

        self.affine_output = torch.nn.Linear(
            in_features=self.latent_dim, out_features=1
        )
        self.logistic = torch.nn.Sigmoid()
        self.criterion = nn.BCELoss()

    def fit(self, data: DataReader):
        """Create the embeddings and train the model on ``data``.

        Args:
            data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.

        Raises:
            TypeError: If ``use_optimizer`` does not return an ``Optimizer``.
        """
        dataset = data.dataset
        num_users = data.num_user
        num_items = data.num_item

        # The embeddings must exist BEFORE the optimizer is built: an optimizer
        # only updates the parameters the network owns at construction time,
        # so creating them afterwards would leave them untrained.
        self.embedding_user = torch.nn.Embedding(
            num_embeddings=num_users, embedding_dim=self.latent_dim
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=num_items, embedding_dim=self.latent_dim
        )

        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )
        if not isinstance(optimizer, Optimizer):
            raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
        self.optimizer = optimizer

        self.negatives = self._sample_negative(dataset)

        with tqdm(total=self.epochs) as progress:
            for _epoch in range(self.epochs):
                # Negatives are re-drawn for every epoch.
                train_loader = self.instance_a_train_loader(
                    dataset, self.negative_sample_size, self.batch_size
                )
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def instance_a_train_loader(self, dataset, num_negatives, batch_size):
        """Build the training loader for one epoch.

        Each row of ``dataset`` contributes its own (user, item, rating)
        triple followed by up to ``num_negatives`` triples with freshly sampled
        negative items and a rating of 0.

        Args:
            dataset: Frame with ``userId``, ``itemId`` and ``rating`` columns.
            num_negatives: Negative items to draw per row.
            batch_size: Batch size of the returned loader.

        Returns:
            A shuffled ``DataLoader`` over a ``UserItemRatingDataset``.
        """
        train_ratings = pd.merge(
            dataset, self.negatives[["userId", "negative_items"]], on="userId"
        )

        def draw_negatives(pool):
            candidates = list(pool)
            # random.sample raises ValueError when asked for more elements than
            # available, e.g. for a user who interacted with almost everything.
            return random.sample(candidates, min(num_negatives, len(candidates)))

        train_ratings["negatives"] = train_ratings["negative_items"].apply(
            draw_negatives
        )

        users, items, ratings = [], [], []
        for user, item, rating, negatives in zip(
            train_ratings["userId"].tolist(),
            train_ratings["itemId"].tolist(),
            train_ratings["rating"].tolist(),
            train_ratings["negatives"].tolist(),
        ):
            users.append(user)
            items.append(item)
            ratings.append(rating)
            # Negative samples get a rating of 0.
            users.extend([user] * len(negatives))
            items.extend(negatives)
            ratings.extend([float(0)] * len(negatives))

        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Train on every batch of ``train_loader`` and return the mean batch loss."""
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            total_loss += self.train_single_batch(user, item, rating.float())
            cnt += 1
        return total_loss / cnt

    def train_single_batch(self, users, items, ratings):
        """Take one optimizer step on a batch and return the loss as a float.

        Raises:
            RuntimeError: If no optimizer has been set up by ``fit``.
        """
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        if self.optimizer is None:
            raise RuntimeError(
                "Optimizer is not initialized. Call fit() before training."
            )
        self.optimizer.zero_grad()
        ratings_pred = self(users, items)
        # Predictions are flattened before being compared with the targets.
        loss = self.criterion(ratings_pred.view(-1), ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _sample_negative(self, ratings):
        """Return, per user, the set of items the user has not interacted with.

        Also stores the set of all item ids found in ``ratings`` as
        ``self.item_catalogue``.

        Returns:
            A frame with the columns ``userId`` and ``negative_items``.
        """
        interact_status = (
            ratings.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        self.item_catalogue = set(ratings.itemId)
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda seen: self.item_catalogue - seen
        )
        return interact_status[["userId", "negative_items"]]

    def forward(self, user_indices, item_indices):
        """Return the sigmoid score for each (user, item) index pair."""
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_product = torch.mul(user_embedding, item_embedding)
        return self.logistic(self.affine_output(element_product))
+22
View File
@@ -0,0 +1,22 @@
import torch.nn as nn
class Item2Vec(nn.Module):
    """Item embedding table followed by a linear layer back onto the item space.

    Args:
        config: Mapping that provides ``num_items`` and ``latent_dim``.
    """

    def __init__(self, config):
        super().__init__()
        num_items, latent_dim = config['num_items'], config['latent_dim']
        self.num_items = num_items
        self.latent_dim = latent_dim

        self.embedding = nn.Embedding(
            num_embeddings=num_items, embedding_dim=latent_dim
        )
        self.fc = nn.Linear(in_features=latent_dim, out_features=num_items)

    def forward(self, input_data):
        """Embed ``input_data`` and project it to one output per item."""
        return self.fc(self.embedding(input_data))

    def item_embedding(self):
        """Return the embedding weights, detached from the autograd graph."""
        return self.embedding.weight.detach()
+240
View File
@@ -0,0 +1,240 @@
from typing import Optional, Union
import numpy as np
import scipy.sparse as sp
from .recommender_model import RecommenderModel
from pygrex.data_reader import DataReader
class KNNBasic(RecommenderModel):
    """
    User-based K-Nearest Neighbors collaborative filtering model.

    Predictions are a baseline estimate (global mean + user bias + item bias)
    corrected by the similarity-weighted deviations of the most similar users
    who rated the item.  Similarities are computed on co-rated items only and
    damped when two users share few items.

    Args:
        k (int): Number of neighbors to consider. Default 50.
        min_k (int): Minimum number of neighbors required for prediction. Default 3.
        sim_options (dict): Similarity options.  ``"name"`` selects
            ``"pearson"`` (default) or ``"cosine"``; ``"user_based"`` must not
            be ``False``.

    Raises:
        NotImplementedError: For item-based filtering or an unknown similarity.
    """

    def __init__(self, k: int = 50, min_k: int = 3, sim_options: Optional[dict] = None):
        super().__init__()
        self.k = k
        self.min_k = min_k
        self.sim_options = sim_options if sim_options is not None else {}

        # Validate similarity options
        if self.sim_options.get("user_based", True) is False:
            raise NotImplementedError("Only the user-based approach is implemented.")

        sim_name = self.sim_options.get("name", "pearson").lower()
        if sim_name not in ["cosine", "pearson"]:
            raise NotImplementedError(
                "Only cosine and pearson similarity are implemented."
            )
        # Keep the validated measure so that the similarity computation can
        # honour it instead of always falling back to Pearson.
        self.sim_name = sim_name

        # Model attributes, populated by fit()
        self.trainset: Optional[sp.csr_matrix] = None
        self.global_mean: float = 0
        self.user_biases: Optional[np.ndarray] = None
        self.item_biases: Optional[np.ndarray] = None
        self.num_users: Optional[int] = None
        self.num_items: Optional[int] = None

        # Per-user average rating, stored by fit()
        self.user_means: Optional[np.ndarray] = None

    def fit(self, data: DataReader) -> None:
        """
        Build the sparse rating matrix and the global/user/item biases.

        Args:
            data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.
        """
        print("Fitting the improved KNNBasic model...")
        df = data.dataset
        self.num_users = data.num_user
        self.num_items = data.num_item

        print(
            f"Building ratings matrix for {self.num_users} users and {self.num_items} items..."
        )

        # 1. Build the sparse user-item ratings matrix
        ratings = df["rating"].values
        rows = df["userId"].values
        cols = df["itemId"].values
        self.trainset = sp.csr_matrix(
            (ratings, (rows, cols)), shape=(self.num_users, self.num_items)
        )

        # 2. Calculate global mean and biases
        print("Computing biases...")
        self.global_mean = self.trainset.data.mean()

        # User biases: bu = avg(ratings_u) - global_mean
        user_sums = np.array(self.trainset.sum(axis=1)).flatten()
        user_counts = np.diff(self.trainset.indptr)
        # Users without ratings divide by zero; np.where replaces the result.
        with np.errstate(divide="ignore", invalid="ignore"):
            user_avg_ratings = np.where(
                user_counts > 0, user_sums / user_counts, self.global_mean
            )
        self.user_biases = np.where(
            user_counts > 0, user_avg_ratings - self.global_mean, 0
        )

        # Item biases: bi = avg(ratings_i) - global_mean
        item_sums = np.array(self.trainset.sum(axis=0)).flatten()
        item_counts = np.diff(self.trainset.tocsc().indptr)
        with np.errstate(divide="ignore", invalid="ignore"):
            item_avg_ratings = np.where(
                item_counts > 0, item_sums / item_counts, self.global_mean
            )
        self.item_biases = np.where(
            item_counts > 0, item_avg_ratings - self.global_mean, 0
        )

        self.user_means = user_avg_ratings

        print("Model fitting complete.")

    def _compute_user_similarity(self, user1_id: int, user2_id: int) -> float:
        """
        Compute the configured similarity between two users on co-rated items.

        Pearson mean-centers both users' co-rated ratings before taking the
        normalized dot product; cosine uses the raw ratings.  The result is
        multiplied by ``min(n_common / 50, 1)`` so that similarities based on
        few co-rated items count less.

        Returns:
            0.0 when fewer than two items are co-rated or a norm is zero,
            otherwise the damped similarity.
        """
        assert self.trainset is not None

        user1_ratings = self.trainset[user1_id].toarray().flatten()
        user2_ratings = self.trainset[user2_id].toarray().flatten()

        # A stored rating of 0 is indistinguishable from "not rated".
        mask = (user1_ratings > 0) & (user2_ratings > 0)
        n_common = np.sum(mask)

        # Need at least 2 common ratings for a meaningful similarity
        if n_common < 2:
            return 0.0

        u1_vec = user1_ratings[mask]
        u2_vec = user2_ratings[mask]

        # getattr keeps instances created without __init__ on the old default.
        if getattr(self, "sim_name", "pearson") == "pearson":
            u1_vec = u1_vec - np.mean(u1_vec)
            u2_vec = u2_vec - np.mean(u2_vec)

        numerator = np.sum(u1_vec * u2_vec)
        denom1 = np.sqrt(np.sum(u1_vec**2))
        denom2 = np.sqrt(np.sum(u2_vec**2))

        if denom1 == 0 or denom2 == 0:
            return 0.0

        similarity = numerator / (denom1 * denom2)

        # More common items = more reliable similarity; capped at 50 items.
        significance_weight = min(n_common / 50.0, 1.0)

        return similarity * significance_weight

    def _get_neighbors_for_item(self, user_id: int, item_id: int):
        """
        Get the top-k most similar users who have rated the given item.

        Returns:
            Three arrays (similarities, neighbor ids, neighbor ratings), all
            empty when fewer than ``min_k`` neighbors are available.
        """
        assert self.trainset is not None
        item_col = self.trainset[:, item_id]  # type: ignore
        neighbor_candidates, _ = item_col.nonzero()

        # The target user is not their own neighbor.
        neighbor_candidates = neighbor_candidates[neighbor_candidates != user_id]

        if len(neighbor_candidates) == 0:
            return np.array([]), np.array([]), np.array([])

        similarities = [
            (self._compute_user_similarity(user_id, neighbor_id), neighbor_id)
            for neighbor_id in neighbor_candidates
        ]

        # Sort by similarity and take top-k
        similarities.sort(key=lambda pair: pair[0], reverse=True)
        top_k = similarities[: min(self.k, len(similarities))]

        if len(top_k) < self.min_k:
            return np.array([]), np.array([]), np.array([])

        neighbor_sims = np.array([sim for sim, _ in top_k])
        neighbor_ids = np.array([nid for _, nid in top_k])
        neighbor_ratings = np.array(
            [self.trainset[nid, item_id] for nid in neighbor_ids]
        )

        return neighbor_sims, neighbor_ids, neighbor_ratings

    def predict(self, user_id: Union[int, str], item_id: Union[int, str]) -> float:
        """
        Predict rating for a user-item pair using KNN.

        Ids outside the trained ranges yield the global mean.  When no usable
        neighbors exist the baseline estimate is returned unclipped; otherwise
        the prediction is clipped to the range 1.0-5.0.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        if self.trainset is None:
            raise RuntimeError("Model must be trained first using fit() method.")

        assert self.num_users is not None
        assert self.num_items is not None
        assert self.user_biases is not None
        assert self.item_biases is not None

        user_id = int(user_id)
        item_id = int(item_id)

        # Negative ids are rejected too: they would otherwise wrap around and
        # silently address a different user or item.
        if not (0 <= user_id < self.num_users and 0 <= item_id < self.num_items):
            return float(self.global_mean)

        # 1. Calculate baseline estimate
        baseline = float(
            self.global_mean + self.user_biases[user_id] + self.item_biases[item_id]
        )

        # 2. Get neighbors who rated this item
        neighbor_sims, neighbor_ids, neighbor_ratings = self._get_neighbors_for_item(
            user_id, item_id
        )

        if len(neighbor_ids) == 0:
            return baseline

        # 3. Calculate weighted prediction from each neighbor's deviation from
        #    their own baseline
        neighbor_baselines = (
            self.global_mean + self.user_biases[neighbor_ids] + self.item_biases[item_id]
        )
        deviations = neighbor_ratings - neighbor_baselines

        # Only use neighbors with positive similarity
        positive_mask = neighbor_sims > 0
        if not np.any(positive_mask):
            return baseline

        neighbor_sims = neighbor_sims[positive_mask]
        deviations = deviations[positive_mask]

        numerator = np.sum(neighbor_sims * deviations)
        denominator = np.sum(np.abs(neighbor_sims))

        if denominator == 0:
            return baseline

        prediction = baseline + (numerator / denominator)

        # Clip to the rating range and hand back a plain Python float.
        return float(np.clip(prediction, 1.0, 5.0))
+136
View File
@@ -0,0 +1,136 @@
import numpy as np
import scipy
from typing import Union, Protocol, runtime_checkable
from implicit.recommender_base import RecommenderBase
from .recommender_model import RecommenderModel
from pygrex.data_reader import DataReader
@runtime_checkable
class FittableImplicitModel(Protocol):
    """Structural type of an estimator that exposes factors and a ``fit`` method.

    Being ``runtime_checkable``, ``isinstance`` only verifies that the three
    members exist; it does not check their types or values.
    """

    # Latent factor matrices read by the wrapping recommender.
    user_factors: np.ndarray
    item_factors: np.ndarray

    def fit(self, item_user_data) -> None:
        """Train the estimator on the given interaction matrix."""
class MFImplicitModel(RecommenderModel):
    """Common wrapper around matrix-factorization estimators of ``implicit``.

    Subclasses assign the concrete estimator to ``self.model``.  This class
    converts the dataset into a sparse interaction matrix, trains the
    estimator and scores user/item pairs as dot products of the learned
    factors.

    Args:
        latent_dim: Number of latent factors.
        reg_term: Regularization strength.
        learning_rate: Learning rate (``None`` where not applicable).
        epochs: Number of training iterations.
        num_users: Initial value of ``total_users``; replaced by ``fit``.
        num_items: Initial value of ``total_items``; replaced by ``fit``.
    """

    def __init__(
        self,
        latent_dim,
        reg_term,
        learning_rate,
        epochs,
        num_users=None,
        num_items=None,
    ):
        self.latent_dim = latent_dim
        self.reg_term = reg_term
        self.learning_rate = learning_rate
        self.epochs = epochs
        # Assigned by the subclass constructor.
        self.model: Union[RecommenderBase, FittableImplicitModel, None] = None
        self.total_users = num_users
        self.total_items = num_items

    def fit(self, data: DataReader) -> None:
        """Train the wrapped estimator on the interactions in ``data``.

        The matrix dimensions are the largest user/item id in the dataset
        plus one, which also overwrites ``total_users`` and ``total_items``.

        Raises:
            RuntimeError: If no estimator has been assigned to ``self.model``.
        """
        if self.model is None:
            raise RuntimeError(
                "The model has not been initialized. Please use a specific subclass like ALS or BPR."
            )

        num_user_for_shape = data.dataset["userId"].max() + 1
        num_item_for_shape = data.dataset["itemId"].max() + 1

        self.total_users = num_user_for_shape
        self.total_items = num_item_for_shape

        # NOTE(review): the matrix handed to the estimator is transposed to
        # items x users; confirm that this orientation is what the installed
        # ``implicit`` version expects from ``fit``.
        item_user_data = self.rearrange_dataset(
            ds=data.dataset,
            num_user=num_user_for_shape,
            num_item=num_item_for_shape,
        ).T.tocsr()

        self.model.fit(item_user_data)

    @staticmethod
    def rearrange_dataset(ds, num_user: int, num_item: int) -> scipy.sparse.csr_matrix:
        """
        Converts the dataset into a sparse matrix format for the implicit model.

        Args:
            ds: Dataset containing userId and itemId columns
            num_user : Number of rows (users) of the matrix
            num_item : Number of columns (items) of the matrix

        Returns:
            ds_mtr: users x items CSR matrix with a 1 for each interaction
        """
        interaction_values = np.ones(len(ds))  # one entry per interaction
        user_rows = ds["userId"].values
        item_cols = ds["itemId"].values
        return scipy.sparse.csr_matrix(
            (interaction_values, (user_rows, item_cols)), shape=(num_user, num_item)
        )

    def _require_fitted_model(self) -> FittableImplicitModel:
        """Return ``self.model`` if it holds learned factors, else raise.

        The protocol check alone only verifies that the factor attributes
        exist, so an estimator whose factors are still ``None`` would pass it;
        the factors are therefore checked explicitly as well.
        """
        model = self.model
        if (
            not isinstance(model, FittableImplicitModel)
            or model.user_factors is None
            or model.item_factors is None
        ):
            raise RuntimeError(
                "The model has not been trained yet. Please call fit() first."
            )
        return model

    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int, list, np.ndarray]
    ) -> Union[float, list]:
        """
        Predict scores for a user and one or more items.

        Args:
            user_id : User identifier
            item_id : Item identifier or a list/array of item identifiers

        Returns:
            A single score (float) for a single item, otherwise a list of
            scores in the order of ``item_id``.

        Raises:
            RuntimeError: If the model has not been trained.
            ValueError: If the user id or any item id is out of bounds.
        """
        model = self._require_fitted_model()

        user_id = int(user_id)

        # 1. Validate user_id
        if not (0 <= user_id < model.user_factors.shape[0]):
            raise ValueError(f"user_id {user_id} is out of bounds")

        # 2. Unify input to always be a numpy array
        is_single_item = not isinstance(item_id, (list, np.ndarray))
        item_ids_arr = np.array(item_id, ndmin=1).astype(int)

        # 3. Bounds-check all items at once and report the first offender
        max_item_id = model.item_factors.shape[0]
        out_of_bounds = (item_ids_arr < 0) | (item_ids_arr >= max_item_id)
        if np.any(out_of_bounds):
            out_of_bounds_id = item_ids_arr[out_of_bounds][0]
            raise ValueError(f"item_id {out_of_bounds_id} is out of bounds")

        # 4. One dot product of the user vector with all requested item vectors
        item_vectors = model.item_factors[item_ids_arr]
        user_vector = model.user_factors[user_id]
        scores = user_vector.dot(item_vectors.T)

        # 5. A single float for a single item, otherwise a list
        return scores[0].item() if is_single_item else scores.tolist()

    def user_embedding(self) -> np.ndarray:
        """Return the learned user factors.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        return self._require_fitted_model().user_factors

    def item_embedding(self) -> np.ndarray:
        """Return the learned item factors.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        return self._require_fitted_model().item_factors
+179
View File
@@ -0,0 +1,179 @@
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer
from tqdm.auto import tqdm
from pygrex.data_reader import DataReader, UserItemRatingDataset
from pygrex.utils.torch_utils import use_optimizer
from .py_torch_model import PyTorchModel
class MLPModel(PyTorchModel):
    """Neural recommender scoring concatenated user and item embeddings.

    The user and item embeddings are concatenated, passed through one linear
    layer and squashed with a sigmoid. Training uses binary cross-entropy on
    the observed interactions plus randomly drawn negative items.
    """

    def __init__(
        self,
        learning_rate: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        num_negative: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        """Create the output layer and loss; embeddings are built in ``fit``.

        Args:
            learning_rate: Optimizer learning rate.
            weight_decay: Weight decay passed to the optimizer.
            latent_dim: Size of each embedding vector.
            epochs: Number of training epochs.
            num_negative: Negative items drawn per observed interaction.
            batch_size: Mini-batch size.
            cuda: Whether tensors are moved to the GPU.
            optimizer_name: Name of the optimizer to build.
            device_id: Optional CUDA device id.
        """
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )
        self.negative_sample_size = num_negative
        self.weight_decay = weight_decay

        # Input width is 2 * latent_dim because the embeddings are concatenated.
        self.affine_output = torch.nn.Linear(
            in_features=2 * self.latent_dim, out_features=1
        )
        self.logistic = torch.nn.Sigmoid()
        self.criterion = nn.BCELoss()
        self.optimizer: Optimizer | None = None

    def fit(self, data: DataReader):
        """Build the embeddings and optimizer, then train for ``epochs`` epochs.

        Raises:
            TypeError: If ``use_optimizer`` does not return an ``Optimizer``.
        """
        dataset = data.dataset

        # The embedding layers must exist (and sit on their final device)
        # before the optimizer is built, so that the optimizer is handed
        # the complete set of parameters.
        self.embedding_user = torch.nn.Embedding(
            num_embeddings=data.num_user, embedding_dim=self.latent_dim
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=data.num_item, embedding_dim=self.latent_dim
        )
        if self._cuda:
            # Keep the parameters on the same device as the batches that
            # train_single_batch() and predict() move to the GPU.
            self.cuda()

        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )
        if not isinstance(optimizer, Optimizer):
            raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
        self.optimizer = optimizer

        self.negatives = self._sample_negative(dataset)

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                # Negatives are re-drawn for every epoch.
                train_loader = self.instance_a_train_loader(
                    dataset, self.negative_sample_size, self.batch_size
                )
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def instance_a_train_loader(self, dataset, num_negatives, batch_size):
        """Build the shuffled training loader for one epoch.

        Every observed row contributes its own rating plus up to
        ``num_negatives`` randomly chosen non-interacted items with a
        target of 0. Fewer negatives are drawn when the user does not have
        that many non-interacted items.
        """
        train_ratings = pd.merge(
            dataset, self.negatives[["userId", "negative_items"]], on="userId"
        )

        users, items, ratings = [], [], []
        for user, item, rating, candidates in zip(
            train_ratings["userId"].tolist(),
            train_ratings["itemId"].tolist(),
            train_ratings["rating"].tolist(),
            train_ratings["negative_items"].tolist(),
        ):
            users.append(user)
            items.append(item)
            ratings.append(rating)

            pool = list(candidates)
            # Clamp the sample size: random.sample raises ValueError when
            # asked for more elements than the population holds.
            for neg_item in random.sample(pool, min(num_negatives, len(pool))):
                users.append(user)
                items.append(neg_item)
                ratings.append(float(0))  # negative samples get 0 rating

        epoch_dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(epoch_dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Returns ``0.0`` when the loader yields no batches.

        Raises:
            TypeError: If a batch's user ids are not an int64 tensor.
        """
        self.train()
        batch_count = 0
        total_loss = 0
        for batch in train_loader:
            user, item, rating = batch[0], batch[1], batch[2]
            if user.dtype != torch.long:
                raise TypeError(f"Expected int64 user ids, but got {user.dtype}")
            total_loss += self.train_single_batch(user, item, rating.float())
            batch_count += 1
        return total_loss / batch_count if batch_count else 0.0

    def train_single_batch(self, users, items, ratings):
        """Run one optimisation step and return the loss as a Python float.

        Raises:
            RuntimeError: If the optimizer has not been created by ``fit``.
        """
        # ``self.cuda`` is the nn.Module method; the flag lives in ``_cuda``.
        if self._cuda:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        if self.optimizer is None:
            raise RuntimeError(
                "Optimizer is not initialized. Call fit() before training."
            )
        self.optimizer.zero_grad()
        ratings_pred = self(users, items)
        loss = self.criterion(ratings_pred.view(-1), ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _sample_negative(self, ratings):
        """Return, per user, the set of catalogue items they never interacted with.

        Also stores the full item catalogue in ``self.item_catalogue``.
        """
        interact_status = (
            ratings.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        self.item_catalogue = set(ratings.itemId)
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda x: self.item_catalogue - x
        )
        return interact_status[["userId", "negative_items"]]

    def forward(self, user_indices, item_indices):
        """Return sigmoid scores for the given user/item index tensors.

        A single user can be paired with many items (or the reverse); the
        singleton side is repeated to match the other.
        """
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)

        # Ensure embeddings are 2D [batch_size, embedding_dim]
        if user_embedding.dim() == 3:
            user_embedding = user_embedding.squeeze(1)
        if item_embedding.dim() == 3:
            item_embedding = item_embedding.squeeze(1)

        # This is needed because cat does not support broadcasting.
        if user_embedding.size(0) == 1 and item_embedding.size(0) > 1:
            user_embedding = user_embedding.repeat(item_embedding.size(0), 1)
        elif item_embedding.size(0) == 1 and user_embedding.size(0) > 1:
            item_embedding = item_embedding.repeat(user_embedding.size(0), 1)

        element_concat = torch.cat((user_embedding, item_embedding), 1)
        return self.logistic(self.affine_output(element_concat))
+69
View File
@@ -0,0 +1,69 @@
import itertools
from typing import Union
import torch
from pygrex.utils.torch_utils import use_cuda
from .recommender_model import RecommenderModel
from pygrex.data_reader import DataReader
class PyTorchModel(RecommenderModel, torch.nn.Module):
    """Common base for the PyTorch recommenders.

    It stores the training hyper-parameters and provides ``predict`` and the
    embedding accessors. Subclasses are expected to create
    ``embedding_user`` / ``embedding_item`` and to implement ``forward`` and
    ``fit``.
    """

    def __init__(
        self,
        learning_rate: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id: Union[int, None] = None,
    ):
        """Validate the optimizer name and store the hyper-parameters.

        Raises:
            ValueError: If ``optimizer_name`` is not "sgd", "adam" or "rmsprop".
        """
        if optimizer_name not in ("sgd", "adam", "rmsprop"):
            raise ValueError("Wrong optimizer.")

        # Initialise nn.Module first so attribute registration is in place
        # before anything is assigned on the instance.
        super().__init__()

        if cuda is True and device_id is not None:
            use_cuda(True, device_id)

        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        # Stored as ``_cuda`` because ``cuda`` is an nn.Module method.
        self._cuda = cuda
        self.optimizer_name = optimizer_name

        self.dataset = None
        self.dataset_metadata = None

        self.embedding_user = None
        self.embedding_item = None
        self.optimizer = None

    def fit(self, data: DataReader):
        """No-op placeholder; subclasses provide the training loop."""
        pass

    @staticmethod
    def _as_index_tensor(ids) -> torch.Tensor:
        """Convert one id or a collection of ids into an int64 tensor.

        Scalars (Python ints, NumPy integer scalars, numeric strings) become
        a one-element tensor. They are never handed to the legacy
        ``torch.LongTensor`` constructor, which does not treat a bare number
        as data.
        """
        if isinstance(ids, str):
            ids = int(ids)
        tensor = torch.as_tensor(ids, dtype=torch.long)
        return tensor.reshape(1) if tensor.dim() == 0 else tensor

    def predict(self, user_id, item_id) -> list:
        """Return the model scores for the given user and item ids as a flat list.

        Args:
            user_id: One user id or a collection of user ids.
            item_id: One item id or a collection of item ids.
        """
        user_tensor = self._as_index_tensor(user_id)
        item_tensor = self._as_index_tensor(item_id)

        with torch.no_grad():
            if self._cuda:
                user_tensor = user_tensor.cuda()
                item_tensor = item_tensor.cuda()
            pred = self.forward(user_tensor, item_tensor).cpu().tolist()

        # forward() yields one single-element row per pair; flatten them.
        return list(itertools.chain.from_iterable(pred))

    def user_embedding(self):
        """Return the user embedding weights as a NumPy array."""
        return self.state_dict()["embedding_user.weight"].cpu().numpy()

    def item_embedding(self):
        """Return the item embedding weights as a NumPy array."""
        return self.state_dict()["embedding_item.weight"].cpu().numpy()
+35
View File
@@ -0,0 +1,35 @@
from abc import ABC, abstractmethod
from typing import Union
from pygrex.data_reader.data_reader import DataReader
class RecommenderModel(ABC):
    """
    Abstract base class that defines the interface for recommendation models.
    All model implementations should inherit from this class.
    """

    @abstractmethod
    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int, list]
    ) -> Union[float, list]:
        """
        Predict ratings/scores for a specific user on one or more items.

        Args:
            user_id: The ID of the user
            item_id: A single item ID, or a list of item IDs, to score

        Returns:
            A single predicted rating/score, or a list of predicted
            ratings/scores, depending on the implementation and input
        """
        pass

    @abstractmethod
    def fit(self, data: DataReader):
        """
        Train the model on data.

        Args:
            data: The reader that provides the training dataset
        """
        pass
+169
View File
@@ -0,0 +1,169 @@
from math import sqrt
import numpy as np
from pygrex.data_reader.data_reader import DataReader
from pygrex.models.recommender_model import RecommenderModel
class SVD(RecommenderModel):
def __init__(
    self,
    n_factors=50,
    n_epochs=25,
    lr=0.007,
    reg=0.1,
    init_mean=0.0,
    init_std=0.1,
    random_state=42,
    early_stopping=True,
):
    """Record the hyper-parameters and reset all learned state.

    Args:
        n_factors: Number of latent factors per user and item.
        n_epochs: Maximum number of SGD passes over the ratings.
        lr: SGD learning rate.
        reg: L2 regularisation strength.
        init_mean: Mean of the initial factor values.
        init_std: Stored initial standard deviation.
        random_state: Seed for the random number generator.
        early_stopping: Whether validation-based early stopping is enabled.
    """
    # Training history
    self.training_rmse = []

    # Model parameters stay unset until fit() has run
    self.user_factors = self.item_factors = None
    self.user_biases = self.item_biases = None
    self.global_mean = None

    # Hyper-parameters
    self.n_factors, self.n_epochs = n_factors, n_epochs
    self.lr, self.reg = lr, reg
    self.init_mean, self.init_std = init_mean, init_std
    self.random_state = random_state
    self.early_stopping = early_stopping
def fit(self, data: DataReader, validation_data=None):
"""
Train the biased matrix-factorisation model with SGD.
Args:
data: Reader whose ``dataset`` holds ``userId``, ``itemId`` and ``rating``
columns and whose ``_num_user`` / ``_num_item`` give the parameter sizes
validation_data: Optional iterable of (user, item, rating) triples, passed
to ``calculate_rmse`` for the early-stopping check
Raises:
ValueError: If the reader does not know the number of users or items
"""
df = data.dataset
if data._num_user is None or data._num_item is None:
raise ValueError("The number of users and items cannot be None.")
num_users, num_items = data._num_user, data._num_item
# Initialize random number generator
rng = np.random.RandomState(self.random_state)
# Initialize parameters with better scaling
# NOTE: the factors are drawn with standard deviation ``scale``;
# ``self.init_std`` is not read in this method.
scale = 1.0 / sqrt(self.n_factors)
self.user_factors = rng.normal(
self.init_mean, scale, (num_users, self.n_factors)
) # type: ignore
self.item_factors = rng.normal(
self.init_mean, scale, (num_items, self.n_factors)
) # type: ignore
self.user_biases = np.zeros(num_users)
self.item_biases = np.zeros(num_items)
self.global_mean = df["rating"].mean()
# Convert to list of tuples for faster iteration
ratings_tuple = list(
df[["userId", "itemId", "rating"]].itertuples(index=False, name=None)
)
# Training loop with early stopping
best_rmse = float("inf")
# Number of non-improving validation checks tolerated before stopping
patience = 3
patience_counter = 0
for epoch in range(self.n_epochs):
print(f"Epoch {epoch + 1}/{self.n_epochs}...")
# Shuffle training data
rng.shuffle(ratings_tuple)
# SGD updates
for user, item, rating in ratings_tuple:
# Predict rating
dot_product = np.dot(self.user_factors[user], self.item_factors[item])
prediction = (
self.global_mean
+ self.user_biases[user]
+ self.item_biases[item]
+ dot_product
)
# Compute error
error = rating - prediction
# Update biases
self.user_biases[user] += self.lr * (
error - self.reg * self.user_biases[user]
)
self.item_biases[item] += self.lr * (
error - self.reg * self.item_biases[item]
)
# Update factors
# Keep the pre-update user vector so the item update below does not
# see the user factors that were just modified.
uf_temp = self.user_factors[user].copy()
self.user_factors[user] += self.lr * (
error * self.item_factors[item] - self.reg * self.user_factors[user]
)
self.item_factors[item] += self.lr * (
error * uf_temp - self.reg * self.item_factors[item]
)
# Calculate training RMSE
# Reported on every fifth epoch (starting with the first) and on the last one
if epoch % 5 == 0 or epoch == self.n_epochs - 1:
train_rmse = self.calculate_rmse(ratings_tuple)
self.training_rmse.append(train_rmse)
print(f" Training RMSE: {train_rmse:.4f}")
# Early stopping
# NOTE(review): confirm whether this check is meant to run every epoch or
# only on the epochs where the training RMSE is reported.
if self.early_stopping and validation_data is not None:
val_rmse = self.calculate_rmse(validation_data)
print(f" Validation RMSE: {val_rmse:.4f}")
if val_rmse < best_rmse:
best_rmse = val_rmse
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
print(f"Early stopping at epoch {epoch + 1}")
break
print("Fit complete.")
def calculate_rmse(self, ratings_data):
    """Return the root-mean-square error of ``predict`` over *ratings_data*.

    Args:
        ratings_data: Iterable of (user, item, rating) triples.

    Returns:
        The RMSE, or 0 when *ratings_data* yields no triples.
    """
    squared_error_sum = 0
    n_ratings = 0
    # enumerate(start=1) leaves the number of triples in n_ratings.
    for n_ratings, (user, item, rating) in enumerate(ratings_data, start=1):
        residual = rating - self.predict(user, item)
        squared_error_sum += residual ** 2
    if n_ratings <= 0:
        return 0
    return sqrt(squared_error_sum / n_ratings)
def predict(self, user_id: int | str, item_id: int | str) -> float:
# Check that all model components are initialized
if (
self.user_factors is None
or self.item_factors is None
or self.user_biases is None
or self.item_biases is None
or self.global_mean is None
):
raise RuntimeError("The model has not been trained yet.")
try:
user_id = int(user_id)
item_id = int(item_id)
except (ValueError, TypeError):
# If conversion fails, return the global mean rating
return self.global_mean
# Make prediction
dot_product = np.dot(self.user_factors[user_id], self.item_factors[item_id])
prediction = (
self.global_mean
+ self.user_biases[user_id]
+ self.item_biases[item_id]
+ dot_product
)
# Clip to valid rating range
return np.clip(prediction, 1, 5)