public code v1

This commit is contained in:
2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
View File
View File
+7
View File
@@ -0,0 +1,7 @@
import yaml
from box import Box
# Load the project configuration once, at import time, and expose its "base"
# section as the module-level `cfg` object.
# NOTE: the path is relative, so it is resolved against the current working
# directory of the importing process.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default.
with open("configs/config.yml", "r", encoding="utf-8") as yml_file:
    full_cfg = yaml.safe_load(yml_file)
# Shallow-copy the "base" mapping into a Box for attribute-style access.
# NOTE(review): default_box=True with default_box_attr=None is expected to make
# missing keys evaluate to None instead of raising - confirm against python-box.
cfg = Box({**full_cfg["base"]}, default_box=True, default_box_attr=None)
+11
View File
@@ -0,0 +1,11 @@
from .data_reader import DataReader
from .user_item_rating_dataset import UserItemRatingDataset
from .group_interaction_handler import GroupInteractionHandler
from .user_item_dict import UserItemDict
# Public API of the data-reader package: the names re-exported by
# `from <package> import *`, matching the relative imports above.
__all__ = [
    "DataReader",
    "UserItemRatingDataset",
    "GroupInteractionHandler",
    "UserItemDict",
]
+416
View File
@@ -0,0 +1,416 @@
from typing import List, Optional, Union, cast
import numpy as np
import pandas as pd
import warnings
class DataReader:
    """
    Load, validate and re-index a user/item/rating interaction table.

    The table is held as a pandas DataFrame that must contain the columns
    'userId', 'itemId' and 'rating'. After `make_consecutive_ids_in_dataset`
    has been called, lookup tables between the original IDs and the new
    consecutive IDs are available through the `get_*_id` methods.
    """

    # Columns that every dataset must provide; validated in the `dataset` setter.
    _REQUIRED_COLUMNS = ("userId", "itemId", "rating")

    def __init__(
        self,
        filepath_or_buffer: Optional[str] = None,
        sep: Optional[str] = None,
        names: Optional[List[str]] = None,
        skiprows: int = 0,
        dataframe: Optional[pd.DataFrame] = None,
    ) -> None:
        """
        Initialize the DataReader with either a DataFrame or file parameters.

        Args:
            filepath_or_buffer (Optional[str]): Path to the CSV file or buffer.
            sep (Optional[str]): Separator used in the CSV file.
            names (Optional[List[str]]): List of column names for the CSV file.
            skiprows (int, optional): Number of rows to skip in the CSV file. Defaults to 0.
            dataframe (Optional[pd.DataFrame], optional): A DataFrame to use directly. Defaults to None.

        Raises:
            ValueError: If neither `dataframe` nor valid file parameters are provided,
                or if the resulting data fails validation (see the `dataset` setter).
            FileNotFoundError: If the file cannot be found when loading from file.
            pd.errors.ParserError: If the CSV file cannot be parsed when loading from file.

        Note:
            If `dataframe` is provided, it takes precedence, and file-related parameters
            are ignored but stored for reference. A warning is issued in this case.
            The data must contain the columns 'userId', 'itemId' and 'rating'.
        """
        if dataframe is None and (not filepath_or_buffer or not sep or not names):
            raise ValueError(
                "Must provide either a DataFrame or valid file parameters."
            )

        self.filepath_or_buffer = filepath_or_buffer
        self.sep = sep
        self.names = names
        self.skiprows = skiprows
        self._dataset = None
        self._raw_dataset = None
        self._num_user: Optional[int] = None
        self._num_item: Optional[int] = None
        self.original_user_id: Optional[pd.DataFrame] = None
        self.original_item_id: Optional[pd.DataFrame] = None
        self.new_user_id: Optional[pd.DataFrame] = None
        self.new_item_id: Optional[pd.DataFrame] = None

        if dataframe is not None:
            if any(param is not None for param in [filepath_or_buffer, sep, names]):
                warnings.warn(
                    "DataFrame provided; file parameters (filepath_or_buffer, sep, names) are ignored.",
                    UserWarning,
                )
            self.dataset = dataframe
            return

        # The guard at the top guarantees that all file parameters are set here,
        # so the data is loaded eagerly.
        try:
            loaded_df = pd.read_csv(
                filepath_or_buffer=self.filepath_or_buffer,
                sep=self.sep,
                names=self.names,
                skiprows=self.skiprows,
                engine="python",
            )
        except FileNotFoundError as exc:
            raise FileNotFoundError(
                f"File not found: {self.filepath_or_buffer}"
            ) from exc
        except pd.errors.ParserError as exc:
            raise pd.errors.ParserError(f"Failed to parse CSV: {str(exc)}") from exc

        # The setter validates the data and fills _raw_dataset/_num_user/_num_item.
        self.dataset = loaded_df

    @property
    def dataset(self) -> pd.DataFrame:
        """
        Get the dataset DataFrame.

        Raises:
            ValueError: If no dataset has been set.
        """
        if self._dataset is None:
            raise ValueError("Dataset is not loaded or is not valid.")
        return self._dataset

    @dataset.setter
    def dataset(self, new_data: pd.DataFrame) -> None:
        """
        Set the dataset and compute the number of unique users and items.

        Non-numeric required columns are converted with `pd.to_numeric`; the
        conversion is written back into the DataFrame that was passed in.

        Args:
            new_data (pd.DataFrame): The new dataset to set.

        Raises:
            ValueError: If the DataFrame is None, empty, lacks required columns,
                or contains invalid data types/missing values.
        """
        if new_data is None:
            raise ValueError("DataFrame cannot be None")
        if new_data.empty:
            raise ValueError("DataFrame cannot be empty")

        # Report missing columns as the documented ValueError rather than
        # letting the column access below fail with a KeyError.
        missing = [col for col in self._REQUIRED_COLUMNS if col not in new_data.columns]
        if missing:
            raise ValueError(f"DataFrame is missing required column(s): {missing}")

        # Validate data types
        for col in self._REQUIRED_COLUMNS:
            if not pd.api.types.is_numeric_dtype(new_data[col]):
                warnings.warn(
                    f"Column '{col}' is not numeric. Attempting conversion.",
                    UserWarning,
                )
                try:
                    new_data[col] = pd.to_numeric(new_data[col])
                except (ValueError, TypeError) as exc:
                    raise ValueError(
                        f"Column '{col}' cannot be converted to a numeric type."
                    ) from exc

        # Check for missing values in essential columns
        if new_data[list(self._REQUIRED_COLUMNS)].isnull().any().any():
            raise ValueError(
                "DataFrame contains missing values in essential columns (userId, itemId, rating)."
            )

        self._dataset = new_data
        self._raw_dataset = new_data.copy()
        self._num_user = int(self._dataset["userId"].nunique())
        self._num_item = int(self._dataset["itemId"].nunique())

        # Any previously built ID mappings refer to the old dataset; drop them.
        self.original_user_id = None
        self.original_item_id = None
        self.new_user_id = None
        self.new_item_id = None

    def get_raw_dataset(self) -> pd.DataFrame:
        """
        Get the raw dataset, i.e. the copy taken when the dataset was last set.

        Returns:
            pd.DataFrame: The raw dataset.

        Raises:
            ValueError: If the raw dataset is not set.
        """
        if self._raw_dataset is None:
            raise ValueError(
                "Raw dataset is not set. Load data from file or set a DataFrame first."
            )
        return self._raw_dataset

    @staticmethod
    def _create_id_mapping(column: pd.Series, new_column_name: str) -> pd.DataFrame:
        """
        Create a mapping for consecutive IDs in order of first appearance.

        Args:
            column (pd.Series): The column to map.
            new_column_name (str): The name of the new column for consecutive IDs.

        Returns:
            pd.DataFrame: A DataFrame with the original and mapped IDs.

        Raises:
            ValueError: If the column is empty.
        """
        if column.empty:
            raise ValueError("Cannot create ID mapping for an empty column")
        unique_values = column.drop_duplicates().reset_index(drop=True)
        mapping = pd.DataFrame(
            {column.name: unique_values, new_column_name: np.arange(len(unique_values))}
        )
        return mapping

    @staticmethod
    def _build_sorted_mapping(
        column: pd.Series, original_name: str, new_name: str
    ) -> pd.DataFrame:
        """Map the sorted unique values of `column` to 0..n-1 (deterministic)."""
        sorted_unique = sorted(column.unique())
        return pd.DataFrame(
            {original_name: sorted_unique, new_name: range(len(sorted_unique))}
        )

    def make_consecutive_ids_in_dataset(self) -> None:
        """
        Map user and item IDs to consecutive integers starting from 0 in a deterministic way.

        The internal dataset is replaced by a re-indexed copy, and the lookup
        tables between original and new IDs are stored on the instance.

        Raises:
            ValueError: If no dataset has been set.
        """
        if self._dataset is None:
            raise ValueError("Dataset must be loaded or set before mapping IDs")

        dataset = self._dataset.copy()

        # Sorting the unique IDs makes the mapping identical on every run.
        user_id_mapping = self._build_sorted_mapping(
            dataset["userId"], "userId", "new_userId"
        )
        item_id_mapping = self._build_sorted_mapping(
            dataset["itemId"], "itemId", "new_itemId"
        )

        dataset["userId"] = (
            dataset["userId"]
            .map(user_id_mapping.set_index("userId")["new_userId"])
            .astype(int)
        )
        dataset["itemId"] = (
            dataset["itemId"]
            .map(item_id_mapping.set_index("itemId")["new_itemId"])
            .astype(int)
        )

        # Store mappings for lookups in both directions.
        self.original_user_id = user_id_mapping.set_index("new_userId")
        self.original_item_id = item_id_mapping.set_index("new_itemId")
        self.new_user_id = user_id_mapping.set_index("userId")
        self.new_item_id = item_id_mapping.set_index("itemId")

        self._dataset = dataset
        # New IDs are exactly 0..n-1, so max()+1 equals the mapping length;
        # len() yields a plain Python int rather than a numpy scalar.
        self._num_user = len(user_id_mapping)
        self._num_item = len(item_id_mapping)

    def binarize(
        self, binary_threshold: float = 1, inplace: bool = True
    ) -> Optional[pd.DataFrame]:
        """
        Binarize ratings into 0 or 1 based on a threshold (implicit feedback).

        A rating becomes 1 only when it is strictly greater than the threshold.

        Args:
            binary_threshold (float, optional): Threshold for binarization. Defaults to 1.
            inplace (bool, optional): If True, modify the dataset in-place. If False, return a new DataFrame.
                Defaults to True.

        Returns:
            Optional[pd.DataFrame]: The binarized dataset if inplace=False, else None.

        Raises:
            ValueError: If the dataset is not set or binary_threshold is invalid.

        Example:
            Ratings [0.5, 2.0, 3.0] with threshold=1.0 -> [0, 1, 1]
        """
        if self._dataset is None:
            raise ValueError("Dataset must be loaded or set before binarization")
        if not isinstance(binary_threshold, (int, float)):
            raise ValueError("binary_threshold must be a number")

        dataset = self._dataset if inplace else self._dataset.copy()
        dataset["rating"] = (dataset["rating"] > binary_threshold).astype(int)
        if not inplace:
            return dataset
        self._dataset = dataset
        return None

    @property
    def num_user(self) -> int:
        """
        Get the number of unique users.

        Returns:
            int: Number of unique users.

        Raises:
            ValueError: If the dataset is not set.
        """
        if self._num_user is None:
            raise ValueError("Dataset must be loaded or set to compute num_user")
        return self._num_user

    @property
    def num_item(self) -> int:
        """
        Get the number of unique items.

        Returns:
            int: Number of unique items.

        Raises:
            ValueError: If the dataset is not set.
        """
        if self._num_item is None:
            raise ValueError("Dataset must be loaded or set to compute num_item")
        return self._num_item

    @staticmethod
    def _lookup_ids(
        mapping: Optional[pd.DataFrame],
        keys,
        column: str,
        label: str,
        convert_strings: bool = False,
    ):
        """
        Shared implementation of the four ID lookup methods.

        Args:
            mapping: Lookup table indexed by the source ID, or None if not built.
            keys: A single ID or a list of IDs to look up.
            column: Name of the column holding the target ID.
            label: "User" or "Item", used in the error message.
            convert_strings: If True, string IDs (alone or inside a list) are
                converted with `int()` before the lookup.

        Raises:
            ValueError: If the mapping is not set, a string cannot be converted,
                or any ID is not found.
        """
        if mapping is None:
            raise ValueError(
                "ID mapping not set. Call make_consecutive_ids_in_dataset first"
            )
        try:
            if convert_strings:
                if isinstance(keys, str):
                    keys = int(keys)
                elif isinstance(keys, list):
                    # Convert element-wise so mixed str/int lists work too.
                    keys = [int(k) if isinstance(k, str) else k for k in keys]
            if isinstance(keys, (int, np.integer)):
                return int(mapping.loc[keys, column])  # type: ignore
            series = cast(pd.Series, mapping.loc[keys, column])
            return series.tolist()
        except KeyError as e:
            raise ValueError(f"{label} ID(s) not found: {e}") from e

    def get_original_user_id(self, u: Union[int, List[int]]) -> Union[int, List[int]]:
        """
        Get the original user ID(s) from the new (consecutive) ID(s).

        Args:
            u (Union[int, List[int]]): New user ID(s).

        Returns:
            Union[int, List[int]]: Original user ID(s).

        Raises:
            ValueError: If ID mapping is not set or if any ID is not found.
        """
        return self._lookup_ids(self.original_user_id, u, "userId", "User")

    def get_original_item_id(self, i: Union[int, List[int]]) -> Union[int, List[int]]:
        """
        Get the original item ID(s) from the new (consecutive) ID(s).

        Args:
            i (Union[int, List[int]]): New item ID(s).

        Returns:
            Union[int, List[int]]: Original item ID(s).

        Raises:
            ValueError: If ID mapping is not set or if any ID is not found.
        """
        return self._lookup_ids(self.original_item_id, i, "itemId", "Item")

    def get_new_user_id(
        self, u: Union[Union[str, int], List[Union[str, int]]]
    ) -> Union[int, List[int]]:
        """
        Get the new (consecutive) user ID(s) from the original ID(s).

        Args:
            u: Original user ID(s); strings are converted with `int()`.

        Returns:
            New user ID(s).

        Raises:
            ValueError: If ID mapping is not set or if any ID is not found.
        """
        return self._lookup_ids(
            self.new_user_id, u, "new_userId", "User", convert_strings=True
        )

    def get_new_item_id(
        self, i: Union[Union[str, int], List[Union[str, int]]]
    ) -> Union[int, List[int]]:
        """
        Get the new (consecutive) item ID(s) from the original ID(s).

        Args:
            i: Original item ID(s); strings are converted with `int()`.

        Returns:
            New item ID(s).

        Raises:
            ValueError: If ID mapping is not set or if any ID is not found.
        """
        return self._lookup_ids(
            self.new_item_id, i, "new_itemId", "Item", convert_strings=True
        )
@@ -0,0 +1,289 @@
from typing import List, Optional, Union
import numpy as np
import pandas as pd
from pathlib import Path
from pygrex.data_reader.data_reader import DataReader
class GroupInteractionHandler:
    """
    Helper for working with user groups on top of a DataReader.

    It locates and reads group files, parses group IDs of the form
    "id1_id2_id3" into member lists, and answers group-level queries
    (rated items, common items, candidate items, preferences).
    """

    def __init__(self, filepath_or_buffer: Union[str, Path, List[Union[str, Path]]]):
        """
        Initialize the GroupInteractionHandler.

        Args:
            filepath_or_buffer: Path to directory containing group files, a
                single file path, or list of file paths
        """
        if isinstance(filepath_or_buffer, (str, Path)):
            path = Path(filepath_or_buffer)
            if path.is_dir():
                # iterdir() yields entries in arbitrary order; sorting keeps the
                # first-match lookup in _get_group_filepath reproducible.
                self.filepath_or_buffer = sorted(
                    str(file) for file in path.iterdir() if file.is_file()
                )
            else:
                self.filepath_or_buffer = [str(path)]
        else:
            # Normalise every entry through Path and store it as a string.
            self.filepath_or_buffer = [str(Path(p)) for p in filepath_or_buffer]

    def _get_group_filepath(self, filename: str) -> str:
        """
        Get a specific group file path by matching the filename.

        The first stored path that contains `filename` as a substring is used.

        Args:
            filename (str): The name of the file to search for.

        Returns:
            str: The matched file path, resolved to an absolute path.

        Raises:
            ValueError: If the matched file does not exist.
            ValueError: If no stored path contains `filename`.
        """
        for path_str in self.filepath_or_buffer:
            if filename in path_str:  # substring match on the whole path
                path = Path(path_str).resolve()
                if path.exists():
                    return str(path)
                raise ValueError(f"Error: File does not exist: {path}")
        raise ValueError(f"Error: No file found containing '{filename}' in its name.")

    def read_groups(self, filename: str) -> List[str]:
        """
        Read group IDs from a specified file, one group per line.

        Blank lines are skipped so they are not returned as empty group IDs.

        Args:
            filename (str): Name of the file containing group IDs.

        Returns:
            List[str]: List of group IDs with surrounding whitespace removed.

        Raises:
            ValueError: If `filename` is empty or no matching file is found.
        """
        if not filename:
            raise ValueError("Groups path not specified in configuration")

        path = Path(self._get_group_filepath(filename))
        stripped = (line.strip() for line in path.read_text(encoding="utf-8").splitlines())
        return [line for line in stripped if line]

    def parse_group_members(self, group: str) -> List[int]:
        """
        Parse group ID to get member IDs.

        Delegates to `get_group_members` so both entry points behave the same.

        Args:
            group: Group ID string

        Returns:
            List of member IDs

        Raises:
            ValueError: If any member ID cannot be converted to an integer
        """
        return self.get_group_members(group)

    def get_group_members(self, group: Union[List[Union[int, str]], str]) -> List[int]:
        """
        Get group members from a group ID string or list.

        Args:
            group: Group ID string in format "id1_id2_id3" or list of IDs

        Returns:
            List of member IDs as integers (empty for a blank string)

        Raises:
            ValueError: If any member ID cannot be converted to an integer
            TypeError: If group is neither a string nor a list
        """
        if isinstance(group, list):
            return [int(member) for member in group]

        if not isinstance(group, str):
            raise TypeError(f"Expected string or list, got {type(group).__name__}")

        group = group.strip()
        if not group:
            return []

        try:
            return [int(member) for member in group.split("_")]
        except ValueError as e:
            raise ValueError(f"Invalid member ID in group: {str(e)}") from e

    @staticmethod
    def _to_new_ids(lookup, ids) -> list:
        """Translate original IDs one by one via `lookup`, unwrapping numpy ints first."""
        return [
            lookup(int(x) if isinstance(x, (int, np.integer)) else x) for x in ids
        ]

    def create_modified_dataset(
        self,
        original_data: Union[pd.DataFrame, DataReader],
        group_ids: List[Union[int, str]],
        item_ids: List[Union[int, str]],
        data: Optional[DataReader] = None,
    ) -> pd.DataFrame:
        """
        Creates a modified dataset by removing interactions between specified groups and items.

        Args:
            original_data: Either a pandas DataFrame or a DataReader object containing the dataset
            group_ids: List of original user IDs whose interactions are considered for removal
            item_ids: List of original item IDs to consider for removal
            data: DataReader providing the ID mappings when original_data is a DataFrame

        Returns:
            pd.DataFrame: A new DataFrame with the specified interactions removed

        Raises:
            ValueError: If input data types are incorrect
        """
        if isinstance(original_data, DataReader):
            data_reader = original_data
            dataset = original_data.dataset
        elif isinstance(original_data, pd.DataFrame) and isinstance(data, DataReader):
            data_reader = data
            dataset = original_data
        else:
            raise ValueError(
                "Either original_data must be a DataReader or data must be provided as a DataReader"
            )

        # Convert IDs to the internal (consecutive) representation.
        new_group_ids = self._to_new_ids(data_reader.get_new_user_id, group_ids)
        new_item_ids = self._to_new_ids(data_reader.get_new_item_id, item_ids)

        # Keep every row that is NOT (selected item AND selected user).
        mask = ~(dataset.itemId.isin(new_item_ids) & dataset.userId.isin(new_group_ids))
        # Return a copy so later edits do not act on a slice of the source frame.
        return dataset[mask].copy()

    def get_rated_items_by_all_group_members(
        self, group: List[Union[int, str]], original_data: DataReader
    ) -> np.ndarray:
        """
        Get all items rated by any member of the group.

        Args:
            group: List of original user IDs
            original_data: DataReader with the dataset and mapping methods

        Returns:
            np.ndarray: Array of original item IDs rated by any group member
        """
        new_group = self._to_new_ids(original_data.get_new_user_id, group)

        dataset = original_data.dataset
        group_items = dataset[dataset.userId.isin(new_group)]["itemId"].unique()

        # Convert back to original item IDs
        original_ids = original_data.get_original_item_id(group_items.tolist())
        return np.array(original_ids)

    def get_common_rated_items(
        self, group: List[Union[int, str]], original_data: DataReader
    ) -> np.ndarray:
        """
        Get items rated by all members of the group (intersection of rated items).

        Args:
            group: List of original user IDs
            original_data: DataReader with the dataset and mapping methods

        Returns:
            np.ndarray: Array of original item IDs rated by all group members,
                ordered by internal item ID (empty for an empty group)
        """
        new_group = self._to_new_ids(original_data.get_new_user_id, group)
        if not new_group:
            return np.array([])

        dataset = original_data.dataset
        rated_items_per_member = [
            set(dataset[dataset.userId == user_id]["itemId"].unique())
            for user_id in new_group
        ]

        # Sort the intersection so the result does not depend on set iteration order.
        common_items = sorted(set.intersection(*rated_items_per_member))
        original_ids = original_data.get_original_item_id(
            np.array(common_items).tolist()
        )
        return np.array(original_ids)

    def get_items_for_group_recommendation(
        self, data: pd.DataFrame, item_ids: np.ndarray, group: List[int]
    ) -> np.ndarray:
        """
        Get items for group recommendation (those not interacted with by any group member).

        Args:
            data: DataFrame with interaction data
            item_ids: Array of all item IDs
            group: List of group member IDs, in the same ID space as `data.userId`

        Returns:
            Array of item IDs not interacted with by any group member
        """
        item_ids_group = data.loc[data.userId.isin(group), "itemId"]
        return np.setdiff1d(item_ids, item_ids_group)

    def get_group_preferences(
        self, group: List[Union[int, str]], data_reader: DataReader
    ) -> pd.DataFrame:
        """
        Get all preferences (ratings) by all members of the group.

        Args:
            group: List of original user IDs
            data_reader: DataReader object with the dataset

        Returns:
            pd.DataFrame: Copy of all dataset rows belonging to group members
        """
        new_group = self._to_new_ids(data_reader.get_new_user_id, group)

        dataset = data_reader.dataset
        return dataset[dataset.userId.isin(new_group)].copy()
+36
View File
@@ -0,0 +1,36 @@
from torch.utils.data import Dataset
import torch
import numpy as np
class UserItemDict(Dataset):
    """Pytorch Dataset that serves one dense rating vector per user."""

    def __init__(self, data, expl_matrix, expl):
        """
        args:
            data: DataFrame with the columns userId, itemId and rating
            expl_matrix: matrix indexed as [user, :] that is added to the
                rating vector when `expl` is truthy
            expl: flag that switches the addition of `expl_matrix` on or off
        """
        # userId -> {'items': [...], 'rating': [...]}, one entry per user group
        self.users_dict = {
            user: {'items': list(rows.itemId), 'rating': list(rows.rating)}
            for user, rows in data.groupby('userId')
        }

        self.n_items = data.itemId.nunique()
        self.n_users = data.userId.nunique()
        self.expl_matrix = expl_matrix
        self.expl = expl

    def __getitem__(self, index):
        entry = self.users_dict[index]
        # Item ids are used directly as positions in the dense vector.
        dense = np.zeros(self.n_items)
        dense[entry['items']] = entry['rating']
        vector = torch.tensor(dense)
        if not self.expl:
            return vector
        return vector + self.expl_matrix[index, :]

    def __len__(self):
        return self.n_users
@@ -0,0 +1,21 @@
from torch.utils.data import Dataset
class UserItemRatingDataset(Dataset):
    """Pytorch Dataset over three parallel <user, item, rating> tensors."""

    def __init__(self, user_tensor, item_tensor, target_tensor):
        """
        args:
            user_tensor: torch.Tensor of user ids
            item_tensor: torch.Tensor of item ids, aligned with user_tensor
            target_tensor: torch.Tensor, the corresponding rating for <user, item> pair
        """
        self.user_tensor, self.item_tensor, self.target_tensor = (
            user_tensor,
            item_tensor,
            target_tensor,
        )

    def __getitem__(self, index):
        # One (user, item, rating) triple taken at the same position.
        user = self.user_tensor[index]
        item = self.item_tensor[index]
        target = self.target_tensor[index]
        return user, item, target

    def __len__(self):
        # The length is taken from the first dimension of the user tensor.
        return self.user_tensor.size(0)
+15
View File
@@ -0,0 +1,15 @@
from .splitter import Splitter
from .model_evaluator import ModelEvaluator
from .explainer_evaluator import ExplanationEvaluator
from .evaluation_pipelines import (
run_evaluation_with_proper_split,
run_leave_one_out_evaluation,
)
# Public API of the evaluator package: the names re-exported by
# `from <package> import *`, matching the relative imports above.
__all__ = [
    "Splitter",
    "ModelEvaluator",
    "ExplanationEvaluator",
    "run_evaluation_with_proper_split",
    "run_leave_one_out_evaluation",
]
+251
View File
@@ -0,0 +1,251 @@
import time
from typing import Dict
import pandas as pd
import numpy as np
from pygrex.data_reader.data_reader import DataReader
from pygrex.evaluator import Splitter, ModelEvaluator
def run_leave_one_out_evaluation(
    data_reader: DataReader, model, top_n: int = 10
) -> Dict:
    """
    Evaluate `model` with a leave-one-out protocol and report ranking metrics.

    Steps: split with `Splitter.split_leave_n_out(..., n=1)`, drop test rows
    whose user or item is absent from the training data, fit the model on the
    training reader, generate top-N recommendations and score them with
    `ModelEvaluator`. Progress and timings are printed along the way.

    Args:
        data_reader: DataReader holding the full interaction data.
        model: Object exposing `fit(train_reader)` plus the prediction methods
            used by `generate_recommendations_batch`.
        top_n: Length of the recommendation lists to evaluate.

    Returns:
        Dict with the keys "Hit Ratio", "NDCG" and "evaluation_time" (seconds).
    """
    print("Starting leave-one-out evaluation...")
    started_at = time.time()

    # 1. Hold out n=1 interaction per split call (true leave-one-out).
    train_reader, held_out = Splitter.split_leave_n_out(data_reader, n=1)
    print(f"Split completed: {len(held_out)} test interactions")

    # Keep only test rows whose user and item both occur in the training data.
    known_users = set(train_reader.dataset["userId"].unique())
    known_items = set(train_reader.dataset["itemId"].unique())
    size_before_filter = len(held_out)
    keep_row = held_out["userId"].isin(known_users) & held_out["itemId"].isin(
        known_items
    )
    held_out = held_out[keep_row]
    print(
        f"Filtered test set: {len(held_out)} interactions remaining from {size_before_filter}"
    )

    # 2. Fit on the reduced training data and time it.
    print("Training model on reduced dataset...")
    fit_started_at = time.time()
    model.fit(train_reader)
    fit_seconds = time.time() - fit_started_at
    print(f"Model training completed in {fit_seconds:.2f} seconds")

    # 3. Produce the top-N lists and time it.
    print("Generating recommendations...")
    rec_started_at = time.time()
    rec_frame = generate_recommendations_batch(model, train_reader, held_out, top_n)
    rec_seconds = time.time() - rec_started_at
    print(f"Recommendations generated in {rec_seconds:.2f} seconds")

    # 4. Score the lists against the held-out interactions.
    scorer = ModelEvaluator(held_out, top_n=top_n)
    hit_ratio_value = scorer.cal_hit_ratio(rec_frame)
    ndcg_value = scorer.cal_ndcg(rec_frame)

    elapsed = time.time() - started_at
    print(f"Total evaluation time: {elapsed:.2f} seconds")

    return {
        "Hit Ratio": hit_ratio_value,
        "NDCG": ndcg_value,  # standard NDCG is reported here, not eNDCG
        "evaluation_time": elapsed,
    }
def generate_recommendations_batch(
    model,
    train_dr: DataReader,
    test_df: pd.DataFrame,
    top_n: int,
    max_candidates: int = 10000,
) -> pd.DataFrame:
    """
    Generate top-N recommendations for every user that appears in `test_df`.

    For each test user the candidates are the training items the user has not
    interacted with. When there are more than `max_candidates` of them, a
    random sample of that size (drawn with numpy's global random state) is
    scored instead. Users whose recommendation step raises are reported on
    stdout and skipped.

    Args:
        model: Model handed to `generate_recommendations_efficient`.
        train_dr: DataReader with the training interactions.
        test_df: DataFrame whose 'userId' column selects the users to serve.
        top_n: Number of recommendations per user.
        max_candidates: Upper bound on candidates scored per user. Defaults to 10000.

    Returns:
        DataFrame with columns: ['userId', 'itemId', 'rank', 'score']
        (empty, with the same columns, when nothing could be generated).
    """
    train_df = train_dr.dataset
    all_items = set(train_df["itemId"].unique())

    # Group the training data once instead of re-filtering the whole frame for
    # every user: userId -> set of items the user already interacted with.
    seen_items_by_user = {
        user: set(items) for user, items in train_df.groupby("userId")["itemId"]
    }

    recommendations = []
    test_users = test_df["userId"].unique()
    print(f"Generating recommendations for {len(test_users)} users...")

    for i, user_id in enumerate(test_users):
        if i % 100 == 0:  # Progress indicator
            print(f"Processing user {i}/{len(test_users)}")

        # Candidate items (unseen items)
        user_items = seen_items_by_user.get(user_id, set())
        candidate_items = list(all_items - user_items)

        # Cap the work per user by sampling without replacement.
        if len(candidate_items) > max_candidates:
            candidate_items = np.random.choice(
                candidate_items, max_candidates, replace=False
            ).tolist()

        try:
            # generate_recommendations_efficient picks recommend/predict_batch
            # when available and otherwise falls back to per-item prediction.
            user_recs = generate_recommendations_efficient(
                model, user_id, candidate_items, top_n
            )
            recommendations.extend(user_recs)
        except Exception as e:
            # Deliberate best effort: one failing user must not abort the run.
            print(f"Error generating recommendations for user {user_id}: {e}")
            continue

    # An empty list still yields a DataFrame with the expected columns.
    return pd.DataFrame(recommendations, columns=["userId", "itemId", "rank", "score"])
def generate_recommendations_efficient(
    model, user_id: int, candidate_items: list, top_n: int
) -> list:
    """
    Build a user's top-N list with the fastest interface the model offers.

    Preference order: `model.recommend(user_id, candidate_items, top_n)`, then
    `model.predict_batch([(user_id, item_id), ...])`. If the model has neither
    method, or the chosen one raises, the work is handed to
    `generate_recommendations_individual`.

    Returns:
        List of (user_id, item_id, rank, score) tuples, ranks starting at 1.
    """
    can_recommend = hasattr(model, "recommend")
    if not can_recommend and not hasattr(model, "predict_batch"):
        return generate_recommendations_individual(
            model, user_id, candidate_items, top_n
        )

    try:
        if can_recommend:
            # The model is expected to return (item_id, score) pairs already ranked.
            ranked_pairs = model.recommend(user_id, candidate_items, top_n)
        else:
            query = [(user_id, item_id) for item_id in candidate_items]
            batch_scores = model.predict_batch(query)
            # Highest score first, then keep the first top_n entries.
            ranked_pairs = sorted(
                zip(candidate_items, batch_scores),
                key=lambda pair: pair[1],
                reverse=True,
            )[:top_n]
        return [
            (user_id, item_id, rank, score)
            for rank, (item_id, score) in enumerate(ranked_pairs, 1)
        ]
    except Exception:
        # Any failure of the fast path falls back to per-item prediction.
        return generate_recommendations_individual(
            model, user_id, candidate_items, top_n
        )
def generate_recommendations_individual(
    model, user_id: int, candidate_items: list, top_n: int
) -> list:
    """
    Score every candidate with `model.predict(user_id, item_id)` and rank them.

    This is the slow path that works with any model exposing `predict`.
    Items whose prediction raises are reported on stdout and left out.

    Returns:
        List of (user_id, item_id, rank, score) tuples for the top_n highest
        scores, ranks starting at 1.
    """
    scored = []
    for item_id in candidate_items:
        try:
            scored.append((item_id, model.predict(user_id, item_id)))
        except Exception as e:
            # Skip items that cause prediction errors
            print(f"Prediction error for user {user_id}, item {item_id}: {e}")

    # Highest score first; ties keep their candidate order (stable sort).
    best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]
    return [
        (user_id, item_id, rank, score)
        for rank, (item_id, score) in enumerate(best, 1)
    ]
def run_evaluation_with_proper_split(
    data_reader: DataReader, model, test_size: float = 0.2, top_n: int = 10
) -> Dict:
    """
    Evaluate `model` on a fractional train/test split instead of leave-one-out.

    Steps: split with `Splitter.split_leave_n_out(..., frac=test_size)`, drop
    test rows whose user or item is absent from the training data, fit the
    model, generate top-N recommendations and score them with `ModelEvaluator`.

    Args:
        data_reader: DataReader holding the full interaction data.
        model: Object exposing `fit(train_reader)` plus the prediction methods
            used by `generate_recommendations_batch`.
        test_size: Fraction passed to the splitter as `frac`.
        top_n: Length of the recommendation lists to evaluate.

    Returns:
        Dict with the keys "Hit Ratio", "NDCG", "evaluation_time" (seconds),
        "test_interactions" and "total_recommendations".
    """
    print(f"Starting evaluation with {test_size * 100}% test split...")
    started_at = time.time()

    # 1. Split data into train/test
    train_reader, held_out = Splitter.split_leave_n_out(data_reader, frac=test_size)
    print(f"Split completed: {len(held_out)} test interactions")

    # 2. Keep only test rows whose user and item both occur in the training data.
    known_users = set(train_reader.dataset["userId"].unique())
    known_items = set(train_reader.dataset["itemId"].unique())
    size_before_filter = len(held_out)
    keep_row = held_out["userId"].isin(known_users) & held_out["itemId"].isin(
        known_items
    )
    held_out = held_out[keep_row]
    print(
        f"Filtered test set: {len(held_out)} interactions remaining from {size_before_filter}"
    )

    # 3. Train model
    print("Training model...")
    model.fit(train_reader)

    # 4. Generate recommendations
    print("Generating recommendations...")
    rec_frame = generate_recommendations_batch(model, train_reader, held_out, top_n)

    # 5. Evaluate
    scorer = ModelEvaluator(held_out, top_n=top_n)
    hit_ratio_value = scorer.cal_hit_ratio(rec_frame)
    ndcg_value = scorer.cal_ndcg(rec_frame)

    elapsed = time.time() - started_at
    print(f"Evaluation completed in {elapsed:.2f} seconds")

    return {
        "Hit Ratio": hit_ratio_value,
        "NDCG": ndcg_value,
        "evaluation_time": elapsed,
        "test_interactions": len(held_out),
        "total_recommendations": len(rec_frame),
    }
+68
View File
@@ -0,0 +1,68 @@
from typing import Dict, Any
from pygrex.utils import calculate_gild_for_explanations
class ExplanationEvaluator:
    """
    A unified evaluator for different explanation methods.

    Given the result dictionary produced by an explainer, it reports a fixed
    set of quality metrics: Fidelity and Diversity (GILD).
    """

    def __init__(self):
        """Initializes the ExplanationEvaluator; the class keeps no state."""

    def evaluate(
        self, explanation_results: Dict[str, Any], explainer_type: str
    ) -> Dict[str, float]:
        """
        Calculates all relevant metrics for a given explanation result.

        Args:
            explanation_results: The dictionary returned by an explainer's
                `find_explanation` method.
            explainer_type: A string identifier for the explainer used
                (e.g., "LORE4Groups", "EXPGRS").

        Returns:
            A dictionary with the keys "fidelity" and "gild"; both are 0.0
            when `explanation_results` is empty.
        """
        if explanation_results:
            return {
                "fidelity": self._calculate_fidelity(explanation_results),
                "gild": self._calculate_gild(explanation_results, explainer_type),
            }
        return {"fidelity": 0.0, "gild": 0.0}

    def _calculate_fidelity(self, explanation_results: Dict[str, Any]) -> float:
        """
        Reads the fidelity score out of the explanation results.

        The value is taken as-is from the "fidelity" key (the explainer is
        expected to have computed it); 0.0 is returned when the key is absent.
        """
        return explanation_results.get("fidelity", 0.0)

    def _calculate_gild(
        self, explanation_results: Dict[str, Any], explainer_type: str
    ) -> float:
        """
        Calculates the Gaussian Inter-List Diversity (GILD) of the explanations.

        Thin wrapper that passes the "details" entry of the results to
        `calculate_gild_for_explanations`; returns 0.0 when there are no details.
        """
        details = explanation_results.get("details", {})
        if not details:
            return 0.0
        return calculate_gild_for_explanations(details, explainer_type)
+179
View File
@@ -0,0 +1,179 @@
import numpy as np
import pandas as pd
class ModelEvaluator:
    """
    Evaluate top-N recommendation lists against a held-out test set.

    Every row of the test set is treated as a positive (relevant) item for
    its user. The supported metrics are the per-user hit ratio and nDCG@N.
    """

    # Names of the supported rank-discount functions.
    disc_functions = ["log", "linear"]

    def __init__(self, test_set, top_n: int = 10, discount_function: str = "log"):
        """
        :param test_set: dataframe with at least the columns ['userId', 'itemId'].
        :param top_n: only recommendations with rank <= top_n are evaluated.
        :param discount_function: "log" or "linear"; selects the rank discount
            used when computing the DCG.
        """
        self.test_set = test_set
        self._top_n = top_n
        assert discount_function in self.disc_functions, "Wrong Discount Function."
        self._discount_function = discount_function
        self.num_users = self.test_set.userId.nunique()

    @property
    def top_n(self):
        """Cut-off rank; recommendations ranked below it are ignored."""
        return self._top_n

    @top_n.setter
    def top_n(self, top_n: int):
        self._top_n = top_n

    @property
    def discount_function(self):
        """Name of the discount function currently in use."""
        return self._discount_function

    @discount_function.setter
    def discount_function(self, discount_function: str):
        assert discount_function in self.disc_functions, "Wrong Discount Function."
        self._discount_function = discount_function

    def cal_hit_ratio(self, recommendations):
        """
        Hit Ratio: for every test user, the share of that user's test items
        that appear in the top-N recommendations, averaged over test users.

        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: hit rate.
        """
        # count hits per user
        hits_per_user = self.count_positives(self.get_hits(recommendations))
        # merge with the full count of positive items per user; the right
        # join keeps test users that have no hit at all.
        hits_per_user = hits_per_user.merge(
            self.count_positives(self.test_set),
            on="userId",
            suffixes=("_true", ""),
            how="right",
        )
        # users with 0 hits come out of the merge as NA.
        hits_per_user = hits_per_user.fillna(0)
        # hit rate per user, then the average over users
        hit_rate = hits_per_user.positive_true / hits_per_user.positive
        return hit_rate.mean()

    def get_hits(self, recommendations):
        """
        Find which items in the test set have a hit on the recommendations.

        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe, the top-N recommendation rows that also occur in
            the test set.
        """
        # keep only the top_n items per user
        top_n_recommendations = self.filter_to_top_n(recommendations)
        # an inner merge keeps exactly the (user, item) pairs present in both
        return pd.merge(top_n_recommendations, self.test_set, on=["userId", "itemId"])

    def filter_to_top_n(self, dataset):
        """
        if rank > top_n, we do not use it for evaluation

        :param dataset: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe, columns = ['userId', 'itemId', 'rank']
        """
        return dataset[dataset["rank"] <= self.top_n]

    def cal_ndcg(self, recommendations):
        r"""
        Normalized Discounted Cumulative Gain at N (nDCG@N): the ratio of the
        DCG of the recommendation list to the ideal DCG (IDCG), averaged over
        the users of the test set.

        Every hit has a constant gain of 1, so with the "log" discount

            DCG = \sum_{i \in hits} \frac{1}{\log_2(rank_i + 1)}

        and with the "linear" discount

            DCG = \sum_{i \in hits} \frac{1}{rank_i}

        IDCG is the same sum for the ideal ranking in which the user's test
        items occupy ranks 1, 2, ... (truncated at top_n), and

            nDCG = \frac{DCG}{IDCG}

        Ref: Y. Wang, L. Wang, Y. Li, D. He, T.-Y. Liu, and W. Chen.
        A theoretical analysis of ndcgtype ranking measures.

        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: nDCG
        """
        dcg = self.cal_dcg(self.get_hits(recommendations))
        idcg = self.cal_idcg()
        # left join from the ideal side so test users without hits are kept
        ndcg = idcg.merge(dcg, on="userId", how="left")
        ndcg = ndcg.fillna(0)
        # normalize
        ndcg["ndcg"] = ndcg["dcg"] / ndcg["idcg"]
        return ndcg["ndcg"].mean()

    def cal_dcg(self, hits):
        """
        Discounted Cumulative Gain per user.

        The input frame is not modified.

        :param hits: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe, columns = ['userId', 'dcg'], one row per user
        """
        # todo: the gain so far is set to a constant.
        ranks = hits["rank"]
        if self.discount_function == "log":
            discounted_gain = np.log(2) / np.log(ranks + 1)
        elif self.discount_function == "linear":
            discounted_gain = 1 / ranks
        else:
            raise ValueError(f"Unknown discount function: {self.discount_function}")
        # Work on a fresh frame so the caller's data is untouched, and let
        # groupby carry the userId labels: pairing `unique()` (appearance
        # order) with the groupby result (sorted order) mislabels users.
        scored = hits[["userId"]].copy()
        scored["dcg"] = discounted_gain.to_numpy()
        return scored.groupby("userId")["dcg"].sum().reset_index()

    def cal_idcg(self):
        """
        the Ideal DCG, is the DCG for the best ranking possible (i.e. all true positives were recommended first).

        :return: dataframe, columns = ['userId', 'idcg']
        """
        # Fake an ideal ranking: each user's test items get ranks 1, 2, ...
        # cumcount numbers the rows within each user, so the test set does
        # not have to be sorted or grouped by user.
        ideal_rank = self.test_set.groupby("userId").cumcount() + 1
        test_ideal_ranking = self.test_set.assign(rank=ideal_rank)
        # Filter to have at most top-N items.
        test_ideal_ranking = self.filter_to_top_n(test_ideal_ranking)
        # get the dcg for the ideal ranking
        idcg = self.cal_dcg(test_ideal_ranking)
        return idcg.rename(columns={"dcg": "idcg"})

    @staticmethod
    def count_positives(dataset):
        """
        Returns the positives count.

        :param dataset: dataframe with at least the columns ['userId', 'itemId']
        :return: dataframe, columns = ['userId', 'positive'], one row per user
        """
        # reset_index keeps each count attached to its own userId.
        return dataset.groupby("userId")["itemId"].count().rename("positive").reset_index()
# if __name__ == '__main__':
# recoms = pd.DataFrame({
# 'userId': [1, 1, 1, 2, 2, 2, 3, 3, 3],
# 'itemId': [1, 2, 3, 4, 1, 2, 2, 3, 4],
# 'rank': [1, 2, 3, 1, 2, 3, 1, 2, 3]
# })
# test = pd.DataFrame({
# 'userId': [1, 1, 2, 3],
# 'itemId': [1, 4, 1, 5]
# })
# eval = ModelEvaluator(test_set=test, top_n=2)
# assert eval.num_users == 3, 'number of users'
# assert eval.top_n == 2, 'number of top n'
# eval.top_n = 3
# assert eval.top_n == 3, 'changing of top n'
# print(eval.cal_hit_ratio(recoms))
# print(eval.cal_ndcg(recoms))
+169
View File
@@ -0,0 +1,169 @@
import sys
import random
import pandas as pd
import copy
from pygrex.data_reader.data_reader import DataReader
def fix_data_reader_mappings(source: DataReader, target: DataReader):
    """
    Copy the user/item counts and the ID mappings from `source` to `target`.

    The attributes are assigned by reference (no copies are made) and
    `target` is returned for convenience.
    """
    mapping_attributes = (
        # sizes
        "_num_user",
        "_num_item",
        # original ID mappings
        "original_user_id",
        "original_item_id",
        "new_user_id",
        "new_item_id",
    )
    for attribute in mapping_attributes:
        setattr(target, attribute, getattr(source, attribute))
    return target
class Splitter:
    """
    Super Splitting Class.

    args:
        data: DataReader object, which contains in its dataset attribute 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
    """

    def __init__(self):
        pass

    @staticmethod
    def split_leave_latest_out(data: DataReader, n_latest: int = 1):
        """
        Leave N latest interactions out train/test split.

        Ref:
        Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
        analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.

        :param data: DataReader holding the full dataset.
        :param n_latest: int, number of latest interactions to be in the test set.
        :returns: train as DataReader, test as dataframe
        """
        # group items by user id and rank them by timestamp (1 = most recent);
        # method="first" breaks timestamp ties by row order.
        rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
            method="first", ascending=False
        )
        # keep in test the n_latest most recent items of every user
        test = data.dataset[rank_latest <= n_latest]
        # keep in train the rest
        train = DataReader(dataframe=data.dataset.copy())
        train.dataset = data.dataset[rank_latest > n_latest]
        # the train reader must keep the ID mappings of the full dataset
        train = fix_data_reader_mappings(data, train)
        return train, test

    @staticmethod
    def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
        """
        Leave N random interactions per user out train/test split.

        Ref:
        Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
        Boston, MA, 2011. 257-297.

        :param data: DataReader holding the full dataset.
        :param n: int, number of interactions per user to be in the test set.
        :param frac: float, fraction of each user's interactions to put in
            the test set; when given it is used instead of n.
        :returns: train as DataReader, test as dataframe

        Exits the process (sys.exit) when a user has fewer than n ratings or
        when frac is larger than 1.
        """
        min_nr_ratings_user = min(data.dataset["userId"].value_counts())
        if min_nr_ratings_user < n:
            sys.exit(
                "split_leave_n_out: There are users with less ratings than n (required number of interactions "
                "in the test set)."
            )
        if frac is not None and frac > 1:
            sys.exit("f (i.e.) fraction should be smaller than 1.")
        # group items by user id and extract a random number of items per user
        grouped = data.dataset.groupby(["userId"])
        if frac is not None:
            test = grouped.sample(frac=frac)
        else:
            test = grouped.sample(n=n)
        test = test.reset_index(drop=True)
        # anti-join: rows of the dataset that were not sampled into the test set
        train_pd = pd.merge(
            data.dataset,
            test,
            on=list(data.dataset.columns),
            how="outer",
            indicator=True,
        )
        train_pd = train_pd[train_pd["_merge"] == "left_only"]
        train_pd = train_pd.drop(columns="_merge")
        train = copy.deepcopy(data)
        train.dataset = train_pd
        train = fix_data_reader_mappings(data, train)
        # every interaction must end up in exactly one of the two sets
        assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]
        return train, test

    def rel_plus_n(
        self,
        data,
        negative_sample_size: int = 99,
        splitting: str = "latest",
        n: int = 1,
    ):
        """
        RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
        rated items. Then a set of random items with unknown relevance ($NR_u$), is extracted for each user $u$, where $u$
        had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests a ranking
        of the top-$N$ items from the set $ {i} cup NR_u$, on which the evaluation is performed. The evaluation metrics
        are averaged over all the items in $HR_u$ and later over all the users. In the following, all experiments have been
        conducted according to this protocol.

        Ref:
        - Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010. Performance of Recommender Algorithms on Top-n
        Recommendation Tasks. InProceedings ofthe Fourth ACM Conference on Recommender Systems (RecSys 10).
        - Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
        Filtering. In Proceedings of the 26th InternationalConference on World Wide Web (WWW 17).

        :param data: DataReader holding the full dataset.
        :param negative_sample_size: how many negative items to sample per user
        :param splitting: either "latest" for leave n latest out, or "n" for leave n out
        :param n: how many to leave out
        :returns: train as DataReader, and the test rows concatenated with the sampled negatives
        """
        if splitting == "latest":
            train, test = self.split_leave_latest_out(data, n)
        elif splitting == "n":
            train, test = self.split_leave_n_out(data, n)
        else:
            sys.exit('splitting can be either "latest" or "n". ')
        neg_sample = self.sample_negative(data, negative_sample_size)
        return train, pd.concat([test, neg_sample], ignore_index=True)

    @staticmethod
    def sample_negative(data, negative_sample_size):
        """
        Sample, for every user, `negative_sample_size` items the user has not
        interacted with.

        :param data: object whose `dataset` attribute is a dataframe with
            the columns 'userId' and 'itemId'.
        :param negative_sample_size: number of negative items per user.
        :return: dataframe, columns = ['userId', 'itemId'], with
            negative_sample_size rows per user.

        random.sample raises ValueError when a user has fewer than
        negative_sample_size non-interacted items.
        """
        item_catalogue = set(data.dataset["itemId"])
        interacted_items = data.dataset.groupby("userId")["itemId"].apply(set)
        user_ids = []
        item_ids = []
        for user_id, seen_items in interacted_items.items():
            negative_items = item_catalogue - seen_items
            # random.sample() requires a sequence: passing a set raises
            # TypeError on Python >= 3.11. tuple() is the same conversion
            # older Pythons applied internally.
            samples = random.sample(tuple(negative_items), negative_sample_size)
            user_ids.extend([int(user_id)] * negative_sample_size)
            item_ids.extend(int(item) for item in samples)
        return pd.DataFrame.from_dict({"userId": user_ids, "itemId": item_ids})
+18
View File
@@ -0,0 +1,18 @@
from .individual.model_based_emf import EMFExplainer
from .individual.model_based_als_explain import ALSExplainer
from .individual.post_hoc_association_rules import ARPostHocExplainer
from .individual.post_hoc_knn import KNNPostHocExplainer
from .groups.rule_based_group_rec_explainer import RuleBasedGroupRecExplainer
from .groups.sliding_window_explainer import SlidingWindowExplainer
from .groups.lore4groups_explainer import LORE4GroupsExplainer
__all__ = [
"EMFExplainer",
"ALSExplainer",
"ARPostHocExplainer",
"KNNPostHocExplainer",
"RuleBasedGroupRecExplainer",
"SlidingWindowExplainer",
"LORE4GroupsExplainer",
]
+10
View File
@@ -0,0 +1,10 @@
from .rule_based_group_rec_explainer import RuleBasedGroupRecExplainer
from .sliding_window_explainer import SlidingWindowExplainer
from .lore4groups_explainer import LORE4GroupsExplainer
__all__ = [
"RuleBasedGroupRecExplainer",
"SlidingWindowExplainer",
"LORE4GroupsExplainer",
]
@@ -0,0 +1,731 @@
import pandas as pd
import numpy as np
import re
import logging
import traceback
from collections import Counter
from typing import Dict, Set, List, Optional, Any, Tuple, Union
from sklearn.tree import DecisionTreeClassifier, _tree
# Item identifier; may be given as a string or an integer.
ItemId = Union[str, int]
# User identifier; may be given as a string or an integer.
UserId = Union[str, int]
# A factual rule: the list of rule strings along one decision path.
FactualRule = List[str]
# A set of counterfactuals: one list of rule strings per counterfactual path.
CounterfactualSet = List[List[str]]
# An explanation pairs a factual rule with its counterfactual set.
Explanation = Tuple[Optional[FactualRule], Optional[CounterfactualSet]]
class LORE4GroupsExplainer:
    """
    Enhanced LORE4Groups explainer that incorporates genre information
    and stores decision trees for visualization.

    For every (group member, recommended item) pair a local decision tree is
    trained on items from the member's history that are similar to the
    recommended item; the member's factual rule and counterfactuals are read
    from that tree and then aggregated over the group.
    """

    def __init__(
        self,
        item_profiles: Dict[str, Set[str]],
        item_label_matrix: pd.DataFrame,
        config: Dict,
        genre_profiles: Optional[Dict[str, Set[str]]] = None,
    ):
        """
        Args:
            item_profiles: Mapping from item ID to the item's set of tags.
                Keys are normalized to strings.
            item_label_matrix: DataFrame whose columns are the tag labels
                (optionally plus a "like" column). Rows are looked up by the
                string form of the item ID.
            config: Configuration dictionary; the explainer parameters are
                read from config["explainer"]["lore4groups"].
            genre_profiles: Optional mapping from item ID to the item's set
                of genres. Keys are normalized to strings.
        """
        self.item_profiles = {str(k): v for k, v in item_profiles.items()}
        self.item_label_matrix = item_label_matrix
        self.params = config["explainer"]["lore4groups"]
        # Genre information (empty when none is supplied)
        self.genre_profiles = (
            {str(k): v for k, v in genre_profiles.items()} if genre_profiles else {}
        )
        all_columns = item_label_matrix.columns.tolist()
        self.all_labels = [col for col in all_columns if col != "like"]
        # Add 'like' back for target variable access (but not as feature);
        # it always ends up last.
        if "like" in all_columns:
            self.all_labels.append("like")

    def _enhanced_jaccard_similarity(self, item1_id: ItemId, item2_id: ItemId) -> float:
        """
        Jaccard similarity over the union of each item's tags and genres.

        Genres are added as "genre_<lowercased genre>" features. Returns 0.0
        when either item has no features at all.
        """
        # Get regular tags
        tags1 = self.item_profiles.get(str(item1_id), set())
        tags2 = self.item_profiles.get(str(item2_id), set())
        # Get genres and add them as features
        genres1 = self.genre_profiles.get(str(item1_id), set())
        genres2 = self.genre_profiles.get(str(item2_id), set())
        # Combine tags and genres for enhanced similarity
        features1 = tags1.union({f"genre_{g.lower()}" for g in genres1})
        features2 = tags2.union({f"genre_{g.lower()}" for g in genres2})
        if not features1 or not features2:
            return 0.0
        union_len = len(features1.union(features2))
        intersection_len = len(features1.intersection(features2))
        return intersection_len / union_len if union_len > 0 else 0.0

    def _jaccard_similarity(self, item1_id: ItemId, item2_id: ItemId) -> float:
        """Original tag-only Jaccard similarity (kept for compatibility)."""
        tags1 = self.item_profiles.get(str(item1_id), set())
        tags2 = self.item_profiles.get(str(item2_id), set())
        if not tags1 or not tags2:
            return 0.0
        union_len = len(tags1.union(tags2))
        return len(tags1.intersection(tags2)) / union_len if union_len > 0 else 0.0

    def _get_enhanced_similar_examples(
        self,
        user_id_consecutive: UserId,
        target_item_id: ItemId,
        user_hist: Set[ItemId],
        dataset: pd.DataFrame,
        model=None,
        data_reader=None,
    ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """
        Build the local training set for one user and one target item.

        Args:
            user_id_consecutive: User ID as it appears in `dataset["userId"]`.
            target_item_id: The recommended item to explain.
            user_hist: Items the user has interacted with.
            dataset: Ratings frame with the columns 'userId', 'itemId', 'rating'.
            model: Optional model with a `predict(user, item)` method, used to
                rate similar items the user has no rating for.
            data_reader: Optional reader used to map consecutive IDs back to
                original IDs before calling `model.predict`.

        Returns:
            (examples, metadata). `examples` has one row per similar item
            with the columns 'movie_id', 'rating', 'like' plus one binary
            column per feature. Both values are empty when no usable
            two-class training set can be built.
        """
        # 1. Rank the user's history by enhanced similarity to the target
        similarities = sorted(
            (
                (seen_id, self._enhanced_jaccard_similarity(target_item_id, seen_id))
                for seen_id in user_hist
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )
        sim_th = self.params.get("similarity_threshold", 0.0)
        top_similar_items = {
            item_id
            for item_id, score in similarities[: self.params["n_similar_for_tree"]]
            if score >= sim_th
        }
        if not top_similar_items:
            return pd.DataFrame(), {}

        # 2. Build the local dataset
        top_similar_items_int = [int(i) for i in top_similar_items]
        # Get existing ratings for similar items
        local_df = dataset[
            (dataset["userId"] == user_id_consecutive)
            & (dataset["itemId"].isin(top_similar_items_int))
        ].copy()
        rated_items = set(local_df["itemId"])
        items_to_predict = [
            item for item in top_similar_items_int if item not in rated_items
        ]
        # Add predictions for unrated items (best effort: a failure here is
        # reported and the explainer continues with the rated items only)
        if model and data_reader and items_to_predict:
            try:
                orig_user_id = data_reader.get_original_user_id(
                    int(user_id_consecutive)
                )
                predicted_ratings = []
                for item_id_consecutive in items_to_predict:
                    orig_item_id = data_reader.get_original_item_id(
                        int(item_id_consecutive)
                    )
                    pred = model.predict(orig_user_id, orig_item_id)
                    predicted_ratings.append(
                        {
                            "userId": user_id_consecutive,
                            "itemId": item_id_consecutive,
                            "rating": float(pred),
                        }
                    )
                if predicted_ratings:
                    pred_df = pd.DataFrame(predicted_ratings)
                    local_df = pd.concat([local_df, pred_df], ignore_index=True)
            except Exception:
                traceback.print_exc()
        # Check minimum samples requirement
        if len(local_df) < 2:
            return pd.DataFrame(), {}

        # 3. Apply thresholding with fallbacks
        rating_threshold = self.params["rating_threshold_for_like"]
        threshold_info = {
            "was_overridden": False,
            "original_threshold": rating_threshold,
            "final_threshold": rating_threshold,
        }
        local_df["like"] = (local_df["rating"] >= rating_threshold).astype(int)
        like_counts = local_df["like"].value_counts()
        if len(like_counts) < 2:
            # Only one class with the configured threshold: fall back to
            # the user's mean rating over the local items.
            mean_rating = local_df["rating"].mean()
            local_df["like"] = (local_df["rating"] >= mean_rating).astype(int)
            threshold_info["was_overridden"] = True
            threshold_info["final_threshold"] = mean_rating
            like_counts = local_df["like"].value_counts()
            if len(like_counts) < 2:
                return pd.DataFrame(), {}
        # Severe imbalance (>90% one class) is only accepted when the
        # minority class still has at least two examples.
        min_class_ratio = like_counts.min() / len(local_df)
        if min_class_ratio < 0.1 and like_counts.min() < 2:
            return pd.DataFrame(), {}

        # 4. Construct the enhanced feature matrix (including genres)
        feature_labels = [label for label in self.all_labels if label != "like"]
        examples = []
        genre_features_used = set()
        for _, row in local_df.iterrows():
            item_id = str(int(row["itemId"]))
            tags = self.item_profiles.get(item_id, set())
            genres = self.genre_profiles.get(item_id, set())
            # Create base example with target variables
            example = {
                "movie_id": item_id,
                "rating": row["rating"],
                "like": int(row["like"]),
            }
            # Add tag features (excluding 'like')
            for label in feature_labels:
                example[label] = 1 if label in tags else 0
            # Add genre features dynamically; feature_labels grows as new
            # genres are discovered.
            for genre in genres:
                genre_feature = f"genre_{genre.lower()}"
                example[genre_feature] = 1
                genre_features_used.add(genre_feature)
                if genre_feature not in feature_labels:
                    feature_labels.append(genre_feature)
            examples.append(example)
        # Examples built before a genre was first seen lack that column
        for example in examples:
            for genre_feature in genre_features_used:
                if genre_feature not in example:
                    example[genre_feature] = 0
        final_df = pd.DataFrame(examples)
        # Final validation
        if final_df["like"].nunique() < 2:
            return pd.DataFrame(), {}

        # Prepare metadata for visualization
        metadata = {
            "feature_labels": [label for label in feature_labels if label != "like"],
            "genre_features": list(genre_features_used),
            "similarity_scores": dict(similarities[:5]),  # Top 5 similarities
            "target_item_genres": self.genre_profiles.get(str(target_item_id), set()),
            "rating_threshold": threshold_info["final_threshold"],
            "threshold_info": threshold_info,
        }
        return final_df, metadata

    def _get_factual_path_for_item(
        self,
        clf: DecisionTreeClassifier,
        x_item: pd.DataFrame,
        metadata: Dict[str, Any],
    ) -> Optional[List[str]]:
        """
        Traces the specific path an item takes through the decision tree
        and returns the corresponding factual rule set.

        Returns None when the metadata carries no feature labels or when the
        path contains no split (the tree is a single leaf).
        """
        feature_labels = metadata.get("feature_labels", [])
        if not feature_labels:
            return None
        # 1. Get the sequence of nodes the item travels through
        node_indicator = clf.decision_path(x_item)
        node_index = node_indicator.indices[  # type: ignore
            node_indicator.indptr[0] : node_indicator.indptr[1]  # type: ignore
        ]
        tree = clf.tree_
        rules = []
        # 2. Walk consecutive (node, child) pairs; the last node is the leaf
        for node_id, child_node_id in zip(node_index[:-1], node_index[1:]):
            # Ensure this is not a leaf node
            if tree.feature[node_id] == _tree.TREE_UNDEFINED:  # type: ignore
                continue
            feature_name = feature_labels[tree.feature[node_id]]  # type: ignore
            threshold = tree.threshold[node_id]  # type: ignore
            # 3. Left child means the "<= threshold" condition was true
            went_left = child_node_id == tree.children_left[node_id]  # type: ignore
            if feature_name.startswith("genre_"):
                genre_name = feature_name.replace("genre_", "").title()
                if went_left:
                    rules.append(f"Does NOT have genre: `{genre_name}`")
                else:
                    rules.append(f"Has genre: `{genre_name}`")
            elif went_left:
                rules.append(f"{feature_name} <= {threshold:.2f}")
            else:
                rules.append(f"{feature_name} > {threshold:.2f}")
        return rules if rules else None

    def _train_enhanced_decision_tree(
        self,
        user_id_consecutive: UserId,
        item_id: ItemId,
        user_hist: Set[ItemId],
        dataset: pd.DataFrame,
        model=None,
        data_reader=None,
    ) -> Tuple[Optional[DecisionTreeClassifier], Dict[str, Any]]:
        """
        Train the local decision tree for one user and one item.

        Returns:
            (classifier, metadata) on success. The metadata of
            `_get_enhanced_similar_examples` is extended with the classifier,
            feature importances, training size and class distribution.
            (None, {}) when no usable training set exists or fitting fails.
        """
        df_examples, metadata = self._get_enhanced_similar_examples(
            user_id_consecutive, item_id, user_hist, dataset, model, data_reader
        )
        if df_examples.empty:
            return None, {}
        # Both classes must be present with at least two examples each
        like_counts = df_examples["like"].value_counts()
        if len(like_counts) < 2 or like_counts.min() < 2:
            return None, {}
        feature_labels = metadata.get("feature_labels", [])
        X = df_examples[feature_labels]
        y = df_examples["like"]
        # A feature matrix without any variance cannot be split on
        feature_variances = X.var()
        if (feature_variances == 0).all():
            return None, {}
        clf = DecisionTreeClassifier(
            max_depth=5,  # Slightly deeper to accommodate genre features
            min_samples_split=max(4, len(df_examples) // 4),
            min_samples_leaf=2,
            random_state=42,
            class_weight="balanced",
        )
        try:
            clf.fit(X, y)
            # Enhanced feature importance analysis
            feature_importance = list(zip(feature_labels, clf.feature_importances_))
            important_features = [
                (f, imp) for f, imp in feature_importance if imp > 0.001
            ]
            genre_important_features = [
                (f, imp) for f, imp in important_features if f.startswith("genre_")
            ]
            # Add classifier and feature info to metadata
            metadata.update(
                {
                    "classifier": clf,
                    "feature_importance": dict(feature_importance),
                    "important_features": important_features,
                    "genre_important_features": genre_important_features,
                    "training_data_size": len(df_examples),
                    "class_distribution": like_counts.to_dict(),
                }
            )
            return clf, metadata
        except Exception:
            # Still best effort (the pair is simply not explainable), but the
            # failure is logged instead of being swallowed silently.
            logging.exception(
                "Decision tree training failed for user %s, item %s",
                user_id_consecutive,
                item_id,
            )
            return None, {}

    def _get_enhanced_explanation_path(
        self,
        clf: DecisionTreeClassifier,
        x_item: pd.DataFrame,
        metadata: Dict[str, Any],
    ) -> Optional[List[str]]:
        """
        Rules along the item's decision path, only if it ends in a "like" leaf.

        Returns None when the classifier never saw class 1 or when the leaf
        the item falls into holds no class-1 weight.
        """
        if 1 not in clf.classes_:
            return None
        leaf_id = clf.apply(x_item)[0]  # type: ignore
        class_index = np.where(clf.classes_ == 1)[0]
        if not class_index.size or clf.tree_.value[leaf_id][0][class_index[0]] == 0:  # type: ignore
            return None
        node_indicator = clf.decision_path(x_item)
        node_index = node_indicator.indices[  # type: ignore
            node_indicator.indptr[0] : node_indicator.indptr[1]  # type: ignore
        ]
        rules = []
        feature_labels = metadata.get("feature_labels", [])
        for i in range(len(node_index) - 1):  # Exclude leaf node
            node_id = node_index[i]
            next_node_id = node_index[i + 1]
            if clf.tree_.feature[node_id] != _tree.TREE_UNDEFINED:  # type: ignore
                feature_name = feature_labels[clf.tree_.feature[node_id]]  # type: ignore
                threshold = clf.tree_.threshold[node_id]  # type: ignore
                went_left = next_node_id == clf.tree_.children_left[node_id]  # type: ignore
                # Enhanced rule formatting based on feature type
                if feature_name.startswith("genre_"):
                    genre_name = feature_name.replace("genre_", "").title()
                    if went_left:
                        rules.append(f"Does NOT have genre: `{genre_name}`")
                    else:
                        rules.append(f"Has genre: `{genre_name}`")
                else:
                    # Regular tag features
                    if went_left:
                        rules.append(f"{feature_name} <= {threshold}")
                    else:
                        rules.append(f"{feature_name} > {threshold}")
        return rules

    def _generate_enhanced_individual_explanation(
        self, clf: DecisionTreeClassifier, item_id: ItemId, metadata: Dict[str, Any]
    ) -> Optional[Explanation]:
        """
        Produce (factual rule, counterfactual set) for one item and one tree.

        Returns None when the item is missing from the label matrix, when a
        training feature cannot be found for it, or when either the factual
        rule or the counterfactual set is empty.
        """
        if str(item_id) not in self.item_label_matrix.index:
            return None
        x_item_full = self.item_label_matrix.loc[[str(item_id)]]
        feature_labels = metadata.get("feature_labels", [])
        try:
            # For genre features, we need to dynamically add them to the item
            item_genres = self.genre_profiles.get(str(item_id), set())
            # Create enhanced item representation
            enhanced_item_data = x_item_full.copy()
            # Add genre features (only those the tree was trained on)
            for genre in item_genres:
                genre_feature = f"genre_{genre.lower()}"
                if genre_feature in feature_labels:
                    enhanced_item_data[genre_feature] = 1
            # Ensure all genre features exist (set to 0 if not present)
            for feature in feature_labels:
                if (
                    feature.startswith("genre_")
                    and feature not in enhanced_item_data.columns
                ):
                    enhanced_item_data[feature] = 0
            # Select only the features used in training, in training order
            x_item = enhanced_item_data[feature_labels]
        except KeyError:
            return None
        # Get enhanced factual rule
        factual_rule = self._get_factual_path_for_item(clf, x_item, metadata)
        if not factual_rule:
            return None
        # Get counterfactuals (reuse existing method)
        counterfactual_set = self._get_counterfactual_paths(clf, x_item)
        if not counterfactual_set:
            return None
        return (factual_rule, counterfactual_set)

    def _get_counterfactual_paths(
        self, clf: DecisionTreeClassifier, x_item: pd.DataFrame
    ) -> Optional[CounterfactualSet]:
        """
        Find the root-to-leaf paths towards class 0 that are closest to the item.

        Every leaf holding class-0 weight yields a candidate path. Candidates
        are scored by the number of split conditions the item violates and
        all paths with the minimal score are returned, formatted as rule
        strings. Returns None when there is no such path.
        """
        tree = clf.tree_
        paths = []

        def find_paths(node_id, current_path):
            # Depth-first enumeration of all root-to-leaf paths
            if tree.feature[node_id] == _tree.TREE_UNDEFINED:  # type: ignore
                class_index = np.where(clf.classes_ == 0)[0]
                if class_index.size and tree.value[node_id][0][class_index[0]] > 0:
                    paths.append(list(current_path))
                return
            feature_idx = tree.feature[node_id]  # type: ignore
            threshold = tree.threshold[node_id]  # type: ignore
            current_path.append((feature_idx, "<=", threshold))
            find_paths(tree.children_left[node_id], current_path)  # type: ignore
            current_path.pop()
            current_path.append((feature_idx, ">", threshold))
            find_paths(tree.children_right[node_id], current_path)  # type: ignore
            current_path.pop()

        find_paths(0, [])
        if not paths:
            return None

        # Keep the paths with the fewest conditions violated by the item
        min_nf = float("inf")
        counterfactuals = []
        for path in paths:
            nf = 0
            for feature_idx, op, threshold in path:
                if feature_idx < len(x_item.columns):
                    item_val = x_item.iloc[0, feature_idx]
                    if not (
                        (op == "<=" and item_val <= threshold)
                        or (op == ">" and item_val > threshold)
                    ):
                        nf += 1
            if nf < min_nf:
                min_nf = nf
                counterfactuals = [path]
            elif nf == min_nf:
                counterfactuals.append(path)

        # Enhanced counterfactual formatting
        formatted_counterfactuals = []
        for cf_path in counterfactuals:
            formatted_path = []
            for idx, op, _ in cf_path:
                if idx < len(x_item.columns):
                    feature_name = x_item.columns[idx]
                    if feature_name.startswith("genre_"):
                        genre_name = feature_name.replace("genre_", "").title()
                        if op == "<=":
                            formatted_path.append(
                                f"Does NOT have genre: `{genre_name}`"
                            )
                        else:
                            formatted_path.append(f"Has genre: `{genre_name}`")
                    else:
                        formatted_path.append(f"{feature_name} {op} 0.5")
            if formatted_path:
                formatted_counterfactuals.append(formatted_path)
        return formatted_counterfactuals if formatted_counterfactuals else None

    def _aggregate_factual_rules(
        self, individual_explanations: Dict[UserId, List[str]], total_group_size: int
    ) -> Dict[str, List[str]]:
        """
        Aggregates individual factual rules into a group consensus.

        Each rule is annotated with the number of members supporting it and
        placed in exactly one category: "unanimous" (all members), "majority"
        (more than half) or "minority" (everything else). Rules that
        contradict each other across members are dropped.

        Args:
            individual_explanations: Mapping from member ID to that member's
                factual rule strings.
            total_group_size: Number of members in the group.
        """
        # Count supporting members: set() keeps a rule that is repeated
        # inside one member's list from being counted more than once.
        rule_counts = Counter(
            rule
            for member_rules in individual_explanations.values()
            for rule in set(member_rules)
        )
        categorized_rules = {"unanimous": [], "majority": [], "minority": []}
        if not rule_counts:
            return categorized_rules
        majority_threshold = (total_group_size // 2) + 1 if total_group_size > 1 else 1
        cleaned_rules_set = self._clean_contradictory_rules(set(rule_counts))
        for rule in sorted(cleaned_rules_set):
            count = rule_counts[rule]
            rule_with_support = f"{rule} ({count}/{total_group_size} members)"
            if count == total_group_size:
                categorized_rules["unanimous"].append(rule_with_support)
            elif count >= majority_threshold:
                categorized_rules["majority"].append(rule_with_support)
            else:
                # Every counted rule has at least one supporter
                categorized_rules["minority"].append(rule_with_support)
        return categorized_rules

    @staticmethod
    def _parse_rule(rule: str) -> Optional[Tuple[Tuple[str, str], str]]:
        """Return ((kind, attribute), operator) for a rule string, or None if it cannot be parsed."""
        if "Has genre:" in rule or "Does NOT have genre:" in rule:
            genre_match = re.search(r"`([^`]+)`", rule)
            if not genre_match:
                return None
            op = "has" if "Has genre:" in rule else "not_has"
            return ("genre", genre_match.group(1)), op
        match = re.match(r"(.+?)\s*([<>]=?)\s*(\d+\.?\d*)", rule)
        if not match:
            return None
        attr, op, _ = match.groups()
        return ("numeric", attr.strip()), op

    def _clean_contradictory_rules(self, rules_set: Set[str]) -> Set[str]:
        """
        Drop every rule whose attribute is constrained in contradictory ways.

        A genre is contradictory when the set contains both "Has genre" and
        "Does NOT have genre" for it; a regular attribute when it has both an
        upper-bound ("<=", "<") and a lower-bound (">", ">=") rule. Rules are
        matched by their exact parsed attribute, so a contradiction on "war"
        does not remove rules about "warrior". Rules that cannot be parsed
        are kept.
        """
        parsed = {rule: self._parse_rule(rule) for rule in rules_set}
        ops_by_attr = {}
        for parsed_rule in parsed.values():
            if parsed_rule is not None:
                attr_key, op = parsed_rule
                ops_by_attr.setdefault(attr_key, set()).add(op)
        # Find contradictory attributes
        invalid_attrs = set()
        for attr_key, ops in ops_by_attr.items():
            if attr_key[0] == "genre":
                # Genre contradiction: has and not_has same genre
                if "has" in ops and "not_has" in ops:
                    invalid_attrs.add(attr_key)
            elif ops & {"<=", "<"} and ops & {">", ">="}:
                # Numerical contradiction: <= and >
                invalid_attrs.add(attr_key)
        # Remove contradictory rules
        return {
            rule
            for rule, parsed_rule in parsed.items()
            if parsed_rule is None or parsed_rule[0] not in invalid_attrs
        }

    def find_explanation(
        self,
        recommended_items: List[ItemId],
        members: List[UserId],
        user_hist: Dict[UserId, Set[ItemId]],
        dataset: pd.DataFrame,
        model=None,
        data_reader=None,
    ) -> Dict[str, Any]:
        """
        Explain every recommended item for a group.

        Args:
            recommended_items: Items recommended to the group.
            members: IDs of the group members.
            user_hist: Mapping from member ID to the items that member
                interacted with.
            dataset: Ratings frame with the columns 'userId', 'itemId', 'rating'.
            model: Optional model used to predict missing ratings.
            data_reader: Reader used to translate user and item IDs. Required.

        Returns:
            {"fidelity": share of recommended items that could be explained,
             "details": {item_id: explanation}}. Each explanation holds the
            decision path of the first member with an explanation, the
            aggregated group rules, the per-member counterfactuals and, when
            available, the tree data used for visualization.

        Raises:
            ValueError: If `data_reader` is None.
        """
        if data_reader is None:
            raise ValueError(
                "A 'data_reader' object must be provided to find explanations."
            )
        detailed_explanations = {}
        explainable_count = 0
        if not recommended_items:
            return {"fidelity": 0.0, "details": {}}
        for item_id in recommended_items:
            all_individual_rules = {}
            all_counterfactuals = {}
            stored_classifiers = {}  # Store classifiers for visualization
            stored_metadata = {}  # Store metadata for visualization
            representative_decision_path = None
            threshold_info_for_item = None
            for user_id in members:
                user_id_consecutive = data_reader.get_new_user_id(user_id)
                clf, metadata = self._train_enhanced_decision_tree(
                    user_id_consecutive,
                    item_id,
                    user_hist.get(user_id, set()),
                    dataset,
                    model,
                    data_reader,
                )
                if clf and metadata:
                    # Threshold info of the first member with a trained tree
                    if threshold_info_for_item is None and "threshold_info" in metadata:
                        threshold_info_for_item = metadata["threshold_info"]
                    explanation = self._generate_enhanced_individual_explanation(
                        clf, item_id, metadata
                    )
                    if explanation:
                        r, phi = explanation
                        all_individual_rules[user_id] = r
                        all_counterfactuals[user_id] = phi
                        if representative_decision_path is None:
                            representative_decision_path = r
                        # Store for visualization (use first successful classifier)
                        if not stored_classifiers:
                            stored_classifiers[user_id] = clf
                            stored_metadata[user_id] = metadata
            total_members_in_group = len(members)
            factual_set = self._aggregate_factual_rules(
                all_individual_rules, total_members_in_group
            )
            if representative_decision_path and factual_set:
                explainable_count += 1
                # Enhanced detailed explanations with visualization data
                item_explanation = {
                    "decision_path": representative_decision_path,
                    "group_factual_rule": factual_set,
                    "individual_counterfactuals": all_counterfactuals,
                }
                if threshold_info_for_item:
                    item_explanation["threshold_info"] = threshold_info_for_item
                # Add visualization data if available
                if stored_classifiers:
                    user_id_for_viz = list(stored_classifiers.keys())[0]
                    item_explanation.update(
                        {
                            "decision_tree": stored_classifiers[user_id_for_viz],
                            "feature_names": stored_metadata[user_id_for_viz].get(
                                "feature_labels", []
                            ),
                            "tree_metadata": stored_metadata[user_id_for_viz],
                            "item_genres": self.genre_profiles.get(str(item_id), set()),
                        }
                    )
                detailed_explanations[item_id] = item_explanation
        fidelity = (
            explainable_count / len(recommended_items) if recommended_items else 0.0
        )
        group_explanations = {
            "fidelity": fidelity,
            "details": detailed_explanations,
        }
        logging.info(
            f"Enhanced fidelity for {members}: {fidelity:.3f} ({explainable_count}/{len(recommended_items)})"
        )
        return group_explanations
@@ -0,0 +1,314 @@
"""Rule-based group recommendation explainer module."""
from typing import Dict, List, Optional, Set, Union
import logging
from pygrex.data_reader.data_reader import DataReader
from pygrex.utils.association_rules import AssociationRules
# Type aliases for better readability
# An item identifier, accepted either as a string or as an integer.
ItemId = Union[str, int]
# A group-member (user) identifier, accepted either as a string or as an integer.
MemberId = Union[str, int]
# Maps each member ID to the set of item IDs recorded for that member.
UserHistory = Dict[MemberId, Set[ItemId]]
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
class RuleBasedGroupRecExplainer:
    """
    Explain group recommendations with association rules.

    A recommended item is considered explained when at least one rule that
    lists the item among its consequents is supported by the interaction
    history of the group members. Two notions of support are offered:

    * the basic one (``find_explanation`` / ``get_explanation_details``),
      where at least ``min_members_threshold`` members must each have
      interacted with every antecedent item, and
    * the advanced one (``compute_group_fidelity_advanced``), where every
      member must have interacted with at least one antecedent item and
      every antecedent item must have been seen by at least one member.

    Item IDs are compared as strings on both sides, so a history holding
    integer IDs still matches rules holding string IDs (and vice versa).
    """

    def __init__(
        self,
        rules: AssociationRules,
        data: DataReader,
        pool_recommendations: Optional[Union[List[ItemId], ItemId]] = None,
        members: Optional[List[MemberId]] = None,
        user_history: Optional[UserHistory] = None,
        min_members_threshold: int = 1,
    ) -> None:
        """
        Initialize the RuleBasedGroupRecExplainer.

        Args:
            rules: An instance of AssociationRules containing the rules for
                explanations. It is indexed by column name ("consequents")
                and by boolean mask.
            data: DataReader used to translate recommended item IDs with
                ``get_new_item_id`` before they are looked up in the rules.
            pool_recommendations: A list of item IDs to explain, or a single
                item ID.
            members: A list of member IDs in the group.
            user_history: A dictionary mapping member IDs to sets of item IDs
                they have interacted with.
            min_members_threshold: Minimum number of members that must
                satisfy the rule for it to be considered valid.

        Raises:
            ValueError: If min_members_threshold is less than 1.
        """
        if min_members_threshold < 1:
            raise ValueError("min_members_threshold must be at least 1")

        self.rules = rules
        self.members = members or []
        self.min_members_threshold = min_members_threshold
        self.user_history = user_history or {}
        self.data = data
        # Normalize pool_recommendations to always be a list
        self.pool_recommendations = self._normalize_recommendations(
            pool_recommendations
        )

    def _normalize_recommendations(
        self, recommendations: Optional[Union[List[ItemId], ItemId]]
    ) -> List[ItemId]:
        """
        Normalize recommendations input to a list format.

        Args:
            recommendations: Single item ID, iterable of item IDs, or None.

        Returns:
            A new list of item IDs (never the caller's own container).
        """
        if recommendations is None:
            return []
        if isinstance(recommendations, (str, int)):
            return [recommendations]
        try:
            # Copy so later changes to the caller's container do not leak in,
            # and so tuples/sets/arrays honour the declared List return type.
            return list(recommendations)
        except TypeError:
            # Non-iterable scalar that is not a plain str/int: one item.
            return [recommendations]

    @staticmethod
    def _as_str_set(items) -> Set[str]:
        """Return *items* as a set of strings so IDs compare regardless of type."""
        return {str(item) for item in items}

    def _member_history_as_str(self, member: MemberId) -> Set[str]:
        """Return the stringified interaction history of *member* (empty if unknown)."""
        return self._as_str_set(self.user_history.get(member, ()))

    def _is_rule_satisfied_by_member(
        self, member: MemberId, antecedent: Set[ItemId]
    ) -> bool:
        """
        Check if a member satisfies the rule's antecedent.

        Args:
            member: The member ID to check.
            antecedent: The set of items that form the rule's antecedent.

        Returns:
            True if the member's history contains all items in the antecedent.
        """
        return self._member_history_as_str(member).issuperset(
            self._as_str_set(antecedent)
        )

    def _count_satisfied_members(
        self, antecedent: Set[ItemId], member_histories=None
    ) -> int:
        """
        Count how many members satisfy the given antecedent.

        Args:
            antecedent: The set of items that form the rule's antecedent.
            member_histories: Optional pre-computed stringified histories, one
                per entry of ``self.members``. Passing them avoids converting
                every history again for every rule.

        Returns:
            Number of members whose history satisfies the antecedent.
        """
        required = self._as_str_set(antecedent)
        if member_histories is None:
            member_histories = [
                self._member_history_as_str(member) for member in self.members
            ]
        return sum(1 for history in member_histories if history.issuperset(required))

    def _find_applicable_rules(self, item_id: ItemId):
        """
        Find rules that have the given item in their consequents.

        Args:
            item_id: The item ID to find rules for.

        Returns:
            DataFrame containing applicable rules.
        """
        # The rules are looked up by the ID returned from the DataReader mapping.
        mapped_id = str(self.data.get_new_item_id(item_id))  # type: ignore
        applicable_rules = self.rules[  # type: ignore
            self.rules["consequents"].apply(lambda x: mapped_id in x)  # type: ignore
        ]
        return applicable_rules

    def _qualifying_rules(self, item_id: ItemId):
        """Yield ``(rule, satisfied_count)`` for every rule that reaches the member threshold."""
        member_histories = [
            self._member_history_as_str(member) for member in self.members
        ]
        for _, rule in self._find_applicable_rules(item_id).iterrows():
            satisfied_count = self._count_satisfied_members(
                rule["antecedents"], member_histories
            )
            if satisfied_count >= self.min_members_threshold:
                yield rule, satisfied_count

    def find_explanation(self) -> float:
        """
        Generate explanations for the group recommendations based on the rules.

        Returns:
            The fidelity of the explanations, which is the ratio of explained
            recommendations to total recommendations in the pool.
        """
        if not self.pool_recommendations:
            logger.warning("No recommendations to explain")
            return 0.0

        total_recommendations = len(self.pool_recommendations)
        explained_count = sum(
            1 for item_id in self.pool_recommendations if self._can_explain_item(item_id)
        )
        fidelity = explained_count / total_recommendations
        logger.info(
            "Explained %d/%d recommendations (fidelity: %.3f)",
            explained_count,
            total_recommendations,
            fidelity,
        )
        return fidelity

    def _can_explain_item(self, item_id: ItemId) -> bool:
        """
        Check if an item can be explained by any rule.

        Args:
            item_id: The item ID to check.

        Returns:
            True if at least one rule can explain the item.
        """
        # Stop at the first qualifying rule; the remaining ones are not needed.
        for _ in self._qualifying_rules(item_id):
            logger.debug("Rule fired for item %s", item_id)
            return True
        return False

    def get_explanation_details(self) -> Dict[ItemId, List[Dict]]:
        """
        Get detailed explanations for each recommendation.

        Returns:
            Dictionary mapping item IDs to lists of applicable rule details.
            Items without any qualifying rule map to an empty list.
        """
        explanations = {}
        for item_id in self.pool_recommendations:
            explanations[item_id] = [
                {
                    "antecedent": rule["antecedents"],
                    "consequent": rule["consequents"],
                    "satisfied_members": satisfied_count,
                    "confidence": rule.get("confidence", "N/A"),
                    "support": rule.get("support", "N/A"),
                }
                for rule, satisfied_count in self._qualifying_rules(item_id)
            ]
        return explanations

    def compute_group_fidelity_advanced(self) -> float:
        """
        Compute group fidelity using advanced conditions.

        This method implements a more sophisticated fidelity calculation where:
        - Condition 1: Each member of the group must have seen at least one
          item from the antecedent
        - Condition 2: Each item in the antecedent must have been seen by at
          least one member

        Returns:
            The fidelity score as a float between 0 and 1.
        """
        if not self.pool_recommendations:
            logger.warning("No recommendations to explain")
            return 0.0
        if not self.members:
            logger.warning("No group members defined")
            return 0.0

        total_recommendations = len(self.pool_recommendations)
        # Convert member IDs to set for faster lookup
        members_set = set(self.members)
        # Union of everything any group member has seen (as strings).
        all_seen_items: Set[str] = set()
        for member in members_set:
            all_seen_items.update(self._member_history_as_str(member))

        explained_count = sum(
            1
            for item_id in self.pool_recommendations
            if self._can_explain_item_advanced(item_id, members_set, all_seen_items)
        )
        fidelity = explained_count / total_recommendations
        logger.info(
            "Advanced explanation: %d/%d recommendations (fidelity: %.3f)",
            explained_count,
            total_recommendations,
            fidelity,
        )
        return fidelity

    def _can_explain_item_advanced(
        self, item_id: ItemId, members_set: Set[MemberId], all_seen_items: Set[str]
    ) -> bool:
        """
        Check if an item can be explained using advanced conditions.

        Args:
            item_id: The item ID to check.
            members_set: Set of group member IDs.
            all_seen_items: Set of all items seen by any group member.

        Returns:
            True if the item can be explained by at least one rule satisfying
            both conditions.
        """
        # Histories do not depend on the rule, so convert them once up front.
        member_histories = [
            self._member_history_as_str(member) for member in members_set
        ]
        for _, rule in self._find_applicable_rules(item_id).iterrows():
            antecedent = self._as_str_set(rule["antecedents"])
            # Condition 1: Each member must have seen at least one antecedent item
            cond1 = all(
                not history.isdisjoint(antecedent) for history in member_histories
            )
            # Condition 2: Each antecedent item must have been seen by some member
            cond2 = antecedent.issubset(all_seen_items)
            if cond1 and cond2:
                logger.debug("Advanced rule fired for item %s", item_id)
                return True
        return False

    def _member_has_antecedent_item(
        self, member: MemberId, antecedent: Set[ItemId]
    ) -> bool:
        """
        Check if a member has seen at least one item from the antecedent.

        Args:
            member: The member ID to check.
            antecedent: The set of items in the rule's antecedent.

        Returns:
            True if the member has seen at least one item from the antecedent.
        """
        return not self._member_history_as_str(member).isdisjoint(
            self._as_str_set(antecedent)
        )
@@ -0,0 +1,434 @@
import itertools
from typing import Dict, List, Sequence, Union
from pygrex.data_reader import DataReader, GroupInteractionHandler
from pygrex.models import RecommenderModel
from pygrex.recommender import GroupRecommender
from pygrex.utils import SlidingWindowRanker, SlidingWindow, AggregationStrategy
class SlidingWindowExplainer:
    """
    Stratigi, M., Bikakis, N., Stefanidis, K.: Counterfactual explanations for group
    recommendations. In: Proceedings of the 27th International Workshop on Design,
    Optimization, Languages and Analytical Processing of Big Data (DOLAP 2025).

    A class that uses a sliding window approach to find counterfactual explanations
    for group recommendation systems.

    This class helps identify which items, if removed from the group's interaction history,
    would cause a specific target item to no longer appear in the group recommendations.
    """

    def __init__(
        self,
        config,
        data: DataReader,
        group_handler: GroupInteractionHandler,
        members: List[Union[str, int]],
        target_item: Union[str, int],
        model: RecommenderModel,
        aggregation_strategy: AggregationStrategy = AggregationStrategy.AVG_PREDICTIONS,
        window_size: int = 3,
    ):
        """
        Initialize the SlidingWindowExplainer.

        Args:
            config: Configuration object with model parameters
            data: DataReader object containing the dataset
            group_handler: Object that handles group data modifications
            members: List of user IDs in the group
            target_item: The item ID for which explanation is sought
            model: Recommender model to use for predictions
            aggregation_strategy: Strategy to aggregate individual recommendations
            window_size: Size of the sliding window
        """
        self.cfg = config
        self.data = data
        self.group_handler = group_handler
        self.members = members
        self.target_item = target_item
        self.model = model
        self.aggregation_strategy = aggregation_strategy
        self.window_size = window_size
        # Only assigned through set_sliding_window(); defined here so the
        # attribute always exists.
        self.sliding_window = None

        # Results tracking
        self.explanations_found: Dict[int, Dict] = {}
        self.calls = 0
        # Upper bound on the number of retrain-and-recommend rounds per search.
        self.max_calls = 1000
        self.item_metrics = {}

    def set_sliding_window(self, sliding_window):
        """Set the sliding window object if not provided during initialization."""
        self.sliding_window = sliding_window

    def set_item_metrics(self, metrics: Dict[Union[str, int], Dict[str, float]]):
        """Store the pre-calculated metric scores for all items."""
        self.item_metrics = metrics

    def find_explanation(
        self,
        items_rated_by_group: List[Union[str, int]],
        group_predictions: Dict,
        top_recommendation: Union[str, int],
        ranking_weights: Dict[str, float],
    ) -> Dict[int, Dict]:
        """
        Find counterfactual explanations using the full, encapsulated process.

        The rated items are ranked, a window slides over the ranking, and the
        first window whose removal drops the target item from the
        recommendations is searched for a minimal counterfactual subset.

        Args:
            items_rated_by_group: All items rated by any member of the group.
            group_predictions: The original individual predictions from the recommender.
            top_recommendation: The original top recommended item.
            ranking_weights: The weights from the UI for each ranking component.

        Returns:
            A dictionary of found explanations, including their justification
            metrics, keyed by the call number at which each was found.
        """
        self.calls = 0
        # The keys are call numbers and the counter restarts here, so results
        # of an earlier search must not linger (they would be stale and their
        # keys could collide). A fresh dict also leaves previously returned
        # results untouched.
        self.explanations_found = {}

        ranker = SlidingWindowRanker(config={})
        ranker.set_group_recommender_values(group_predictions, top_recommendation)
        ranked_items, self.item_metrics = ranker.generate_ranked_items(
            all_rated_items=items_rated_by_group,
            data=self.data,
            group_members=self.members,
            component_weights=ranking_weights,
        )
        sliding_window = SlidingWindow(
            sequence=ranked_items, window_size=self.window_size
        )

        while self.calls < self.max_calls:
            big_window = sliding_window.get_next_window()
            if big_window is None:
                break
            self.calls += 1
            # Test if removing this window affects recommendations
            if self._test_window_removal(big_window, self.target_item):
                # A counterfactual window has been found; look for a minimal
                # subset within it and stop sliding.
                self._find_minimal_subset(big_window, self.target_item)
                break

        # Also covers the case where a window qualified but the call budget
        # ran out before any subset could be recorded.
        if not self.explanations_found:
            print("Explanation could not be found")
        return self.explanations_found

    def _test_window_removal(
        self, item_ids: List[Union[str, int]], original_group_rec: Union[str, int]
    ) -> bool:
        """
        Test if removing the given items affects the group recommendation.

        Args:
            item_ids: List of item IDs to remove from group interactions
            original_group_rec: The original recommendation to compare against

        Returns:
            bool: True if the original recommendation is absent from the new
            recommendations, False otherwise
        """
        group_recommendation = self._get_recommendations_after_removal(item_ids)
        return original_group_rec not in group_recommendation

    def _get_recommendations_after_removal(
        self, item_ids: List[Union[str, int]], top_n: int = 10
    ) -> Sequence[Union[str, int]]:
        """
        Get group recommendations after removing specified items from interaction history.

        Args:
            item_ids: List of item IDs to remove from group interactions
            top_n: Number of top recommendations to return

        Returns:
            List of recommended item IDs; an empty list when the recommender
            does not return a list.
        """
        # Create modified dataset with items removed
        changed_data = self.group_handler.create_modified_dataset(
            original_data=self.data.dataset,
            group_ids=self.members,
            item_ids=item_ids,
            data=self.data,
        )
        # Create new DataReader and retrain model
        data_retrained = self._create_data_reader_and_prepare(changed_data)
        model_retrained = self._retrain_model(data_retrained)
        # Set up recommender with new model and data
        group_recommender = GroupRecommender(data_retrained)
        group_recommender.setup_recommendation(
            model_retrained,
            self.members,
            data_retrained,
            aggregation_strategy=self.aggregation_strategy,
        )
        recommendations = group_recommender.get_group_recommendations(top_n)
        if not isinstance(recommendations, list):
            return []
        return recommendations

    def _create_data_reader_and_prepare(self, changed_data):
        """
        Create and prepare a new DataReader with modified data.

        Args:
            changed_data: DataFrame with modified dataset

        Returns:
            DataReader: A new DataReader object with the modified dataset,
            with consecutive IDs assigned and ratings binarized.
        """
        data_retrained = DataReader(
            filepath_or_buffer=None,
            sep=None,
            names=None,
            skiprows=0,
            dataframe=changed_data,
        )
        data_retrained.make_consecutive_ids_in_dataset()
        data_retrained.binarize(binary_threshold=1)
        return data_retrained

    def _retrain_model(self, data):
        """
        Retrain the recommendation model with modified data.

        Args:
            data: Prepared DataReader object with modified dataset

        Returns:
            Retrained model
        """
        # NOTE(review): this refits ``self.model`` in place, so the model the
        # caller passed in ends up trained on the last modified dataset.
        # Confirm that callers do not reuse it afterwards.
        self.model.fit(data)
        return self.model

    def _find_minimal_subset(
        self, big_window: List[Union[str, int]], original_group_rec: Union[str, int]
    ) -> None:
        """
        Find minimal subset of items that act as counterfactual explanation.

        Subsets are tried in order of increasing size; the first one whose
        removal drops the original recommendation is recorded and the search
        stops. The search also stops once the call budget is used up.

        Args:
            big_window: List of item IDs to search within
            original_group_rec: The original recommendation to compare against
        """
        for length in range(1, len(big_window) + 1):
            for item_combo in itertools.combinations(big_window, length):
                # Same ">=" budget test as the window loop in find_explanation.
                if self.calls >= self.max_calls:
                    return
                subset_items = list(item_combo)
                self.calls += 1
                new_recommendations = self._get_recommendations_after_removal(
                    subset_items
                )
                if original_group_rec not in new_recommendations:
                    # The list may be empty, so there may be no replacement
                    # item to report.
                    new_rec = new_recommendations[0] if new_recommendations else None
                    self._record_explanation(subset_items, original_group_rec, new_rec)
                    return

    def _record_explanation(
        self,
        explanation_items: List[Union[str, int]],
        original_rec: Union[str, int],
        new_rec: Union[str, int, None],
    ) -> None:
        """
        Record and display found explanation.

        Args:
            explanation_items: Items that form the counterfactual explanation
            original_rec: Original recommendation
            new_rec: New top recommendation after removing explanation items,
                or None when no recommendation was produced
        """
        message = (
            f"If the group had not interacted with these items {explanation_items},\n"
            f"the item of interest {original_rec} would not have appeared on the recommendation list"
        )
        if new_rec is None:
            message += "."
        else:
            message += f";\ninstead, {new_rec} would have been recommended."
        print(message)

        explanation_metrics = {
            item: self.item_metrics.get(item, {}) for item in explanation_items
        }
        self.explanations_found[self.calls] = {
            "items": explanation_items,
            "new_rec": new_rec,
            "metrics": explanation_metrics,
        }

    def _calculate_item_intensity(self, items: List[Union[str, int]]) -> List[float]:
        """
        Calculate average item intensity for explanation items.

        Args:
            items: List of item IDs in the explanation

        Returns:
            List of average intensity scores for each item
        """
        return self._calculate_average_item_intensity_score(
            items, self.members, self.data
        )

    def _calculate_user_intensity(self, items: List[Union[str, int]]) -> List[float]:
        """
        Calculate user intensity score for explanation items.

        Args:
            items: List of item IDs in the explanation

        Returns:
            List of intensity scores for each user
        """
        return self._calculate_user_intensity_score(items, self.members, self.data)

    @staticmethod
    def _resolve_internal_user_id(data: DataReader, user_id: Union[str, int]):
        """Map an external user ID to its internal ID; None when an empty list comes back."""
        internal_id = data.get_new_user_id(user_id)
        if isinstance(internal_id, list):
            return internal_id[0] if internal_id else None
        return internal_id

    @staticmethod
    def _calculate_average_item_intensity_score(
        explanation: List[Union[str, int]],
        members: List[Union[str, int]],
        data: DataReader,
    ) -> List[float]:
        """
        Calculate the average item intensity for a counterfactual explanation.

        Average item intensity is defined as the average number of interactions
        between group members and each item in the explanation.

        Args:
            explanation: The counterfactual explanation items.
            members: User IDs of the group members.
            data: DataReader object containing the dataset and ID mapping methods.

        Returns:
            list: Average intensity for each item in the explanation
            (0.0 for every item when the group is empty).
        """
        group_size = len(members)
        if group_size == 0:
            # No members means no interactions; avoid dividing by zero.
            return [0.0 for _ in explanation]

        # Convert user IDs to internal representation
        internal_group_ids = []
        for user_id in members:
            internal_id = SlidingWindowExplainer._resolve_internal_user_id(
                data, user_id
            )
            if internal_id is not None:
                internal_group_ids.append(int(internal_id))

        dataset = data.dataset
        in_group = dataset.userId.isin(internal_group_ids)
        item_intensities = []
        for item_id in explanation:
            internal_item_id = data.get_new_item_id(item_id)
            # Count interactions between this item and group members
            interactions_count = len(
                dataset[(dataset.itemId == internal_item_id) & in_group]
            )
            item_intensities.append(interactions_count / group_size)
        return item_intensities

    @staticmethod
    def _calculate_user_intensity_score(
        explanation_items: List[Union[str, int]],
        members: List[Union[str, int]],
        data: DataReader,
    ) -> List[float]:
        """
        Calculate the interaction intensity for each user based on their
        interactions with items in an explanation.

        Intensity is calculated as:
        (number of dataset rows linking the user to explanation items)
        / (number of explanation items).

        Args:
            explanation_items: List of item IDs in the explanation
            members: List of user IDs to calculate intensity for
            data: DataReader object containing the dataset and ID mapping methods

        Returns:
            List of interaction intensities for each user (same order as
            members). 0.0 is returned for every member when the explanation
            is empty, and for members without an internal ID.
        """
        num_explanation_items = len(explanation_items)
        if num_explanation_items == 0:
            # Nothing to interact with; avoid dividing by zero.
            return [0.0 for _ in members]

        # Convert external item IDs to internal IDs
        internal_item_ids = [
            data.get_new_item_id(item_id) for item_id in explanation_items
        ]
        dataset = data.dataset
        on_explanation_items = dataset.itemId.isin(internal_item_ids)

        user_intensities = []
        for member in members:
            internal_user_id = SlidingWindowExplainer._resolve_internal_user_id(
                data, member
            )
            if internal_user_id is None:
                user_intensities.append(0.0)
                continue
            # Count interactions between this user and explanation items
            user_interactions_count = len(
                dataset[on_explanation_items & (dataset.userId == internal_user_id)]
            )
            user_intensities.append(user_interactions_count / num_explanation_items)
        return user_intensities
+11
View File
@@ -0,0 +1,11 @@
from .model_based_emf import EMFExplainer
from .model_based_als_explain import ALSExplainer
from .post_hoc_association_rules import ARPostHocExplainer
from .post_hoc_knn import KNNPostHocExplainer
# Names exported by this package: the four explainer classes imported above.
__all__ = [
"EMFExplainer",
"ALSExplainer",
"ARPostHocExplainer",
"KNNPostHocExplainer",
]
+49
View File
@@ -0,0 +1,49 @@
from tqdm.auto import tqdm
from abc import ABC, abstractmethod
from typing import Dict, Any
class Explainer(ABC):
    """
    Abstract base for explainers that justify user-item recommendations.

    Holds the model, the recommendations to explain and the dataset taken
    from ``data``, and groups the dataset by ``userId`` so a user's items can
    be looked up. Subclasses implement ``explain_recommendation_to_user``.
    """

    def __init__(self, model, recommendations, data):
        self.model = model
        self.recommendations = recommendations
        self.dataset = data.dataset
        self.num_items = data.num_item
        self.num_users = data.num_user
        # Per-user view of the dataset, used by get_user_items().
        self.users = self.dataset.groupby(by="userId")

    def explain_recommendations(self):
        """
        Explain every row of ``self.recommendations``.

        Each row's ``userId`` and ``itemId`` are cast to int and passed to
        ``explain_recommendation_to_user``; the results are stored in a new
        ``explanations`` column of ``self.recommendations``.

        :return: ``self.recommendations`` with the added column
        """
        collected = []
        row_count = self.recommendations.shape[0]
        with tqdm(total=row_count, desc="Computing explanations: ") as pbar:
            for _, rec in self.recommendations.iterrows():
                user, item = int(rec.userId), int(rec.itemId)
                collected.append(self.explain_recommendation_to_user(user, item))
                pbar.update()
        self.recommendations["explanations"] = collected
        return self.recommendations

    def get_user_items(self, user_id):
        """
        Items Ids rated by a user.

        :param user_id: the user
        :return: the ``itemId`` values of that user's rows in the dataset
        """
        rows_of_user = self.users.get_group(user_id)
        return rows_of_user.itemId.values

    @abstractmethod
    def explain_recommendation_to_user(
        self, user_id: int, item_id: int
    ) -> Dict[str, Any]:
        """
        Generates an explanation for a single user-item recommendation.

        This method must be implemented by any subclass.
        """
        raise NotImplementedError
@@ -0,0 +1,51 @@
import numpy as np
import pandas as pd
from .explainer import Explainer
class ALSExplainer(Explainer):
    """Model-based explainer that ranks a user's past items by their contribution to a recommendation."""

    def __init__(self, model, recommendations, data, number_of_contributions=10):
        """
        :param model: model exposing ``item_embedding()``, ``reg_term`` and ``latent_dim``
        :param recommendations: recommendations to explain
        :param data: data object providing the dataset and user/item counts
        :param number_of_contributions: how many top contributing items to return
        """
        super().__init__(model, recommendations, data)
        self.number_of_contributions = number_of_contributions

    def explain_recommendation_to_user(self, user_id: int, item_id: int):
        """
        Measuring the contribution of each item to the recommendation.

        :param user_id: the user the recommendation was made for
        :param item_id: the recommended item
        :return: dict with keys ``"item"`` and ``"contribution"``, each a
            pandas Series holding the top ``number_of_contributions`` entries
            (sorted by decreasing contribution) among the items the user
            previously interacted with.
        """
        # Look these up once; the original code re-fetched them repeatedly.
        user_items = self.get_user_items(user_id)
        item_factors = self.model.item_embedding()

        # 0/1 indicator of the items the user interacted with.
        interacted = np.zeros(self.num_items)
        interacted[user_items] = 1

        # Y^T C_u Y + reg * I. Scaling the columns of Y^T by the indicator
        # gives the same product as Y^T @ diag(indicator) without building a
        # dense num_items x num_items matrix.
        gram = np.matmul(item_factors.transpose() * interacted, item_factors)
        gram = gram + np.diag([self.model.reg_term] * self.model.latent_dim)

        if len(user_items) > 1:
            weight_mtr = np.linalg.inv(gram)
        else:
            weight_mtr = np.linalg.pinv(gram)

        # Score of every item against the recommended one, then keep only
        # the items in the user's history.
        weighted_factors = np.matmul(item_factors, weight_mtr)
        sim_to_rec_id = weighted_factors.dot(item_factors[item_id, :])[user_items]

        contribution = pd.DataFrame(
            {"item": user_items, "contribution": sim_to_rec_id}
        ).sort_values(by=["contribution"], ascending=False)

        # Explicit positional slice of the sorted frame.
        top = contribution.iloc[: self.number_of_contributions]
        return {
            "item": top["item"],
            "contribution": top["contribution"],
        }
@@ -0,0 +1,28 @@
from .explainer import Explainer
class EMFExplainer(Explainer):
    """Model-based explainer that uses the ratings of a user's similar users."""

    def __init__(self, model, recommendations, data):
        super().__init__(model, recommendations, data)

    def explain_recommendation_to_user(self, user_id: int, item_id: int):
        """
        Summarise how the user's similar users rated the recommended item.

        :param user_id: the user the recommendation was made for
        :param item_id: recommendation
        :return: dict mapping each rating value to the number of similar
            users (``self.model.sim_users[user_id]``) who gave that rating
            to the item; empty when none of them rated it.
        """
        ratings_on_item = self.dataset[self.dataset.itemId == item_id]
        similar_users = self.model.sim_users[user_id]
        similar_users_ratings_on_item = ratings_on_item[
            ratings_on_item.userId.isin(similar_users)
        ]
        # size() counts rows per rating directly. The previous count() +
        # row[0] read the first column by position from a label-indexed row,
        # which pandas has deprecated.
        counts_per_rating = similar_users_ratings_on_item.groupby(by="rating").size()
        return {rating: int(count) for rating, count in counts_per_rating.items()}
@@ -0,0 +1,79 @@
from typing import Any, Dict
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
from .explainer import Explainer
class ARPostHocExplainer(Explainer):
    """
    Post-hoc explainer based on association rules mined from user histories.

    Each user's items form one transaction; frequent itemsets are mined with
    apriori and turned into rules. A recommendation is explained by the
    antecedents of rules whose consequent is the recommended item and which
    the user has interacted with.
    """

    def __init__(
        self,
        model,
        recommendations,
        data,
        min_support=0.1,
        max_len=2,
        metric="lift",
        min_threshold=0.1,
        min_confidence=0.1,
        min_lift=0.1,
    ):
        """
        :param model: recommender model (stored by the base class)
        :param recommendations: recommendations to explain
        :param data: data object providing the dataset and user/item counts
        :param min_support: minimum support passed to apriori
        :param max_len: maximum itemset length passed to apriori
        :param metric: metric passed to ``association_rules``
        :param min_threshold: threshold on ``metric`` passed to ``association_rules``
        :param min_confidence: rules must have confidence strictly above this
        :param min_lift: rules must have lift strictly above this
        """
        super().__init__(model, recommendations, data)
        self.AR = None
        self.min_support = min_support
        self.max_len = max_len
        self.metric = metric
        self.min_threshold = min_threshold
        self.min_confidence = min_confidence
        self.min_lift = min_lift
        # Computed lazily by compute_association_rules().
        self.rules: pd.DataFrame | None = None

    def get_rules_for_getting(self, item_id: int) -> pd.DataFrame:
        """Return the rules whose consequent is *item_id*, mining the rules first if needed."""
        if self.rules is None:
            self.compute_association_rules()
        if self.rules is None:
            # Keep the expected columns so callers can still read
            # ``.antecedents`` from the empty result.
            return pd.DataFrame(columns=["consequents", "antecedents", "confidence"])
        return self.rules[self.rules.consequents == item_id]

    def compute_association_rules(self):
        """
        Mine association rules from the dataset and store them in ``self.rules``.

        The stored frame has the columns ``consequents``, ``antecedents`` and
        ``confidence``, with one item per antecedent/consequent.
        """
        # One transaction per user. A single groupby pass replaces filtering
        # the whole dataset once per user; sort=False keeps the users in
        # order of first appearance.
        item_sets = [
            list(items)
            for _, items in self.dataset.groupby("userId", sort=False)["itemId"]
        ]
        te = TransactionEncoder()
        te_ary = te.fit(item_sets).transform(item_sets)
        # The te_ary object is a NumPy array, which is a valid input for a DataFrame.
        # Pylance may raise a false positive here due to incomplete type stubs for mlxtend.
        df = pd.DataFrame(te_ary.astype(bool), columns=te.columns_)  # type: ignore
        frequent_itemsets = apriori(
            df, min_support=self.min_support, use_colnames=True, max_len=self.max_len
        )
        # Honour the metric chosen at construction time (it used to be
        # hard-coded to "lift", silently ignoring the argument).
        rules = association_rules(
            frequent_itemsets, metric=self.metric, min_threshold=self.min_threshold
        )
        # Copy the filtered rows so the column assignments below act on an
        # independent frame rather than on a slice of ``rules``.
        rules = rules[
            (rules["confidence"] > self.min_confidence)
            & (rules["lift"] > self.min_lift)
        ].copy()
        # Keep a single item per side. With the default max_len=2 each side
        # has exactly one item; with a larger max_len only the first item of
        # a multi-item set is retained.
        rules["consequents"] = rules["consequents"].apply(lambda x: next(iter(x)))
        rules["antecedents"] = rules["antecedents"].apply(lambda x: next(iter(x)))
        self.rules = rules[["consequents", "antecedents", "confidence"]]

    def explain_recommendation_to_user(
        self, user_id: int, item_id: int
    ) -> Dict[str, Any]:
        """
        Explain a recommendation by the rule antecedents found in the user's history.

        :param user_id: the user the recommendation was made for
        :param item_id: the recommended item
        :return: ``{"antecedents": set_of_items}`` where each item is the
            antecedent of a rule for ``item_id`` and was rated by the user
        """
        user_ratings = self.get_user_items(user_id)
        rules = self.get_rules_for_getting(item_id)
        explanations = rules[rules.antecedents.isin(user_ratings)]
        return {"antecedents": set(explanations.antecedents)}
+46
View File
@@ -0,0 +1,46 @@
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import Dict, Any
from .explainer import Explainer
class KNNPostHocExplainer(Explainer):
    """
    Post-hoc explainer based on item-to-item nearest neighbours.

    A recommendation is explained by the items that are both among the
    ``knn`` most cosine-similar items to the recommended one and rated by
    the user.
    """

    def __init__(self, model, recommendations, data, knn=10):
        super().__init__(model, recommendations, data)
        self.knn = knn
        # Starts empty (never None) so it can always be subscripted; filled
        # on first use by compute_knn_items_for_all_items().
        self.knn_items_dict: Dict[int, np.ndarray] = {}

    def get_nn_for_getting(self, item_id: int) -> np.ndarray:
        """Return the neighbour items of *item_id*, or an empty array if it has none."""
        # An empty dictionary means the neighbours have not been computed yet.
        if len(self.knn_items_dict) == 0:
            self.compute_knn_items_for_all_items()
        fallback = np.array([])
        return self.knn_items_dict.get(item_id, fallback)

    def compute_knn_items_for_all_items(self):
        """Fill ``self.knn_items_dict`` with the ``knn`` most similar items of every item."""
        # Item x user rating matrix built from the dataset's itemId, userId
        # and rating columns.
        ratings = np.zeros((self.num_items, self.num_users))
        ratings[self.dataset.itemId, self.dataset.userId] = self.dataset.rating
        similarity = cosine_similarity(sparse.csr_matrix(ratings))
        # Push every self-similarity below all other values so an item is
        # ranked last among its own neighbours.
        floor = similarity.min() - 1
        for item in range(self.num_items):
            similarity[item, item] = floor
            ranked = (-similarity[item, :]).argsort()
            self.knn_items_dict[item] = ranked[: self.knn]

    def explain_recommendation_to_user(
        self, user_id: int, item_id: int
    ) -> Dict[str, Any]:
        """
        Explain a recommendation by the user's items that neighbour the recommended item.

        :param user_id: the user the recommendation was made for
        :param item_id: the recommended item
        :return: ``{"explanations": set_of_items}``
        """
        rated_by_user = set(self.get_user_items(user_id))
        neighbours = set(self.get_nn_for_getting(item_id))
        return {"explanations": neighbours & rated_by_user}
+23
View File
@@ -0,0 +1,23 @@
from .als_model import ALS
from .bpr_model import BPR
from .gmf_model import GMFModel
from .emf_model import EMFModel
from .autoencoder_model import ExplAutoencoderTorch
from .mlp_model import MLPModel
from .emf_model import PyTorchModel
from .knn_basic_model import KNNBasic
from .svd_model import SVD
from .recommender_model import RecommenderModel
# Names exported by this package: the model classes imported above.
__all__ = [
"ALS",
"BPR",
"GMFModel",
"EMFModel",
"PyTorchModel",
"MLPModel",
"ExplAutoencoderTorch",
"KNNBasic",
"SVD",
"RecommenderModel",
]
+31
View File
@@ -0,0 +1,31 @@
import implicit
from .mf_implicit_model import MFImplicitModel
class ALS(MFImplicitModel):
    """Implicit-feedback matrix factorisation backed by ``implicit``'s AlternatingLeastSquares."""

    def __init__(
        self,
        latent_dim,
        reg_term,
        epochs,
        random_state=42,
        num_users=None,
        num_items=None,
        **kwargs,
    ):
        """
        :param latent_dim: number of latent factors
        :param reg_term: regularization strength
        :param epochs: number of ALS iterations
        :param random_state: seed handed to the underlying ALS implementation
        :param num_users: forwarded to the base class
        :param num_items: forwarded to the base class
        :param kwargs: accepted but not used by this constructor
        """
        # ALS has no learning rate, hence learning_rate=None for the base class.
        base_arguments = {
            "latent_dim": latent_dim,
            "reg_term": reg_term,
            "epochs": epochs,
            "learning_rate": None,
            "num_users": num_users,
            "num_items": num_items,
        }
        super().__init__(**base_arguments)

        # Configure the backend from the attributes the base class stored.
        als_backend = implicit.als.AlternatingLeastSquares(
            factors=self.latent_dim,
            regularization=self.reg_term,
            iterations=self.epochs,
            random_state=random_state,
        )
        self.model = als_backend
+223
View File
@@ -0,0 +1,223 @@
import numpy as np
import torch
import torch.nn as nn
import torch.optim
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from typing import Optional, Union, List
from pygrex.utils.torch_utils import use_cuda, use_optimizer
from pygrex.data_reader import UserItemDict, DataReader
from .recommender_model import RecommenderModel
class ExplAutoencoderTorch(RecommenderModel, nn.Module):
def __init__(
    self,
    hidden_layer_features: int,
    learning_rate: float,
    positive_threshold: float,
    weight_decay: float,
    epochs: int,
    knn: int,
    cuda: bool,
    optimizer_name: str,
    expl: bool,
    device_id: Optional[int] = None,
):
    """
    Store the hyper-parameters of the autoencoder.

    The layers, optimizer and explainability matrix are not built here;
    they are created in ``fit``.

    Args:
        hidden_layer_features: Number of units in the hidden layer.
        learning_rate: Learning rate handed to the optimizer.
        positive_threshold: Minimum rating used when computing explainability.
        weight_decay: Weight decay handed to the optimizer.
        epochs: Number of training epochs.
        knn: Number of similar users kept per user.
        cuda: Whether to use the GPU.
        optimizer_name: One of "sgd", "adam" or "rmsprop".
        expl: Flag forwarded to ``UserItemDict`` when the training loader
            is built.
        device_id: CUDA device index; 0 is used when None.

    Raises:
        ValueError: If ``optimizer_name`` is not a supported optimizer.
    """
    super().__init__()
    if optimizer_name not in ("sgd", "adam", "rmsprop"):
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Wrong optimizer.")
    if cuda:
        use_cuda(True, device_id if device_id is not None else 0)

    self.positive_threshold = positive_threshold
    self.weight_decay = weight_decay
    self.knn = knn
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.use_gpu = cuda
    self.optimizer_name = optimizer_name
    self.hidden_layer_features = hidden_layer_features
    self.expl = expl

    # Populated by fit() / compute_explainability().
    self.dataset = None
    self.data = None
    self.embedding_user = None
    self.embedding_item = None
    self.optimizer: Optional[torch.optim.Optimizer] = None
    self.explainability_matrix = None
    self.sim_users = {}
    self.criterion = nn.MSELoss()
def fit(self, data: DataReader):
    """
    Build the network for ``data`` and train it for ``self.epochs`` epochs.

    Creates the encoder and decoder layers sized by ``data.num_item``,
    computes the explainability matrix, builds the optimizer, then trains
    one epoch at a time while showing the epoch loss on a progress bar.
    """
    self.data = data
    self.dataset = data.dataset

    item_count = self.data.num_item
    hidden_units = self.hidden_layer_features
    self.encoder_hidden_layer = nn.Linear(
        in_features=item_count, out_features=hidden_units
    )
    self.decoder_output_layer = nn.Linear(
        in_features=hidden_units, out_features=item_count
    )

    self.compute_explainability()

    built_optimizer = use_optimizer(
        network=self,
        weight_decay=self.weight_decay,
        learning_rate=self.learning_rate,
        optimizer_name=self.optimizer_name,
    )
    assert isinstance(built_optimizer, torch.optim.Optimizer)
    self.optimizer = built_optimizer

    with tqdm(total=self.epochs) as progress:
        # The same loader is reused for every epoch.
        train_loader = self.instance_a_train_loader()
        for _ in range(self.epochs):
            epoch_loss = self.train_an_epoch(train_loader)
            progress.update(1)
            progress.set_postfix({"loss": epoch_loss})
def compute_explainability(self):
assert self.dataset is not None
assert self.data is not None
ds = self.dataset.pivot(index="userId", columns="itemId", values="rating")
ds = ds.fillna(0)
ds = sparse.csr_matrix(ds)
sim_matrix = cosine_similarity(ds)
min_val = sim_matrix.min() - 1
for i in range(self.data.num_user):
sim_matrix[i, i] = min_val
knn_to_user_i = (-sim_matrix[i, :]).argsort()[: self.knn]
self.sim_users[i] = knn_to_user_i
self.explainability_matrix = np.zeros((self.data.num_user, self.data.num_item))
filter_dataset_on_threshold = self.dataset[
self.dataset["rating"] >= self.positive_threshold
]
for i in range(self.data.num_user):
knn_to_user_i = self.sim_users[i]
rated_items_by_sim_users = filter_dataset_on_threshold[
filter_dataset_on_threshold["userId"].isin(knn_to_user_i)
]
sim_scores = rated_items_by_sim_users.groupby(by="itemId")
sim_scores = sim_scores["rating"].sum()
sim_scores = sim_scores.reset_index()
self.explainability_matrix[i, sim_scores.itemId] = (
sim_scores.rating.to_list()
)
self.explainability_matrix = MinMaxScaler().fit_transform(
self.explainability_matrix
)
self.explainability_matrix = torch.from_numpy(self.explainability_matrix)
def instance_a_train_loader(self):
"""instance train loader for one training epoch"""
assert self.dataset is not None
assert self.explainability_matrix is not None
self.user_item_dict = UserItemDict(
self.dataset, self.explainability_matrix, self.expl
)
return DataLoader(self.user_item_dict, shuffle=True)
def train_an_epoch(self, train_loader):
self.train()
cnt = 0
total_loss = 0
for batch_id, batch in enumerate(train_loader):
assert isinstance(batch[0], torch.Tensor)
rating = batch[0]
rating = rating.float()
loss = self.train_single_user(rating)
total_loss += loss
cnt += 1
return total_loss / cnt
def train_single_user(self, ratings):
if self.use_gpu:
ratings = ratings.cuda()
assert self.optimizer is not None
self.optimizer.zero_grad()
ratings_pred = self(ratings)
loss = self.criterion(ratings_pred, ratings)
loss.backward()
self.optimizer.step()
loss = loss.item()
return loss
def forward(self, user_adjusted_ratings):
activation = self.encoder_hidden_layer(user_adjusted_ratings)
code = torch.relu(activation)
activation = self.decoder_output_layer(code)
reconstructed_ratings = torch.relu(activation)
return reconstructed_ratings
def predict(
self, user_id: Union[int, List[int], str], item_id: Union[int, List[int], str]
) -> list:
try:
if isinstance(user_id, str):
user_id = int(user_id)
elif isinstance(user_id, list):
user_id = [int(u) for u in user_id]
if isinstance(item_id, str):
item_id = int(item_id)
elif isinstance(item_id, list):
item_id = [int(i) for i in item_id]
except (ValueError, TypeError):
raise ValueError(
"User and item IDs must be integers or strings that can be converted to integers."
)
single_user = isinstance(user_id, int)
single_item = isinstance(item_id, int)
if isinstance(user_id, int):
user_id = [user_id]
if isinstance(item_id, int):
item_id = [item_id]
with torch.no_grad():
assert self.user_item_dict is not None, "The model has not been fitted yet."
# Collect ratings for all users
ratings_list = []
for uid in user_id:
rating = self.user_item_dict[uid] # Pass scalar user_id to dict
ratings_list.append(rating)
rating = torch.stack(ratings_list)
rating = rating.float()
if self.use_gpu:
rating = rating.cuda()
pred = self.forward(rating).cpu()
predictions = pred[:, item_id].tolist()
# Flatten the nested list if it contains only one user's predictions
if single_user and single_item:
return (
predictions[0][0]
if isinstance(predictions[0], list)
else predictions[0]
)
elif single_user:
return predictions[0]
return predictions
+25
View File
@@ -0,0 +1,25 @@
import implicit
from .mf_implicit_model import MFImplicitModel
class BPR(MFImplicitModel):
    """Matrix-factorisation model backed by ``implicit``'s BayesianPersonalizedRanking."""

    def __init__(self, latent_dim, reg_term, learning_rate, epochs, **kwargs):
        """Store the hyper-parameters and create the underlying implicit model.

        Args:
            latent_dim: Number of latent factors.
            reg_term: Regularisation strength.
            learning_rate: Learning rate of the underlying model.
            epochs: Number of training iterations.
            **kwargs: Accepted but not used by this constructor.
        """
        super().__init__(
            latent_dim=latent_dim,
            reg_term=reg_term,
            learning_rate=learning_rate,
            epochs=epochs,
        )
        bpr_options = {
            "factors": self.latent_dim,
            "learning_rate": self.learning_rate,
            "regularization": self.reg_term,
            "iterations": self.epochs,
        }
        self.model = implicit.bpr.BayesianPersonalizedRanking(**bpr_options)
+391
View File
@@ -0,0 +1,391 @@
import numpy as np
import torch
import torch.nn as nn
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from typing import Union
from pygrex.data_reader import UserItemRatingDataset, DataReader
from pygrex.utils import EMFLoss
from .py_torch_model import PyTorchModel
from .recommender_model import RecommenderModel
class EMFModel(RecommenderModel):
    """Explainable matrix factorisation trained with per-rating SGD in NumPy.

    Besides the squared rating error and an L2 penalty on both embeddings,
    each update contains an explainability term weighted by
    ``explainability_matrix[user, item]`` that pulls the user and item
    vectors towards each other.

    Args:
        learning_rate: SGD step size.
        reg_term: Weight of the L2 penalty on the embeddings.
        expl_reg_term: Weight of the explainability term.
        positive_threshold: Minimum rating that counts as positive when the
            explainability matrix is built.
        latent_dim: Embedding size.
        epochs: Number of passes over the dataset.
        knn: Number of most similar users considered per user.
    """

    def __init__(
        self,
        learning_rate: float,
        reg_term: float,
        expl_reg_term: float,
        positive_threshold: float,
        latent_dim: int,
        epochs: int,
        knn: int,
    ):
        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs

        # Populated by fit().
        self.dataset = None
        self.data = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer = None

        self.reg_term = reg_term
        self.expl_reg_term = expl_reg_term
        self.positive_threshold = positive_threshold
        self.knn = knn

        self.explainability_matrix = None
        self.sim_users = {}

        # Not used by the NumPy training loop in this class; kept because
        # they are part of the object's existing attribute set.
        self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)
        self.criterion = EMFLoss()

    def fit(self, data: DataReader) -> None:
        """Initialise the embeddings and train them with SGD.

        Args:
            data: Reader exposing ``dataset`` (a ratings DataFrame with
                ``userId``, ``itemId`` and ``rating`` columns) and the
                ``num_user`` / ``num_item`` counts.
        """
        self.data = data
        self.dataset = data.dataset
        assert self.data is not None
        num_users = self.data.num_user
        num_items = self.data.num_item

        self.embedding_user = np.random.uniform(
            low=0, high=0.5 / self.latent_dim, size=(num_users, self.latent_dim)
        )
        self.embedding_item = np.random.uniform(
            low=0, high=0.5 / self.latent_dim, size=(num_items, self.latent_dim)
        )
        self.compute_explainability()
        assert self.explainability_matrix is not None

        with tqdm(total=self.epochs) as progress:
            assert self.dataset is not None
            for _ in range(self.epochs):
                # Visit the ratings in a fresh random order every epoch.
                self.dataset = self.dataset.sample(frac=1)
                squared_errors = []
                for _, row in self.dataset.iterrows():
                    user_id = int(row.userId)
                    item_id = int(row.itemId)
                    e_ui = row.rating - self.predict(user_id, item_id)
                    squared_errors.append(e_ui**2)

                    p_u = self.embedding_user[user_id, :]
                    q_i = self.embedding_item[item_id, :]

                    # Descent direction of the explainability penalty for the
                    # user vector: it points from p_u towards q_i, so adding
                    # it shrinks the distance between the two embeddings.
                    # (Subtracting it, as the code did before, pushes them
                    # apart, i.e. maximises the penalty.)
                    expl_pull = (
                        self.expl_reg_term
                        * self.explainability_matrix[user_id, item_id]
                        * np.sign(q_i - p_u)
                    )

                    # Both steps are computed from the pre-update vectors.
                    delta_u = 2 * e_ui * q_i - self.reg_term * p_u + expl_pull
                    delta_v = 2 * e_ui * p_u - self.reg_term * q_i - expl_pull

                    self.embedding_user[user_id, :] += self.learning_rate * delta_u
                    self.embedding_item[item_id, :] += self.learning_rate * delta_v

                progress.update(1)
                mse = sum(squared_errors) / len(squared_errors) if squared_errors else 0.0
                progress.set_postfix({"MSE": mse})

    def compute_explainability(self):
        """Fill ``sim_users`` and ``explainability_matrix`` from the dataset.

        For each user the ``knn`` most cosine-similar users are selected; the
        user's score for an item is the sum of the ratings at or above
        ``positive_threshold`` that those neighbours gave the item. The score
        matrix is min-max scaled afterwards.
        """
        assert self.dataset is not None
        assert self.data is not None

        ratings_matrix = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_matrix))

        # Push the diagonal below every real similarity so that a user is
        # never selected as their own neighbour.
        min_val = sim_matrix.min() - 1
        for i in range(self.data.num_user):
            sim_matrix[i, i] = min_val
            self.sim_users[i] = (-sim_matrix[i, :]).argsort()[: self.knn]

        scores = np.zeros((self.data.num_user, self.data.num_item))
        positive_ratings = self.dataset[
            self.dataset["rating"] >= self.positive_threshold
        ]
        for i in range(self.data.num_user):
            # NOTE(review): neighbour indices are row positions of the pivot
            # table but are matched against userId values; this assumes user
            # ids are the contiguous range 0..num_user-1 — confirm.
            neighbour_ratings = positive_ratings[
                positive_ratings["userId"].isin(self.sim_users[i])
            ]
            sim_scores = (
                neighbour_ratings.groupby(by="itemId")["rating"].sum().reset_index()
            )
            scores[i, sim_scores.itemId.astype(int)] = sim_scores.rating.to_list()

        self.explainability_matrix = MinMaxScaler().fit_transform(scores)

    def predict(
        self,
        user_id: Union[int, str, list, np.ndarray],
        item_id: Union[int, str, list, np.ndarray],
    ) -> Union[float, list]:
        """Return the dot product of the user and item embeddings.

        Args:
            user_id: One user id or a list/array of ids.
            item_id: One item id or a list/array of ids.

        Returns:
            A float when both arguments are scalar. Otherwise a list: flat
            when either side has exactly one id, else one list per user.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.embedding_user is None or self.embedding_item is None:
            raise RuntimeError("The model has not been fitted yet.")

        if isinstance(user_id, np.ndarray):
            user_id = user_id.tolist()
        if isinstance(item_id, np.ndarray):
            item_id = item_id.tolist()

        if not (isinstance(user_id, list) or isinstance(item_id, list)):
            return float(
                np.dot(
                    self.embedding_user[int(user_id), :],
                    self.embedding_item[int(item_id), :],
                )
            )

        user_ids = user_id if isinstance(user_id, list) else [user_id]
        item_ids = item_id if isinstance(item_id, list) else [item_id]
        user_idx = np.asarray([int(u) for u in user_ids], dtype=int)
        item_idx = np.asarray([int(i) for i in item_ids], dtype=int)

        # One matrix product yields every (user, item) score at once.
        scores = self.embedding_user[user_idx] @ self.embedding_item[item_idx].T
        if len(user_ids) == 1 or len(item_ids) == 1:
            scores = scores.flatten()
        return scores.tolist()

    def user_embedding(self):
        """Return the user embedding matrix (``None`` before ``fit``)."""
        return self.embedding_user

    def item_embedding(self):
        """Return the item embedding matrix (``None`` before ``fit``)."""
        return self.embedding_item
class EMFTorchModel(PyTorchModel):
    """Explainable matrix factorisation trained with PyTorch and SGD.

    The score of a (user, item) pair is a linear layer applied to the
    element-wise product of their embeddings. Training minimises ``EMFLoss``,
    which receives the predictions, the targets, both embeddings and the
    pair's entry of the explainability matrix.

    Args:
        learning_rate: SGD learning rate.
        reg_term: Regularisation weight passed to ``EMFLoss``.
        expl_reg_term: Explainability weight passed to ``EMFLoss``.
        positive_threshold: Minimum rating that counts as positive when the
            explainability matrix is built.
        momentum: SGD momentum.
        weight_decay: SGD weight decay.
        latent_dim: Embedding size.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        knn: Number of most similar users considered per user.
        cuda: Forwarded to ``PyTorchModel``.
        optimizer_name: Forwarded to ``PyTorchModel``; ``fit`` itself always
            builds ``torch.optim.SGD``.
        device_id: Forwarded to ``PyTorchModel``.
    """

    def __init__(
        self,
        learning_rate: float,
        reg_term: float,
        expl_reg_term: float,
        positive_threshold: float,
        momentum: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        knn: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )
        self.reg_term = reg_term
        self.expl_reg_term = expl_reg_term
        self.positive_threshold = positive_threshold
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.knn = knn

        self.explainability_matrix = None
        self.sim_users = {}

        self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)
        self.criterion = EMFLoss()

    def fit(self, data: DataReader) -> None:
        """Create the embeddings for ``data`` and train for ``self.epochs`` epochs.

        Args:
            data: Reader exposing ``dataset`` (a ratings DataFrame with
                ``userId``, ``itemId`` and ``rating`` columns) and the
                ``num_user`` / ``num_item`` counts.
        """
        self.data = data
        self.dataset = data.dataset
        assert self.data is not None

        self.embedding_user = nn.Embedding(
            num_embeddings=self.data.num_user, embedding_dim=self.latent_dim
        )
        self.embedding_item = nn.Embedding(
            num_embeddings=self.data.num_item, embedding_dim=self.latent_dim
        )
        self.compute_explainability()

        # Created after the embeddings so that their weights are included in
        # self.parameters().
        self.optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
        )

        # The training tensors do not change between epochs, so the loader is
        # built once; shuffle=True still reorders it on every pass.
        train_loader = self.instance_a_train_loader(self.batch_size)
        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def compute_explainability(self):
        """Fill ``sim_users`` and ``explainability_matrix`` from the dataset.

        For each user the ``knn`` most cosine-similar users are selected; the
        user's score for an item is the sum of the ratings at or above
        ``positive_threshold`` that those neighbours gave the item. The score
        matrix is min-max scaled and stored as a (CPU) torch tensor.
        """
        assert self.dataset is not None
        assert self.data is not None

        ratings_matrix = self.dataset.pivot(
            index="userId", columns="itemId", values="rating"
        ).fillna(0)
        sim_matrix = cosine_similarity(sparse.csr_matrix(ratings_matrix))

        # Push the diagonal below every real similarity so that a user is
        # never selected as their own neighbour.
        min_val = sim_matrix.min() - 1
        for i in range(self.data.num_user):
            sim_matrix[i, i] = min_val
            self.sim_users[i] = (-sim_matrix[i, :]).argsort()[: self.knn]

        scores = np.zeros((self.data.num_user, self.data.num_item))
        positive_ratings = self.dataset[
            self.dataset["rating"] >= self.positive_threshold
        ]
        for i in range(self.data.num_user):
            # NOTE(review): neighbour indices are row positions of the pivot
            # table but are matched against userId values; this assumes user
            # ids are the contiguous range 0..num_user-1 — confirm.
            neighbour_ratings = positive_ratings[
                positive_ratings["userId"].isin(self.sim_users[i])
            ]
            sim_scores = (
                neighbour_ratings.groupby(by="itemId")["rating"].sum().reset_index()
            )
            scores[i, sim_scores.itemId.astype(int)] = sim_scores.rating.to_list()

        scores = MinMaxScaler().fit_transform(scores)
        self.explainability_matrix = torch.from_numpy(scores)

    def instance_a_train_loader(self, batch_size):
        """Return a shuffling loader over (user, item, rating) triples."""
        assert self.dataset is not None
        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(self.dataset.userId.values),
            item_tensor=torch.LongTensor(self.dataset.itemId.values),
            target_tensor=torch.FloatTensor(self.dataset.rating.values),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Returns 0.0 when the loader yields no batches.
        """
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            total_loss += self.train_single_batch(user, item, rating.float())
            cnt += 1
        return total_loss / cnt if cnt else 0.0

    def train_single_batch(self, users, items, ratings):
        """Take one optimisation step on a batch and return the loss value."""
        assert self.explainability_matrix is not None
        # The explainability matrix is a CPU tensor (built with
        # torch.from_numpy), so it is indexed before the indices are moved
        # to the GPU.
        expl = self.explainability_matrix[users, items]

        # NOTE(review): self.cuda is presumably the boolean flag stored by
        # PyTorchModel — confirm.
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
            expl = expl.cuda()

        assert self.optimizer is not None
        self.optimizer.zero_grad()
        ratings_pred = self(users, items)

        assert self.embedding_user is not None
        assert self.embedding_item is not None
        loss = self.criterion(
            ratings_pred=ratings_pred,
            ratings=ratings,
            u=self.embedding_user(users),
            v=self.embedding_item(items),
            reg_term=self.reg_term,
            expl=expl,
            expl_reg_term=self.expl_reg_term,
        )
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def forward(self, user_indices, item_indices):
        """Score (user, item) pairs from the product of their embeddings."""
        assert self.embedding_user is not None
        assert self.embedding_item is not None
        user_embeddings = self.embedding_user(user_indices)
        item_embeddings = self.embedding_item(item_indices)
        element_product = torch.mul(user_embeddings, item_embeddings)
        return self.affine_output(element_product)
+165
View File
@@ -0,0 +1,165 @@
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer
from tqdm.auto import tqdm
from pygrex.data_reader import DataReader, UserItemRatingDataset
from pygrex.utils.torch_utils import use_optimizer
from .py_torch_model import PyTorchModel
class GMFModel(PyTorchModel):
    """Generalised matrix factorisation trained with sampled negatives.

    The score of a (user, item) pair is a sigmoid over a linear layer applied
    to the element-wise product of their embeddings. Every observed rating
    is accompanied by ``num_negative`` items the user has not interacted
    with, labelled 0, and the model is trained with binary cross-entropy.

    Args:
        learning_rate: Learning rate forwarded to ``use_optimizer``.
        weight_decay: Weight decay forwarded to ``use_optimizer``.
        latent_dim: Embedding size.
        epochs: Number of training epochs.
        num_negative: Negative items sampled per observed rating.
        batch_size: Mini-batch size.
        cuda: Forwarded to ``PyTorchModel``.
        optimizer_name: Optimizer name forwarded to ``use_optimizer``.
        device_id: Forwarded to ``PyTorchModel``.
    """

    def __init__(
        self,
        learning_rate: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        num_negative: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )
        self.negative_sample_size = num_negative
        self.weight_decay = weight_decay
        self.optimizer: Optimizer | None = None

        self.affine_output = torch.nn.Linear(
            in_features=self.latent_dim, out_features=1
        )
        self.logistic = torch.nn.Sigmoid()
        self.criterion = nn.BCELoss()

    def fit(self, data: DataReader):
        """Create the embeddings for ``data`` and train for ``self.epochs`` epochs.

        Args:
            data: Reader exposing ``dataset`` (a ratings DataFrame with
                ``userId``, ``itemId`` and ``rating`` columns) and the
                ``num_user`` / ``num_item`` counts.

        Raises:
            TypeError: If ``use_optimizer`` does not return an ``Optimizer``.
        """
        dataset = data.dataset

        # The embeddings must exist before the optimizer is built: the
        # optimizer is created from this network, and layers assigned
        # afterwards would not be among the parameters it updates.
        self.embedding_user = torch.nn.Embedding(
            num_embeddings=data.num_user, embedding_dim=self.latent_dim
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=data.num_item, embedding_dim=self.latent_dim
        )

        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )
        if not isinstance(optimizer, Optimizer):
            raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
        self.optimizer = optimizer

        self.negatives = self._sample_negative(dataset)

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                # Rebuilt every epoch so that fresh negatives are drawn.
                train_loader = self.instance_a_train_loader(
                    dataset, self.negative_sample_size, self.batch_size
                )
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def instance_a_train_loader(self, dataset, num_negatives, batch_size):
        """Build the loader for one epoch, drawing new negatives per rating.

        Args:
            dataset: Ratings DataFrame with ``userId``, ``itemId``, ``rating``.
            num_negatives: Negatives to draw per observed rating. When a user
                has fewer non-interacted items than this, all of them are used.
            batch_size: Mini-batch size of the returned loader.

        Returns:
            A shuffling ``DataLoader`` over (user, item, rating) triples.
        """
        users, items, ratings = [], [], []
        train_ratings = pd.merge(
            dataset, self.negatives[["userId", "negative_items"]], on="userId"
        )
        train_ratings["negatives"] = train_ratings["negative_items"].apply(
            lambda x: random.sample(list(x), min(num_negatives, len(x)))
        )

        for user, item, rating, negatives in zip(
            train_ratings["userId"].tolist(),
            train_ratings["itemId"].tolist(),
            train_ratings["rating"].tolist(),
            train_ratings["negatives"].tolist(),
        ):
            users.append(user)
            items.append(item)
            ratings.append(rating)
            # Negative samples get a 0 rating.
            users.extend([user] * len(negatives))
            items.extend(negatives)
            ratings.extend([float(0)] * len(negatives))

        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one pass over ``train_loader`` and return the mean batch loss.

        Returns 0.0 when the loader yields no batches.
        """
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            total_loss += self.train_single_batch(user, item, rating.float())
            cnt += 1
        return total_loss / cnt if cnt else 0.0

    def train_single_batch(self, users, items, ratings):
        """Take one optimisation step on a batch and return the loss value.

        Raises:
            RuntimeError: If the optimizer has not been created by ``fit``.
        """
        # NOTE(review): self.cuda is presumably the boolean flag stored by
        # PyTorchModel — confirm.
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        if self.optimizer is None:
            raise RuntimeError(
                "Optimizer is not initialized. Call fit() before training."
            )
        self.optimizer.zero_grad()
        ratings_pred = self(users, items)
        # NOTE(review): BCELoss expects targets in [0, 1]; this assumes the
        # ratings have been binarised upstream — confirm.
        loss = self.criterion(ratings_pred.view(-1), ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _sample_negative(self, ratings):
        """Return, per user, the set of catalogue items the user never rated.

        Also stores the set of all item ids in ``self.item_catalogue``.

        Returns:
            DataFrame with the columns ``userId`` and ``negative_items``.
        """
        interact_status = (
            ratings.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        self.item_catalogue = set(ratings.itemId)
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda x: self.item_catalogue - x
        )
        return interact_status[["userId", "negative_items"]]

    def forward(self, user_indices, item_indices):
        """Return the sigmoid score for each (user, item) index pair."""
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_product = torch.mul(user_embedding, item_embedding)
        return self.logistic(self.affine_output(element_product))
+22
View File
@@ -0,0 +1,22 @@
import torch.nn as nn
class Item2Vec(nn.Module):
    """Item embedding followed by a linear layer back onto the item space."""

    def __init__(self, config):
        """Build the layers from ``config['num_items']`` and ``config['latent_dim']``."""
        super().__init__()
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']

        # The embedding is created before the linear layer, as in the
        # original, so the parameter initialisation order is unchanged.
        self.embedding = nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim
        )
        self.fc = nn.Linear(in_features=self.latent_dim, out_features=self.num_items)

    def forward(self, input_data):
        """Embed the item indices and project them to one value per item."""
        return self.fc(self.embedding(input_data))

    def item_embedding(self):
        """Return the embedding weights detached from the autograd graph."""
        weights = self.embedding.weight
        return weights.detach()
+240
View File
@@ -0,0 +1,240 @@
from typing import Optional, Union
import numpy as np
import scipy.sparse as sp
from .recommender_model import RecommenderModel
from pygrex.data_reader import DataReader
class KNNBasic(RecommenderModel):
    """User-based k-nearest-neighbours collaborative filtering model.

    A prediction is the baseline estimate (global mean + user bias + item
    bias) corrected by the similarity-weighted deviations of the most similar
    users who rated the item. Similarity is the Pearson correlation over
    co-rated items, damped when few items are shared.

    NOTE(review): ``sim_options["name"]`` is validated ("cosine" or
    "pearson") but not used afterwards — Pearson is always computed.

    Args:
        k (int): Number of neighbors to consider. Default 50.
        min_k (int): Minimum number of neighbors required for prediction. Default 3.
        sim_options (dict): Similarity options; ``user_based`` must not be
            ``False`` and ``name`` must be "cosine" or "pearson".

    Raises:
        NotImplementedError: For item-based or unknown similarity options.
    """

    def __init__(self, k: int = 50, min_k: int = 3, sim_options: Optional[dict] = None):
        super().__init__()
        self.k = k
        self.min_k = min_k
        self.sim_options = sim_options if sim_options is not None else {}

        # Validate similarity options
        if self.sim_options.get("user_based", True) is False:
            raise NotImplementedError("Only the user-based approach is implemented.")
        sim_name = self.sim_options.get("name", "pearson").lower()
        if sim_name not in ["cosine", "pearson"]:
            raise NotImplementedError(
                "Only cosine and pearson similarity are implemented."
            )

        # Model attributes, populated by fit()
        self.trainset: Optional[sp.csr_matrix] = None
        self.global_mean: float = 0
        self.user_biases: Optional[np.ndarray] = None
        self.item_biases: Optional[np.ndarray] = None
        self.num_users: Optional[int] = None
        self.num_items: Optional[int] = None
        self.user_means: Optional[np.ndarray] = None

    def fit(self, data: DataReader) -> None:
        """Build the sparse rating matrix and the user/item biases.

        Args:
            data: Reader exposing ``dataset`` (a ratings DataFrame with
                ``userId``, ``itemId`` and ``rating`` columns) and the
                ``num_user`` / ``num_item`` counts.
        """
        print("Fitting the improved KNNBasic model...")
        df = data.dataset
        self.num_users = data.num_user
        self.num_items = data.num_item
        print(
            f"Building ratings matrix for {self.num_users} users and {self.num_items} items..."
        )

        # 1. Build the sparse user-item ratings matrix
        ratings = df["rating"].values
        rows = df["userId"].values
        cols = df["itemId"].values
        self.trainset = sp.csr_matrix(
            (ratings, (rows, cols)), shape=(self.num_users, self.num_items)
        )

        # 2. Calculate global mean and biases
        print("Computing biases...")
        self.global_mean = float(self.trainset.data.mean())

        # User biases: bu = avg(ratings_u) - global_mean
        user_sums = np.array(self.trainset.sum(axis=1)).flatten()
        user_counts = np.diff(self.trainset.indptr)
        # Users without ratings would divide by zero; np.where replaces
        # those entries, so the warning is silenced.
        with np.errstate(divide="ignore", invalid="ignore"):
            user_avg_ratings = np.where(
                user_counts > 0, user_sums / user_counts, self.global_mean
            )
        self.user_biases = np.where(
            user_counts > 0, user_avg_ratings - self.global_mean, 0
        )

        # Item biases: bi = avg(ratings_i) - global_mean
        item_sums = np.array(self.trainset.sum(axis=0)).flatten()
        item_counts = np.diff(self.trainset.tocsc().indptr)
        with np.errstate(divide="ignore", invalid="ignore"):
            item_avg_ratings = np.where(
                item_counts > 0, item_sums / item_counts, self.global_mean
            )
        self.item_biases = np.where(
            item_counts > 0, item_avg_ratings - self.global_mean, 0
        )

        self.user_means = user_avg_ratings
        print("Model fitting complete.")

    @staticmethod
    def _pearson_similarity(user1_ratings: np.ndarray, user2_ratings: np.ndarray) -> float:
        """Return the damped Pearson correlation of two dense rating vectors."""
        # Entries <= 0 are treated as "not rated".
        mask = (user1_ratings > 0) & (user2_ratings > 0)
        n_common = np.sum(mask)

        # Need at least 2 common ratings for correlation
        if n_common < 2:
            return 0.0

        u1_common = user1_ratings[mask]
        u2_common = user2_ratings[mask]
        u1_centered = u1_common - np.mean(u1_common)
        u2_centered = u2_common - np.mean(u2_common)

        numerator = np.sum(u1_centered * u2_centered)
        denom1 = np.sqrt(np.sum(u1_centered**2))
        denom2 = np.sqrt(np.sum(u2_centered**2))
        if denom1 == 0 or denom2 == 0:
            return 0.0
        correlation = numerator / (denom1 * denom2)

        # Significance weighting: similarities based on fewer than 50 common
        # items are scaled down proportionally.
        significance_weight = min(n_common / 50.0, 1.0)
        return float(correlation * significance_weight)

    def _compute_user_similarity(self, user1_id: int, user2_id: int) -> float:
        """Return the damped Pearson correlation between two users."""
        assert self.trainset is not None
        user1_ratings = self.trainset[user1_id].toarray().flatten()
        user2_ratings = self.trainset[user2_id].toarray().flatten()
        return self._pearson_similarity(user1_ratings, user2_ratings)

    def _get_neighbors_for_item(self, user_id: int, item_id: int):
        """Return the top-k most similar users who have rated ``item_id``.

        Returns:
            Three arrays (similarities, user ids, their ratings of the item),
            all empty when fewer than ``min_k`` candidates exist.
        """
        assert self.trainset is not None
        no_neighbors = (np.array([]), np.array([]), np.array([]))

        # Find users who rated this item, excluding the target user
        item_col = self.trainset[:, item_id]  # type: ignore
        neighbor_candidates, _ = item_col.nonzero()
        neighbor_candidates = neighbor_candidates[neighbor_candidates != user_id]
        if len(neighbor_candidates) == 0:
            return no_neighbors

        # Densify the target user's row once instead of once per candidate.
        target_ratings = self.trainset[user_id].toarray().flatten()
        similarities = [
            (
                self._pearson_similarity(
                    target_ratings, self.trainset[neighbor_id].toarray().flatten()
                ),
                neighbor_id,
            )
            for neighbor_id in neighbor_candidates
        ]

        # Sort by similarity and take top-k
        similarities.sort(key=lambda x: x[0], reverse=True)
        top_k = similarities[: self.k]
        if len(top_k) < self.min_k:
            return no_neighbors

        neighbor_sims = np.array([sim for sim, _ in top_k])
        neighbor_ids = np.array([nid for _, nid in top_k])
        neighbor_ratings = np.array(
            [self.trainset[nid, item_id] for nid in neighbor_ids]
        )
        return neighbor_sims, neighbor_ids, neighbor_ratings

    def predict(self, user_id: Union[int, str], item_id: Union[int, str]) -> float:
        """Predict the rating of ``item_id`` by ``user_id``.

        Ids outside ``[0, num_users)`` / ``[0, num_items)`` yield the global
        mean. Without usable neighbours the baseline estimate is returned;
        otherwise the neighbour-corrected estimate clipped to [1.0, 5.0].

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.trainset is None:
            raise RuntimeError("Model must be trained first using fit() method.")
        assert self.num_users is not None
        assert self.num_items is not None
        assert self.user_biases is not None
        assert self.item_biases is not None

        user_id = int(user_id)
        item_id = int(item_id)

        # Handle out-of-bounds users/items. Negative ids are rejected too:
        # they would otherwise index from the end of the arrays.
        if not (0 <= user_id < self.num_users and 0 <= item_id < self.num_items):
            return float(self.global_mean)

        # 1. Calculate baseline estimate
        baseline = float(
            self.global_mean + self.user_biases[user_id] + self.item_biases[item_id]
        )

        # 2. Get neighbors who rated this item
        neighbor_sims, neighbor_ids, neighbor_ratings = self._get_neighbors_for_item(
            user_id, item_id
        )
        if len(neighbor_ids) == 0:
            return baseline

        # 3. Deviation of each neighbour's rating from that neighbour's own
        #    baseline for the item
        neighbor_baselines = (
            self.global_mean + self.user_biases[neighbor_ids] + self.item_biases[item_id]
        )
        deviations = neighbor_ratings - neighbor_baselines

        # Only use neighbors with positive similarity
        positive_mask = neighbor_sims > 0
        if not np.any(positive_mask):
            return baseline
        neighbor_sims = neighbor_sims[positive_mask]
        deviations = deviations[positive_mask]

        denominator = np.sum(np.abs(neighbor_sims))
        if denominator == 0:
            return baseline
        prediction = baseline + np.sum(neighbor_sims * deviations) / denominator

        # Clip to valid rating range
        return float(np.clip(prediction, 1.0, 5.0))
+136
View File
@@ -0,0 +1,136 @@
import numpy as np
import scipy
from typing import Union, Protocol, runtime_checkable
from implicit.recommender_base import RecommenderBase
from .recommender_model import RecommenderModel
from pygrex.data_reader import DataReader
@runtime_checkable
class FittableImplicitModel(Protocol):
    """Structural type of a model that exposes factor matrices and ``fit``."""

    # Latent factor matrices read by MFImplicitModel.
    user_factors: np.ndarray
    item_factors: np.ndarray

    def fit(self, item_user_data) -> None:
        """Train the model on ``item_user_data``."""
        ...
class MFImplicitModel(RecommenderModel):
    """Base class for matrix-factorisation models provided by ``implicit``.

    Subclasses assign the concrete model to ``self.model``; this class turns
    the ratings into a sparse interaction matrix, delegates training and
    scores (user, item) pairs from the learned factors.

    Args:
        latent_dim: Number of latent factors.
        reg_term: Regularisation strength.
        learning_rate: Learning rate.
        epochs: Number of training iterations.
        num_users: Initial value of ``total_users``; overwritten by ``fit``.
        num_items: Initial value of ``total_items``; overwritten by ``fit``.
    """

    def __init__(
        self,
        latent_dim,
        reg_term,
        learning_rate,
        epochs,
        num_users=None,
        num_items=None,
    ):
        self.latent_dim = latent_dim
        self.reg_term = reg_term
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.model: Union[RecommenderBase, FittableImplicitModel, None] = None
        self.total_users = num_users
        self.total_items = num_items

    def _require_trained_model(self) -> FittableImplicitModel:
        """Return ``self.model`` if its factor matrices are available.

        Raises:
            RuntimeError: If no model is set or its factors are still ``None``.
        """
        model = self.model
        # The protocol check only verifies that the attributes exist, so the
        # factors are additionally checked for None.
        if (
            not isinstance(model, FittableImplicitModel)
            or model.user_factors is None
            or model.item_factors is None
        ):
            raise RuntimeError(
                "The model has not been trained yet. Please call fit() first."
            )
        return model

    def fit(self, data: DataReader) -> None:
        """Train the underlying model on the interactions in ``data``.

        The matrix dimensions are derived from the largest user and item id
        in ``data.dataset`` and stored in ``total_users`` / ``total_items``.

        Raises:
            RuntimeError: If no concrete model has been assigned.
        """
        if self.model is None:
            raise RuntimeError(
                "The model has not been initialized. Please use a specific subclass like ALS or BPR."
            )

        num_user_for_shape = int(data.dataset["userId"].max()) + 1
        num_item_for_shape = int(data.dataset["itemId"].max()) + 1
        self.total_users = num_user_for_shape
        self.total_items = num_item_for_shape

        # NOTE(review): the user x item matrix is transposed before training,
        # i.e. the model receives an item x user matrix — confirm that this
        # is the orientation the installed implicit version expects.
        item_user_data = self.rearrange_dataset(
            ds=data.dataset,
            num_user=num_user_for_shape,
            num_item=num_item_for_shape,
        ).T.tocsr()
        self.model.fit(item_user_data)

    @staticmethod
    def rearrange_dataset(ds, num_user: int, num_item: int) -> scipy.sparse.csr_matrix:
        """
        Converts the dataset into a sparse matrix format for the implicit model.

        Args:
            ds: Dataset containing userId and itemId columns
            num_user : Number of users in the dataset
            num_item : Number of items in the dataset

        Returns:
            ds_mtr: Sparse user x item matrix with a 1 for each interaction
        """
        data = np.ones(len(ds))  # Array of 1s for each interaction
        rows = ds["userId"].values  # User IDs as row indices
        cols = ds["itemId"].values  # Item IDs as column indices
        ds_mtr = scipy.sparse.csr_matrix(
            (data, (rows, cols)), shape=(num_user, num_item)
        )
        return ds_mtr

    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int, list, np.ndarray]
    ) -> Union[float, list]:
        """
        Predict scores for a user and one or more items.

        Args:
            user_id : User identifier
            item_id : Item identifier or a list/array of item identifiers

        Returns:
            A single predicted score (float) for a single item, otherwise a
            list of scores in the order of ``item_id``

        Raises:
            RuntimeError: If the model has not been trained yet.
            ValueError: If the user id or any item id is out of bounds.
        """
        model = self._require_trained_model()

        user_id = int(user_id)
        # 1. Validate user_id
        if not (0 <= user_id < model.user_factors.shape[0]):
            raise ValueError(f"user_id {user_id} is out of bounds")

        # 2. Unify input to always be a numpy array
        is_single_item = not isinstance(item_id, (list, np.ndarray))
        item_ids_arr = np.array(item_id, ndmin=1).astype(int)

        # 3. Vectorized bounds check for all items at once
        max_item_id = model.item_factors.shape[0]
        out_of_bounds = (item_ids_arr < 0) | (item_ids_arr >= max_item_id)
        if np.any(out_of_bounds):
            raise ValueError(f"item_id {item_ids_arr[out_of_bounds][0]} is out of bounds")

        # 4. Score all requested items with one dot product
        item_vectors = model.item_factors[item_ids_arr]
        user_vector = model.user_factors[user_id]
        scores = user_vector.dot(item_vectors.T)

        # 5. Single float for a single item, otherwise a list
        return scores[0].item() if is_single_item else scores.tolist()

    def user_embedding(self) -> np.ndarray:
        """Return the user factor matrix of the trained model.

        Raises:
            RuntimeError: If the model has not been trained yet.
        """
        return self._require_trained_model().user_factors

    def item_embedding(self) -> np.ndarray:
        """Return the item factor matrix of the trained model.

        Raises:
            RuntimeError: If the model has not been trained yet.
        """
        return self._require_trained_model().item_factors
+179
View File
@@ -0,0 +1,179 @@
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer
from tqdm.auto import tqdm
from pygrex.data_reader import DataReader, UserItemRatingDataset
from pygrex.utils.torch_utils import use_optimizer
from .py_torch_model import PyTorchModel
class MLPModel(PyTorchModel):
    """Embedding recommender with a single linear layer and sigmoid output.

    User and item embeddings are concatenated, projected to one logit and
    squashed with a sigmoid, so every prediction lies in (0, 1). Training
    pairs each observed interaction with randomly drawn non-interacted items
    (target 0.0) and minimises binary cross-entropy.
    """

    def __init__(
        self,
        learning_rate: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        num_negative: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        """Store hyper-parameters and build the layers that do not depend on data.

        Args:
            learning_rate: Step size handed to the optimizer.
            weight_decay: Weight decay handed to the optimizer.
            latent_dim: Width of each embedding vector.
            epochs: Number of passes over the training data.
            num_negative: Negative items sampled per observed interaction.
            batch_size: Mini-batch size of the training loader.
            cuda: Whether tensors (and the model) are moved to the GPU.
            optimizer_name: Optimizer key understood by ``use_optimizer``.
            device_id: Optional CUDA device index forwarded to the base class.
        """
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )
        self.negative_sample_size = num_negative
        self.weight_decay = weight_decay

        # The linear layer sees the user and item embeddings side by side,
        # hence twice the latent dimension.
        self.affine_output = torch.nn.Linear(
            in_features=2 * self.latent_dim, out_features=1
        )
        self.logistic = torch.nn.Sigmoid()
        # NOTE(review): BCELoss expects targets in [0, 1]; positive samples use
        # the dataset's ``rating`` value as-is - confirm ratings are binarised.
        self.criterion = nn.BCELoss()
        self.optimizer: Optimizer | None = None

    def fit(self, data: DataReader):
        """Create the embedding tables for ``data`` and train the model.

        Args:
            data: Reader exposing ``dataset`` (a frame with ``userId``,
                ``itemId`` and ``rating`` columns), ``num_user`` and
                ``num_item``.

        Raises:
            TypeError: If ``use_optimizer`` does not return a torch optimizer.
        """
        dataset = data.dataset

        # The embeddings have to exist BEFORE the optimizer is created;
        # otherwise the optimizer is built without their parameters and the
        # embeddings would stay at their random initialisation.
        self.embedding_user = torch.nn.Embedding(
            num_embeddings=data.num_user, embedding_dim=self.latent_dim
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=data.num_item, embedding_dim=self.latent_dim
        )

        # Batches are moved to the GPU during training and prediction, so the
        # parameters must live there as well.
        if self._cuda:
            self.cuda()

        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )
        if not isinstance(optimizer, Optimizer):
            raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
        self.optimizer = optimizer

        self.negatives = self._sample_negative(dataset)

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                # Negatives are re-drawn every epoch.
                train_loader = self.instance_a_train_loader(
                    dataset, self.negative_sample_size, self.batch_size
                )
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def instance_a_train_loader(self, dataset, num_negatives, batch_size):
        """Build the shuffled training loader for one epoch.

        Every row of ``dataset`` contributes its own (user, item, rating)
        sample plus up to ``num_negatives`` samples of items the user never
        interacted with, each with target ``0.0``.

        Args:
            dataset: Interaction frame with ``userId``, ``itemId``, ``rating``.
            num_negatives: Negatives to draw per interaction. Capped at the
                number of negatives available for the user.
            batch_size: Mini-batch size of the returned loader.

        Returns:
            A ``DataLoader`` over a ``UserItemRatingDataset``.
        """
        train_ratings = pd.merge(
            dataset, self.negatives[["userId", "negative_items"]], on="userId"
        )

        users, items, ratings = [], [], []
        for user, item, rating, candidates in zip(
            train_ratings["userId"].tolist(),
            train_ratings["itemId"].tolist(),
            train_ratings["rating"].tolist(),
            train_ratings["negative_items"].tolist(),
        ):
            users.append(user)
            items.append(item)
            ratings.append(rating)

            # random.sample needs a sequence and raises when asked for more
            # elements than exist, so convert the set and cap the draw.
            pool = list(candidates)
            for neg_item in random.sample(pool, min(num_negatives, len(pool))):
                users.append(user)
                items.append(neg_item)
                ratings.append(0.0)  # negative samples get 0 rating

        samples = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(samples, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Run one optimisation pass and return the mean batch loss.

        Returns ``0.0`` when the loader yields no batches.

        Raises:
            TypeError: If a batch does not carry user ids as a LongTensor.
        """
        self.train()
        batch_count = 0
        total_loss = 0
        for batch in train_loader:
            user, item, rating = batch[0], batch[1], batch[2]
            if not isinstance(user, torch.LongTensor):
                raise TypeError(
                    f"Expected user ids as a LongTensor, but got {type(user)}"
                )
            total_loss += self.train_single_batch(user, item, rating.float())
            batch_count += 1
        return total_loss / batch_count if batch_count else 0.0

    def train_single_batch(self, users, items, ratings):
        """Perform one gradient step and return the batch loss as a float.

        Raises:
            RuntimeError: If called before ``fit`` created the optimizer.
        """
        # ``self.cuda`` is the nn.Module method; the boolean flag is ``_cuda``.
        if self._cuda:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        if self.optimizer is None:
            raise RuntimeError(
                "Optimizer is not initialized. Call fit() before training."
            )
        self.optimizer.zero_grad()
        ratings_pred = self(users, items)
        loss = self.criterion(ratings_pred.view(-1), ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _sample_negative(self, ratings):
        """Collect, per user, every catalogue item the user has not interacted with.

        Also stores the set of all item ids as ``self.item_catalogue``.

        Returns:
            A frame with columns ``userId`` and ``negative_items`` (a set).
        """
        interact_status = (
            ratings.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        self.item_catalogue = set(ratings.itemId)
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda seen: self.item_catalogue - seen
        )
        return interact_status[["userId", "negative_items"]]

    def forward(self, user_indices, item_indices):
        """Score user/item index tensors; returns sigmoid outputs of shape (N, 1).

        A single user may be paired with many items (or vice versa); the
        singleton side is repeated to match the other one.
        """
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)

        # Ensure embeddings are 2D [batch_size, embedding_dim]
        if user_embedding.dim() == 3:
            user_embedding = user_embedding.squeeze(1)
        if item_embedding.dim() == 3:
            item_embedding = item_embedding.squeeze(1)

        # torch.cat does not broadcast, so expand the singleton side by hand.
        if user_embedding.size(0) == 1 and item_embedding.size(0) > 1:
            user_embedding = user_embedding.repeat(item_embedding.size(0), 1)
        elif item_embedding.size(0) == 1 and user_embedding.size(0) > 1:
            item_embedding = item_embedding.repeat(user_embedding.size(0), 1)

        element_concat = torch.cat((user_embedding, item_embedding), 1)
        return self.logistic(self.affine_output(element_concat))
+69
View File
@@ -0,0 +1,69 @@
import itertools
from typing import Union
import torch
from pygrex.utils.torch_utils import use_cuda
from .recommender_model import RecommenderModel
from pygrex.data_reader import DataReader
class PyTorchModel(RecommenderModel, torch.nn.Module):
    """Shared scaffolding for embedding-based PyTorch recommenders.

    The base class stores the training hyper-parameters and offers
    ``predict`` plus accessors for the learned embeddings. Subclasses are
    expected to assign ``embedding_user`` / ``embedding_item`` and to
    implement ``forward(user_indices, item_indices)`` and ``fit``.
    """

    def __init__(
        self,
        learning_rate: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id: Union[int, None] = None,
    ):
        """Validate and store the hyper-parameters.

        Args:
            learning_rate: Optimizer step size.
            latent_dim: Width of the embedding vectors.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            cuda: Whether inputs are moved to the GPU.
            optimizer_name: One of ``"sgd"``, ``"adam"`` or ``"rmsprop"``.
            device_id: CUDA device index; only used when ``cuda`` is True.

        Raises:
            ValueError: If ``optimizer_name`` is not supported.
        """
        if optimizer_name not in ("sgd", "adam", "rmsprop"):
            raise ValueError(
                "Wrong optimizer. Expected one of 'sgd', 'adam', 'rmsprop', "
                f"got {optimizer_name!r}."
            )

        # Initialise nn.Module first so later attribute assignments (including
        # sub-modules set by subclasses) are registered correctly.
        super().__init__()

        if cuda is True and device_id is not None:
            # presumably selects the active CUDA device - see use_cuda
            use_cuda(True, device_id)

        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self._cuda = cuda
        self.optimizer_name = optimizer_name
        self.dataset = None
        self.dataset_metadata = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer = None

    def fit(self, data: DataReader):
        """No-op placeholder; concrete models override this with their training loop."""
        pass

    def predict(self, user_id, item_id) -> list:
        """Score one or many user/item pairs without tracking gradients.

        Args:
            user_id: A single id or a sequence/array/tensor of user indices.
            item_id: A single id or a sequence/array/tensor of item indices.

        Returns:
            Flat list of the values produced by ``forward``.
        """
        users = self._as_index_tensor(user_id)
        items = self._as_index_tensor(item_id)
        with torch.no_grad():
            if self._cuda:
                users = users.cuda()
                items = items.cuda()
            scores = self.forward(users, items).cpu().tolist()
        # forward yields one row per pair; flatten the rows into one list.
        return list(itertools.chain.from_iterable(scores))

    @staticmethod
    def _as_index_tensor(ids) -> torch.Tensor:
        """Convert ids to an int64 tensor, promoting scalars to length-1 vectors.

        Handles builtin ints as well as NumPy integer scalars, which are not
        instances of ``int``.
        """
        tensor = torch.as_tensor(ids, dtype=torch.long)
        return tensor.unsqueeze(0) if tensor.dim() == 0 else tensor

    def user_embedding(self):
        """Return the user embedding matrix as a NumPy array."""
        return self.state_dict()["embedding_user.weight"].cpu().numpy()

    def item_embedding(self):
        """Return the item embedding matrix as a NumPy array."""
        return self.state_dict()["embedding_item.weight"].cpu().numpy()
+35
View File
@@ -0,0 +1,35 @@
from abc import ABC, abstractmethod
from typing import Union
from pygrex.data_reader.data_reader import DataReader
class RecommenderModel(ABC):
    """
    Contract shared by every recommendation model.

    Concrete models must provide both ``fit`` and ``predict``; the class
    cannot be instantiated until they do.
    """

    @abstractmethod
    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int]
    ) -> Union[float, list]:
        """
        Predict the rating/score of ``item_id`` for ``user_id``.

        Args:
            user_id: The ID of the user
            item_id: The ID of the item to score

        Returns:
            A single score or a list of scores, depending on the
            implementation
        """

    @abstractmethod
    def fit(self, data: DataReader):
        """
        Train the model on ``data``.

        Anything beyond the reader itself is up to the implementation.
        """
+169
View File
@@ -0,0 +1,169 @@
from math import sqrt
import numpy as np
from pygrex.data_reader.data_reader import DataReader
from pygrex.models.recommender_model import RecommenderModel
class SVD(RecommenderModel):
def __init__(
    self,
    n_factors=50,
    n_epochs=25,
    lr=0.007,
    reg=0.1,
    init_mean=0.0,
    init_std=0.1,
    random_state=42,
    early_stopping=True,
):
    """Store the hyper-parameters; learned state stays unset until training.

    Args:
        n_factors: Number of latent factors per user and per item.
        n_epochs: Maximum number of training epochs.
        lr: SGD learning rate.
        reg: L2 regularisation strength.
        init_mean: Mean of the normal distribution used to initialise factors.
        init_std: Stored as given.
        random_state: Seed for the random number generator.
        early_stopping: Whether validation-based early stopping may be used.
    """
    # Learned parameters - all None until the model has been trained.
    self.global_mean = None
    self.user_biases = None
    self.item_biases = None
    self.user_factors = None
    self.item_factors = None

    # RMSE values recorded while training.
    self.training_rmse = []

    # Hyper-parameters, kept verbatim.
    self.early_stopping = early_stopping
    self.random_state = random_state
    self.init_std = init_std
    self.init_mean = init_mean
    self.reg = reg
    self.lr = lr
    self.n_epochs = n_epochs
    self.n_factors = n_factors
def fit(self, data: DataReader, validation_data=None):
"""Train biases and latent factors with per-rating SGD.

Args:
data: Reader exposing ``dataset`` (a frame with ``userId``, ``itemId``
and ``rating`` columns) plus ``_num_user`` / ``_num_item``.
validation_data: Optional iterable of ``(user, item, rating)`` triples;
only consulted when ``early_stopping`` is enabled.

Raises:
ValueError: If the reader reports no user count or no item count.
"""
df = data.dataset
if data._num_user is None or data._num_item is None:
raise ValueError("The number of users and items cannot be None.")
num_users, num_items = data._num_user, data._num_item
# Seeded generator: drives both the initialisation and the per-epoch shuffle
rng = np.random.RandomState(self.random_state)
# Factors are drawn from Normal(init_mean, 1/sqrt(n_factors)).
# NOTE: self.init_std is not read here; the spread depends on n_factors only.
scale = 1.0 / sqrt(self.n_factors)
self.user_factors = rng.normal(
self.init_mean, scale, (num_users, self.n_factors)
) # type: ignore
self.item_factors = rng.normal(
self.init_mean, scale, (num_items, self.n_factors)
) # type: ignore
# Biases start at zero; the global mean is the baseline prediction
self.user_biases = np.zeros(num_users)
self.item_biases = np.zeros(num_items)
self.global_mean = df["rating"].mean()
# Convert to list of tuples for faster iteration
ratings_tuple = list(
df[["userId", "itemId", "rating"]].itertuples(index=False, name=None)
)
# Early-stopping state: stop after `patience` validation checks without improvement
best_rmse = float("inf")
patience = 3
patience_counter = 0
for epoch in range(self.n_epochs):
print(f"Epoch {epoch + 1}/{self.n_epochs}...")
# Shuffle training data (in place) so each epoch visits ratings in a new order
rng.shuffle(ratings_tuple)
# SGD updates: one step per observed rating.
# user/item ids are used directly as row indices into the parameter arrays.
for user, item, rating in ratings_tuple:
# Predict rating: global mean + user bias + item bias + factor dot product
dot_product = np.dot(self.user_factors[user], self.item_factors[item])
prediction = (
self.global_mean
+ self.user_biases[user]
+ self.item_biases[item]
+ dot_product
)
# Compute error
error = rating - prediction
# Update biases (gradient step with L2 shrinkage)
self.user_biases[user] += self.lr * (
error - self.reg * self.user_biases[user]
)
self.item_biases[item] += self.lr * (
error - self.reg * self.item_biases[item]
)
# Update factors; keep a copy of the user vector so the item step
# uses the pre-update values
uf_temp = self.user_factors[user].copy()
self.user_factors[user] += self.lr * (
error * self.item_factors[item] - self.reg * self.user_factors[user]
)
self.item_factors[item] += self.lr * (
error * uf_temp - self.reg * self.item_factors[item]
)
# Record training RMSE on every fifth epoch (0, 5, 10, ...) and on the last one
if epoch % 5 == 0 or epoch == self.n_epochs - 1:
train_rmse = self.calculate_rmse(ratings_tuple)
self.training_rmse.append(train_rmse)
print(f" Training RMSE: {train_rmse:.4f}")
# Early stopping on validation RMSE.
# NOTE(review): confirm whether this check is meant to run every epoch or
# only on the RMSE-reporting epochs above.
if self.early_stopping and validation_data is not None:
val_rmse = self.calculate_rmse(validation_data)
print(f" Validation RMSE: {val_rmse:.4f}")
if val_rmse < best_rmse:
best_rmse = val_rmse
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
print(f"Early stopping at epoch {epoch + 1}")
break
print("Fit complete.")
def calculate_rmse(self, ratings_data):
    """Return the root-mean-square error of ``predict`` over rating triples.

    ``ratings_data`` yields ``(user, item, rating)`` triples. An empty input
    gives ``0``.
    """
    squared_errors = [
        (rating - self.predict(user, item)) ** 2
        for user, item, rating in ratings_data
    ]
    if not squared_errors:
        return 0
    return sqrt(sum(squared_errors) / len(squared_errors))
def predict(self, user_id: int | str, item_id: int | str) -> float:
# Check that all model components are initialized
if (
self.user_factors is None
or self.item_factors is None
or self.user_biases is None
or self.item_biases is None
or self.global_mean is None
):
raise RuntimeError("The model has not been trained yet.")
try:
user_id = int(user_id)
item_id = int(item_id)
except (ValueError, TypeError):
# If conversion fails, return the global mean rating
return self.global_mean
# Make prediction
dot_product = np.dot(self.user_factors[user_id], self.item_factors[item_id])
prediction = (
self.global_mean
+ self.user_biases[user_id]
+ self.item_biases[item_id]
+ dot_product
)
# Clip to valid rating range
return np.clip(prediction, 1, 5)
+4
View File
@@ -0,0 +1,4 @@
from .recommender import Recommender
from .group_recommender import GroupRecommender
__all__ = ["Recommender", "GroupRecommender"]
+72
View File
@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
class GenericRecommender:
    """Base class that turns model predictions into top-N rankings.

    Subclasses must provide ``recommend_user(user_id, user_ratings)``, which
    ``recommend_all`` calls once per user.
    """

    def __init__(self, dataset_metadata, model, top_n: int = 10):
        """
        :param dataset_metadata: object exposing the interactions as ``.dataset``
            (a frame with at least ``userId`` and ``itemId`` columns).
        :param model: trained model used by subclasses to score items.
        :param top_n: int, number of recommendations kept per user.
        """
        self.top_n = top_n
        self.dataset = dataset_metadata.dataset
        self.model = model
        self.catalogue = set(self.dataset["itemId"])

    def recommend_all(self):
        """
        Get recommendations for every user in the dataset.

        :return: dataframe [userId, itemId, rank] with the rankings of all users.
        """
        # The empty seed frame guarantees the expected columns even when the
        # dataset has no users.
        frames = [pd.DataFrame({"userId": [], "itemId": [], "rank": []})]
        with tqdm(
            total=self.dataset["userId"].nunique(), desc="Recommending for users: "
        ) as pbar:
            for user_id, user_ratings in self.dataset.groupby("userId"):
                frames.append(self.recommend_user(user_id, user_ratings))  # type: ignore
                pbar.update()
        # Concatenate once: growing the frame inside the loop copies all
        # previous rows on every iteration.
        return pd.concat(frames, ignore_index=True)

    def rank_prediction(self, user_id, target_item_id, predictions):
        """
        Rank items by predicted score and keep the best ``top_n``.

        :param user_id: user Id(s) the predictions belong to.
        :param target_item_id: list, item Ids that were scored.
        :param predictions: scores aligned with ``target_item_id``.
        :return: dataframe [userId, itemId, rank]; rank 1 is the best item,
            ties are broken by order of appearance.
        """
        # Ensure predictions are flattened if they're 2D
        if isinstance(predictions, np.ndarray) and predictions.ndim > 1:
            predictions = predictions.flatten()

        recommendations = pd.DataFrame(
            {"userId": user_id, "itemId": target_item_id, "prediction": predictions}
        )
        recommendations["rank"] = recommendations["prediction"].rank(
            method="first", ascending=False
        )
        recommendations.sort_values(["userId", "rank"], inplace=True)
        recommendations = recommendations[recommendations["rank"] <= self.top_n]
        return recommendations[["userId", "itemId", "rank"]]

    def get_unrated(self, user_ratings):
        """
        Extract the items of the catalogue a user has not rated.

        :param user_ratings: list, items rated.
        :return: list, items not rated.
        """
        return list(self.catalogue - set(user_ratings))

    def get_rated(self, user_id):
        """
        Extract the interactions of a user.

        :param user_id: userId to look up.
        :return: dataframe, the dataset rows that belong to ``user_id``.
        """
        return self.dataset[self.dataset["userId"] == user_id]
+391
View File
@@ -0,0 +1,391 @@
from typing import Dict, List, Union, Optional
import numpy as np
from pygrex.data_reader.data_reader import DataReader
from pygrex.models.recommender_model import RecommenderModel
from pygrex.utils.aggregation_strategy import ScoreAggregator, AggregationStrategy
from pygrex.utils.scale import Scale
class GroupRecommender:
    """
    A class to represent a group recommender system that follows the workflow:

    1. Setup and Candidate Selection
    2. Individual Preference Collection
    3. Score Aggregation
    4. Final Recommendation List
    """

    def __init__(self, data: DataReader):
        """Initialize the group recommender with data.

        Args:
            data: The dataset containing user-item interactions.
        """
        self.data = data
        self._group_predictions = None
        self._members = None
        self._item_pool = None
        self._model = None
        self._aggregation_strategy = None
        self._score_aggregator = None
        self._aggregated_scores = None
        self._top_recommendation = None

    def setup_recommendation(
        self,
        model: RecommenderModel,
        members: List[Union[str, int]],
        data: DataReader,
        aggregation_strategy: AggregationStrategy,  # type: ignore
        most_respected_person: Optional[Union[str, int]] = None,
    ) -> None:
        """
        Setup and Candidate Selection: Initialize the group recommendation process.

        Runs the whole pipeline: builds the candidate pool, collects each
        member's predictions and aggregates them.

        Args:
            model: The recommendation model to use
            members: List of user IDs representing the group members
            data: DataReader object containing the dataset
            aggregation_strategy: Strategy for aggregating individual predictions
            most_respected_person: User ID of most respected person (required for MRP strategy)
        """
        self._members = members
        self._model = model
        self._aggregation_strategy = aggregation_strategy

        # A previous setup may have cached its winner; drop it so that
        # get_top_recommendation() reflects this group and strategy.
        self._top_recommendation = None

        # Initialize score aggregator
        self._score_aggregator = ScoreAggregator(
            most_respected_person=most_respected_person
        )

        # get all item IDs from the dataset
        item_ids = data.dataset["itemId"].unique()

        # Get items that no group member has interacted with
        self._item_pool = self.get_non_interacted_items_for_recommendation(
            self.data,
            item_ids,  # type: ignore
            members,  # type: ignore
        )

        # Filter item_pool to only include IDs that are valid for the model.
        # This prevents out-of-bounds errors when the model was trained with a
        # different number of items than what's currently in the dataset.
        max_item_id = self._get_max_valid_item_id(model)
        item_pool_int = self._item_pool.astype(int)
        valid_mask = (item_pool_int >= 0) & (item_pool_int < max_item_id)
        self._item_pool = item_pool_int[valid_mask]

        # Individual Preference Collection: Generate predictions for each group member
        self._group_predictions = self._generate_group_predictions()

        # Score Aggregation: Aggregate individual predictions into collective scores
        self._aggregated_scores = self._aggregate_group_scores()

    def _generate_group_predictions(self) -> Dict[Union[str, int], Dict[int, float]]:
        """
        Individual Preference Collection: Generate predictions for all group members.

        Returns:
            A dictionary with user IDs as keys and their predictions as values

        Raises:
            ValueError: If setup_recommendation has not provided members,
                model and item pool.
        """
        if not self._members or self._model is None or self._item_pool is None:
            raise ValueError(
                "You must call setup_recommendation before generating predictions"
            )

        predictions = {}
        for member in self._members:
            predictions[member] = self.generate_recommendation(
                self._model,
                member,
                self._item_pool,  # type: ignore
                self.data,  # type: ignore
            )
        return predictions

    def _aggregate_group_scores(self) -> Dict[int, float]:
        """
        Score Aggregation: Aggregate individual predictions into collective scores.

        Returns:
            Dictionary mapping item IDs to aggregated scores, ordered from
            highest to lowest score

        Raises:
            ValueError: If setup_recommendation has not been called.
        """
        if (
            self._group_predictions is None
            or self._score_aggregator is None
            or self._aggregation_strategy is None
        ):
            raise ValueError(
                "You must call setup_recommendation before aggregating scores"
            )

        # For Borda Count, we need to create rankings from predictions
        rankings = None
        if self._aggregation_strategy == AggregationStrategy.BORDA_COUNT:
            rankings = self._create_rankings_from_predictions()

        aggregated_scores = self._score_aggregator.aggregate_scores(
            evaluations=self._group_predictions,  # type: ignore
            strategy=self._aggregation_strategy,
            rankings=rankings,  # type: ignore
        )

        # Sort items by their aggregated scores in descending order
        sorted_scores = dict(
            sorted(aggregated_scores.items(), key=lambda x: x[1], reverse=True)
        )
        return sorted_scores  # type: ignore

    def _create_rankings_from_predictions(self) -> Dict[Union[str, int], List[int]]:
        """
        Create rankings from predictions for Borda Count aggregation.

        Returns:
            Dictionary mapping user IDs to ranked lists of item IDs (best first)

        Raises:
            ValueError: If no group predictions are available.
        """
        if self._group_predictions is None:
            raise ValueError("Group predictions not available")

        rankings = {}
        for user_id, predictions in self._group_predictions.items():
            # Sort items by prediction score in descending order
            sorted_items = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
            rankings[user_id] = [item_id for item_id, _ in sorted_items]
        return rankings

    def _get_max_valid_item_id(self, model: RecommenderModel) -> int:
        """
        Get the maximum valid item ID for the given model.

        Args:
            model: The recommendation model

        Returns:
            Maximum valid item ID (exclusive, so valid IDs are [0, max_item_id))
        """
        # Models wrapping an inner ``model`` with ``item_factors``: use its row count
        if hasattr(model, "model") and model.model is not None:
            if hasattr(model.model, "item_factors"):
                return model.model.item_factors.shape[0]

        # Check if model has total_items attribute (set during fit)
        if hasattr(model, "total_items") and model.total_items is not None:
            return model.total_items

        # Fallback to data.num_item if model shape is not available
        return self.data.num_item

    def get_non_interacted_items_for_recommendation(
        self,
        data: DataReader,
        item_ids: List[Union[str, int]],
        members: List[Union[str, int]],
    ) -> np.ndarray:
        """
        Returns the item IDs that none of the specified group members have interacted with.

        Used to keep recommendations focused on items that are new to the
        whole group. Members the reader cannot map to an internal user ID
        are ignored.

        Args:
            data: The original dataset containing user-item interactions.
            item_ids: All available item IDs to consider (must be unique).
            members: A list of user IDs representing the group.

        Returns:
            np.ndarray: Item IDs that have not been interacted with by any member of the group.
        """
        consecutive_member_ids = [data.get_new_user_id(int(m)) for m in members]
        consecutive_member_ids = [m for m in consecutive_member_ids if m is not None]

        # Get all unique item IDs interacted with by users in the group
        interacted_item_ids = data.dataset.loc[
            data.dataset.userId.isin(consecutive_member_ids), "itemId"
        ].unique()

        # Use numpy set difference to get non-interacted item IDs
        return np.setdiff1d(item_ids, interacted_item_ids, assume_unique=True)

    def generate_recommendation(
        self,
        model: RecommenderModel,
        member: Union[str, int],
        item_pool: List[Union[str, int]],
        data: DataReader,
    ) -> Dict[int, float]:
        """
        Generate recommendations for a user based on the provided model.

        Args:
            model: A recommendation model that implements the RecommenderModel interface
            member: The ID of the user
            item_pool: List of item IDs to predict ratings/scores for
            data: The dataset containing user-item interactions

        Returns:
            A dictionary mapping original item IDs to predictions rescaled to
            the 1-5 range, ordered from highest to lowest. Empty when the
            user is unknown or no item of the pool is valid for the model.

        Raises:
            TypeError: If the model's predict does not return a list or array.
        """
        member = int(member)
        new_member_id = data.get_new_user_id(member)
        if new_member_id is None:
            return {}  # Return empty predictions for this user

        # Second layer of protection: drop item IDs the model cannot index,
        # in case the caller did not filter the pool already.
        max_valid_item_id = self._get_max_valid_item_id(model)
        if isinstance(item_pool, np.ndarray):
            item_pool = item_pool.astype(int)
            item_pool = item_pool[(item_pool >= 0) & (item_pool < max_valid_item_id)]
        elif isinstance(item_pool, list):
            item_pool = [int(item) for item in item_pool if 0 <= int(item) < max_valid_item_id]

        if len(item_pool) == 0:
            print(f"No valid items found for user {new_member_id}. Returning empty predictions.")
            return {}  # Return empty predictions if no valid items

        raw_predictions = model.predict(new_member_id, item_pool)  # type: ignore
        if not isinstance(raw_predictions, (list, np.ndarray)):
            raise TypeError(
                f"Model's predict function returned an unexpected type: {type(raw_predictions)}"
            )

        # Apply scaling to normalize predictions to 1-5 range
        scaled_linear = Scale.linear(
            np.array(raw_predictions),
            target_min=1,
            target_max=5,
        )

        # Convert the scaled predictions into a dictionary with original item IDs as keys
        predictions = {}
        for item, scaled_pred in zip(item_pool, scaled_linear):
            item_original_id = data.get_original_item_id(int(item))
            if item_original_id is not None:
                predictions[int(item_original_id)] = scaled_pred  # type: ignore

        # Sort the predictions in descending order of scores
        return dict(sorted(predictions.items(), key=lambda item: item[1], reverse=True))

    def get_group_recommendations(
        self, top_k: Optional[int] = None
    ) -> Union[int, List[int]]:
        """
        Final Recommendation List: Get recommendations for the group based on aggregated scores.

        Args:
            top_k: The number of recommendations to return.
                If None, returns all recommendations sorted by score.
                If 1, returns only the top recommendation as a single item ID.
                If > 1, returns the top k recommendations as a list of item IDs.

        Returns:
            If top_k is 1, a single item ID (None when there are no
            candidates). Otherwise, a list of item IDs.

        Raises:
            ValueError: If setup_recommendation has not been called.
        """
        if self._aggregated_scores is None:
            raise ValueError(
                "You must call setup_recommendation before getting recommendations"
            )

        # The aggregated scores are stored best-first, so key order is the ranking.
        ranked_ids = list(self._aggregated_scores)

        if top_k is None:
            return ranked_ids
        if top_k == 1:
            return ranked_ids[0] if ranked_ids else None  # type: ignore
        return ranked_ids[:top_k]

    def get_top_recommendation(self) -> int:
        """
        Get the top recommendation for the group.

        The result is cached until the next setup_recommendation call.

        Returns:
            The item ID with the highest aggregated score across all group members.
        """
        if self._top_recommendation is None:
            self._top_recommendation = self.get_group_recommendations(top_k=1)
        return self._top_recommendation  # type: ignore

    def get_recommendation_scores(self) -> Dict[int, float]:
        """
        Get the aggregated scores for all items across the group.

        Returns:
            A copy of the dictionary with item IDs as keys and their aggregated scores as values.

        Raises:
            ValueError: If setup_recommendation has not been called.
        """
        if self._aggregated_scores is None:
            raise ValueError(
                "You must call setup_recommendation before getting recommendation scores"
            )
        return self._aggregated_scores.copy()

    def get_aggregation_strategy(self) -> Optional[AggregationStrategy]:
        """
        Get the current aggregation strategy.

        Returns:
            The aggregation strategy being used, or None if not set.
        """
        return self._aggregation_strategy

    def get_group_members(self) -> Optional[List[Union[str, int]]]:
        """
        Get the current group members.

        Returns:
            A copy of the list of group member IDs, or None if not set.
        """
        return self._members.copy() if self._members else None

    def get_individual_predictions(
        self,
    ) -> Optional[Dict[Union[str, int], Dict[int, float]]]:
        """
        Get the individual predictions for all group members.

        Returns:
            A shallow copy of the dictionary mapping user IDs to their
            individual predictions, or None if not available.
        """
        return self._group_predictions.copy() if self._group_predictions else None
+57
View File
@@ -0,0 +1,57 @@
import pandas as pd
from typing import Optional
from .generic_recommender import GenericRecommender
class Recommender(GenericRecommender):
    """Recommender that scores candidate items with ``model.predict``."""

    def __init__(self, dataset_metadata, model, top_n: int = 10):
        super().__init__(dataset_metadata, model, top_n)

    def get_predictions(
        self,
        user_id: int,
        target_item_id: list,
    ):
        """Return the model's raw predictions for ``user_id`` on ``target_item_id``."""
        return self.model.predict(user_id, target_item_id)

    def recommend(self, user_id: int, target_item_id: list):
        """
        Generate recommendations on specific itemId and userId

        :param user_id: int, user Id
        :param target_item_id: list, item Ids
        :return: data.frame [userId, itemId, rank], recommendations ranking for the specified pairs of userId and itemId.
        """
        scores = self.get_predictions(user_id, target_item_id)
        return self.rank_prediction(user_id, target_item_id, scores)

    def recommend_user(
        self, user_id: Optional[int] = None, user_ratings: Optional[pd.DataFrame] = None
    ):
        """
        Get recommendations for a user.

        :param user_id: int, a user Id
        :param user_ratings: dataframe, interactions of the user; looked up from the dataset when omitted
        :return: dataframe [userId, itemId, rank], recommendations ranking for the specified userId.
        """
        if user_ratings is not None:
            # Ratings were supplied, but the user they belong to is still required.
            if user_id is None:
                raise ValueError(
                    "Could not determine user_id from the provided user_ratings."
                )
        else:
            if user_id is None:
                raise ValueError("Either 'user_id' or 'user_ratings' must be provided.")
            user_ratings = self.get_rated(user_id=user_id)
            if user_ratings is None:
                # Return empty recommendations
                return pd.DataFrame(columns=["userId", "itemId", "rank"])

        candidates = self.get_unrated(user_ratings["itemId"])
        return self.recommend(user_id=user_id, target_item_id=candidates)
View File
+17
View File
@@ -0,0 +1,17 @@
from .aggregation_strategy import AggregationStrategy
from .association_rules import AssociationRules
from .scale import Scale
from .sliding_window import SlidingWindow
from .emp_loss import EMFLoss
from .explanation_diversity import calculate_gild_for_explanations
from .sliding_window_ranker import SlidingWindowRanker
__all__ = [
"AggregationStrategy",
"AssociationRules",
"Scale",
"EMFLoss",
"calculate_gild_for_explanations",
"SlidingWindowRanker",
"SlidingWindow",
]
+210
View File
@@ -0,0 +1,210 @@
import numpy as np
from typing import Dict, List, Union, Optional, TypeAlias
from enum import Enum
# Type aliases for better readability.
# Identifiers may be strings or integers; scores are plain floats.
UserID: TypeAlias = Union[str, int]
ItemID: TypeAlias = Union[str, int]
EvaluationScore: TypeAlias = float
AggregatedScore: TypeAlias = float
# Main data structure types
# user -> {item -> that user's score for the item}
UserEvaluations: TypeAlias = Dict[UserID, Dict[ItemID, EvaluationScore]]
# user -> list of items in that user's ranking order
UserRankings: TypeAlias = Dict[UserID, List[ItemID]]
# item -> score aggregated over the group
AggregatedScores: TypeAlias = Dict[ItemID, AggregatedScore]
class AggregationStrategy(Enum):
    """Strategies for combining the scores of group members into one score."""

    # --- strategies operating on individual predictions ---
    AVG_PREDICTIONS = "avg_predictions"
    LEAST_MISERY = "least_misery"
    MOST_PLEASURE = "most_pleasure"
    MOST_RESPECTED_PERSON = "most_respected_person"

    # --- strategies operating on individual preferences ---
    ADDITIVE_UTILITARIAN = "additive_utilitarian"
    MULTIPLICATIVE = "multiplicative"
    BORDA_COUNT = "borda_count"
class ScoreAggregator:
    """
    A class for aggregating individual predictions or preferences into collective scores.

    Supports two main approaches:
    1. Individual Predictions: AVG, LM, MP, MRP
    2. Individual Preferences: AVG, ADD, MUL, BRC

    Felfernig, A., Boratto, L., Stettinger, M., Tkalčič, M.: Group Recommender Systems:
    An Introduction. Springer Publishing Company, Incorporated, 1st edn. (2018)
    """

    def __init__(self, most_respected_person: Optional[UserID] = None):
        """
        Initialize the ScoreAggregator.

        Args:
            most_respected_person: User ID of the most respected person (required for MRP strategy)
        """
        self.most_respected_person = most_respected_person

    def aggregate_scores(
        self,
        evaluations: UserEvaluations,
        strategy: AggregationStrategy,
        rankings: Optional[UserRankings] = None,
    ) -> AggregatedScores:
        """
        Aggregate individual evaluations into collective scores.

        Args:
            evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
            strategy: Aggregation strategy to use
            rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)

        Returns:
            Dictionary mapping item_id -> aggregated_score. Empty when no
            user evaluated any item.

        Raises:
            ValueError: If the strategy is unknown, Borda Count is requested
                without rankings, or the most respected person is missing.
        """
        if not evaluations:
            return {}

        # Get all items across all users
        all_items: set[ItemID] = set()
        for user_evals in evaluations.values():
            all_items.update(user_evals.keys())
        if not all_items:
            return {}

        if strategy == AggregationStrategy.BORDA_COUNT:
            if rankings is None:
                raise ValueError("Rankings required for Borda Count strategy")
            return self._borda_scores(rankings, all_items)

        # Resolve the per-item scorer once instead of once per item.
        scorers = {
            AggregationStrategy.AVG_PREDICTIONS: self._avg_predictions,
            AggregationStrategy.LEAST_MISERY: self._least_misery,
            AggregationStrategy.MOST_PLEASURE: self._most_pleasure,
            AggregationStrategy.MOST_RESPECTED_PERSON: self._most_respected_person,
            AggregationStrategy.ADDITIVE_UTILITARIAN: self._additive_utilitarian,
            AggregationStrategy.MULTIPLICATIVE: self._multiplicative,
        }
        scorer = scorers.get(strategy)
        if scorer is None:
            raise ValueError(f"Unknown aggregation strategy: {strategy}")
        return {item: scorer(evaluations, item) for item in all_items}

    def get_top_recommendation(
        self,
        evaluations: UserEvaluations,
        strategy: AggregationStrategy,
        rankings: Optional[UserRankings] = None,
    ) -> ItemID:
        """
        Get the top recommended item based on aggregated scores.

        Args:
            evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
            strategy: Aggregation strategy to use
            rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)

        Returns:
            Item ID with highest aggregated score

        Raises:
            ValueError: If there is no item to choose from.
        """
        aggregated_scores = self.aggregate_scores(evaluations, strategy, rankings)
        if not aggregated_scores:
            raise ValueError("No evaluated items available to recommend")
        return max(aggregated_scores.items(), key=lambda x: x[1])[0]

    @staticmethod
    def _item_scores(evaluations: UserEvaluations, item: ItemID) -> List[EvaluationScore]:
        """Collect the scores of ``item`` from every user who evaluated it."""
        return [
            user_evals[item]
            for user_evals in evaluations.values()
            if item in user_evals
        ]

    @staticmethod
    def _borda_scores(rankings: UserRankings, items) -> AggregatedScores:
        """Borda scores for all ``items``, indexing each ranking only once."""
        # Per user: ranking length and the first position of every ranked
        # item. This avoids one linear list search per (item, user) pair.
        tables = []
        for user_ranking in rankings.values():
            positions = {}
            for position, ranked_item in enumerate(user_ranking):
                positions.setdefault(ranked_item, position)
            tables.append((len(user_ranking), positions))

        scores: AggregatedScores = {}
        for item in items:
            total_score = 0.0
            for size, positions in tables:
                position = positions.get(item)
                if position is not None:
                    total_score += size - position - 1
            scores[item] = total_score
        return scores

    def _avg_predictions(
        self, evaluations: UserEvaluations, item: ItemID
    ) -> AggregatedScore:
        """Average of item-specific evaluations."""
        item_evals = self._item_scores(evaluations, item)
        return np.mean(item_evals) if item_evals else 0.0  # type: ignore

    def _least_misery(
        self, evaluations: UserEvaluations, item: ItemID
    ) -> AggregatedScore:
        """Minimum item-specific evaluation."""
        item_evals = self._item_scores(evaluations, item)
        return min(item_evals) if item_evals else 0.0

    def _most_pleasure(
        self, evaluations: UserEvaluations, item: ItemID
    ) -> AggregatedScore:
        """Maximum item-specific evaluation."""
        item_evals = self._item_scores(evaluations, item)
        return max(item_evals) if item_evals else 0.0

    def _most_respected_person(
        self, evaluations: UserEvaluations, item: ItemID
    ) -> AggregatedScore:
        """Item-evaluations of most respected user (0.0 if they did not evaluate the item)."""
        if self.most_respected_person is None:
            raise ValueError("Most respected person not specified")
        if self.most_respected_person not in evaluations:
            raise ValueError(
                f"Most respected person '{self.most_respected_person}' not found in evaluations"
            )
        return evaluations[self.most_respected_person].get(item, 0.0)

    def _avg_preferences(
        self, evaluations: UserEvaluations, item: ItemID
    ) -> AggregatedScore:
        """Average of item-specific evaluations (same as avg_predictions)."""
        return self._avg_predictions(evaluations, item)

    def _additive_utilitarian(
        self, evaluations: UserEvaluations, item: ItemID
    ) -> AggregatedScore:
        """Sum of item-specific evaluations."""
        return sum(self._item_scores(evaluations, item))

    def _multiplicative(
        self, evaluations: UserEvaluations, item: ItemID
    ) -> AggregatedScore:
        """Multiplication of item-specific evaluations (0.0 if nobody evaluated the item)."""
        item_evals = self._item_scores(evaluations, item)
        if not item_evals:
            return 0.0
        result = 1.0
        for eval_score in item_evals:
            result *= eval_score
        return result

    def _borda_count(self, rankings: UserRankings, item: ItemID) -> AggregatedScore:
        """Sum of item-specific scores derived from item ranking."""
        total_score = 0.0
        for user_ranking in rankings.values():
            if item in user_ranking:
                # Earlier positions earn more points: the first item of a
                # ranking of length n gets n - 1, the last one gets 0.
                position = user_ranking.index(item)
                total_score += len(user_ranking) - position - 1
        return total_score
+255
View File
@@ -0,0 +1,255 @@
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pandas as pd
from pygrex.data_reader.data_reader import DataReader
from typing import List, Optional, Union
class AssociationRules:
    """Mine association rules from user-item ratings with FP-Growth.

    Ratings at or above ``rating_threshold`` count as positive interactions.
    Each user's positively rated items form one transaction; frequent
    itemsets are mined from those transactions with FP-Growth and converted
    into association rules filtered by confidence.
    """

    # Columns returned by get_recommendations_for_items(). Also used to shape
    # the empty result so callers always see the same schema.
    _RECOMMENDATION_COLUMNS = [
        "antecedents",
        "consequents",
        "confidence",
        "lift",
        "support",
    ]

    def __init__(
        self,
        data: DataReader,
        min_support: float = 0.2,
        min_confidence: float = 0.2,
        rating_threshold: float = 4.0,
    ) -> None:
        """Initialize the association rules miner with data and parameters.

        Args:
            data: Object exposing the interactions as ``data.dataset`` with
                ``userId``, ``itemId`` and ``rating`` columns.
            min_support: Minimum support for frequent itemsets, in (0, 1].
            min_confidence: Minimum confidence for rules, in (0, 1].
            rating_threshold: Minimum rating for an interaction to count as
                positive. Must be non-negative.

        Raises:
            ValueError: If any threshold is outside its valid range.
        """
        self._validate_parameters(min_support, min_confidence, rating_threshold)

        self.data = data
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.rating_threshold = rating_threshold
        self._frequent_itemsets: Optional[pd.DataFrame] = None
        self._association_rules: Optional[pd.DataFrame] = None

    def _validate_parameters(
        self, min_support: float, min_confidence: float, rating_threshold: float
    ) -> None:
        """Validate initialization parameters.

        Raises:
            ValueError: If ``min_support`` or ``min_confidence`` is not in
                (0, 1], or ``rating_threshold`` is negative.
        """
        if not (0 < min_support <= 1):
            raise ValueError("min_support must be between 0 and 1")
        if not (0 < min_confidence <= 1):
            raise ValueError("min_confidence must be between 0 and 1")
        if rating_threshold < 0:
            raise ValueError("rating_threshold must be non-negative")

    def get_df_filtered_by_rating_threshold(self) -> pd.DataFrame:
        """Return the interactions whose rating reaches ``rating_threshold``.

        Raises:
            ValueError: If no interaction reaches the threshold.
        """
        dataset = self.data.dataset
        # Boolean indexing already yields a new frame, so the full dataset
        # does not need to be copied first.
        df_filtered = dataset[dataset["rating"] >= self.rating_threshold]

        if df_filtered.empty:
            raise ValueError(
                f"No interactions found with rating >= {self.rating_threshold}"
            )
        return df_filtered

    def _prepare_transactions(self) -> List[List[str]]:
        """Build one transaction per user from the positive interactions.

        Returns:
            A list of transactions; each is the list of item IDs (as strings)
            that one user rated at or above the threshold.
        """
        df_filtered = self.get_df_filtered_by_rating_threshold()
        per_user_items = df_filtered.groupby("userId")["itemId"].apply(list).tolist()
        # Item IDs are stringified so they compare consistently with the
        # stringified IDs used in get_recommendations_for_items().
        return [[str(item) for item in user_items] for user_items in per_user_items]

    def _mine_frequent_itemsets(
        self, transactions: List[List[Union[str, int]]]
    ) -> pd.DataFrame:
        """Mine frequent itemsets using the FP-Growth algorithm.

        Args:
            transactions: List of transactions to mine.

        Returns:
            DataFrame of frequent itemsets with their support values.

        Raises:
            ValueError: If no itemset reaches ``min_support``.
        """
        # One-hot encode the transactions: one row per transaction, one
        # column per item.
        transaction_encoder = TransactionEncoder()
        transaction_matrix = transaction_encoder.fit_transform(transactions)
        df_encoded = pd.DataFrame(
            transaction_matrix,  # type: ignore
            columns=transaction_encoder.columns_,
        )

        frequent_itemsets = fpgrowth(
            df_encoded, min_support=self.min_support, use_colnames=True
        )
        if frequent_itemsets.empty:
            raise ValueError(
                f"No frequent itemsets found with min_support={self.min_support}"
            )
        return frequent_itemsets

    def _generate_association_rules(
        self, frequent_itemsets: pd.DataFrame
    ) -> pd.DataFrame:
        """Generate association rules from frequent itemsets.

        Args:
            frequent_itemsets: DataFrame containing frequent itemsets.

        Returns:
            DataFrame of association rules with their metrics.

        Raises:
            ValueError: If no rule reaches ``min_confidence``.
        """
        rules = association_rules(
            frequent_itemsets, metric="confidence", min_threshold=self.min_confidence
        )
        if rules.empty:
            raise ValueError(
                f"No association rules found with min_confidence={self.min_confidence}"
            )
        return rules

    def compute(self) -> pd.DataFrame:
        """Run the full mining pipeline and return the association rules.

        Steps: build transactions, mine frequent itemsets with FP-Growth,
        then derive association rules from those itemsets.

        Returns:
            DataFrame of association rules.

        Raises:
            ValueError: If the dataset is empty, no transactions meet the
                criteria, or no itemsets/rules can be generated with the
                given parameters.
        """
        # Drop results of any previous run so a failure below cannot leave
        # rules and itemsets from different runs behind.
        self._frequent_itemsets = None
        self._association_rules = None

        if self.data.dataset.empty:
            raise ValueError("Dataset is empty")

        transactions = self._prepare_transactions()
        if not transactions:
            raise ValueError("No transactions found after filtering")

        frequent_itemsets = self._mine_frequent_itemsets(transactions)  # type: ignore
        rules = self._generate_association_rules(frequent_itemsets)

        self._frequent_itemsets = frequent_itemsets
        self._association_rules = rules
        return rules

    def get_frequent_itemsets(self) -> Optional[pd.DataFrame]:
        """Return the frequent itemsets of the last successful compute().

        Returns:
            The itemsets DataFrame, or None if compute() has not succeeded.
        """
        return self._frequent_itemsets

    def get_recommendations_for_items(
        self, items: List[Union[str, int]], top_k: int = 10
    ) -> pd.DataFrame:
        """Return the most confident rules whose antecedent covers ``items``.

        A rule is selected when every given item (compared as a string) is
        part of the rule's antecedent.

        Args:
            items: Item IDs to look up.
            top_k: Maximum number of rules to return. Default is 10.

        Returns:
            DataFrame with columns antecedents, consequents, confidence,
            lift and support, sorted by descending confidence. It has the
            same columns and no rows when no rule matches.

        Raises:
            RuntimeError: If compute() hasn't been called yet.
            ValueError: If items list is empty.
        """
        if self._association_rules is None:
            raise RuntimeError("Must call compute() before getting recommendations")
        if not items:
            raise ValueError("Items list cannot be empty")

        wanted = {str(item) for item in items}
        rules = self._association_rules
        covers_items = rules["antecedents"].apply(
            lambda antecedent: wanted.issubset({str(entry) for entry in antecedent})
        )
        matching_rules = rules[covers_items]

        if matching_rules.empty:
            # Keep the schema of the non-empty result.
            return pd.DataFrame(columns=self._RECOMMENDATION_COLUMNS)

        recommendations = matching_rules.nlargest(top_k, "confidence")
        return recommendations[self._RECOMMENDATION_COLUMNS]

    def __str__(self) -> str:
        """Return string representation of the AssociationRules object."""
        return (
            f"AssociationRules(min_support={self.min_support}, "
            f"min_confidence={self.min_confidence}, "
            f"rating_threshold={self.rating_threshold})"
        )

    def __repr__(self) -> str:
        """Return the same text as ``__str__``."""
        return self.__str__()
+17
View File
@@ -0,0 +1,17 @@
import torch
class EMFLoss(torch.nn.Module):
    """Squared rating error with norm penalties and an explainability term.

    Per sample the loss is the squared rating error, plus
    ``reg_term * ||u||_2`` and ``reg_term * ||v||_2``, plus
    ``expl_reg_term * ||u - v||_1 * expl``. Norms are taken over the last
    dimension and the result is averaged over all samples.
    """

    def __init__(self):
        super().__init__()

    def forward(self, ratings_pred, ratings, u, v, reg_term, expl, expl_reg_term):
        """Return the mean loss as a scalar tensor.

        NOTE(review): presumably ``u`` and ``v`` are per-sample embeddings of
        shape (batch, dim) and ``expl`` is a per-sample weight; confirm
        against the caller.
        """
        # Predictions are flattened so they line up with ``ratings``.
        squared_error = (ratings - ratings_pred.view(-1)).pow(2)
        user_penalty = reg_term * torch.norm(u, p=2, dim=-1)
        item_penalty = reg_term * torch.norm(v, p=2, dim=-1)
        explainability = expl_reg_term * torch.norm(u - v, p=1, dim=-1) * expl

        per_sample = squared_error + user_penalty + item_penalty + explainability
        return torch.mean(per_sample)
+80
View File
@@ -0,0 +1,80 @@
from itertools import combinations
import numpy as np
def _get_explanation_feature_set(explanation, explainer_type, details=None):
"""Helper to extract a consistent feature set from different explanation types."""
if explainer_type == "Sliding Window":
return set(explanation.get("items", []))
elif explainer_type == "EXPGRS":
if details is not None:
return set(details.get("antecedent", frozenset()))
else:
return set()
elif explainer_type == "LORE4Groups":
rules_data = explanation.get("group_factual_rule", {})
if isinstance(rules_data, dict):
return set(
rule for tier_rules in rules_data.values() for rule in tier_rules
)
elif isinstance(rules_data, list):
return set(rules_data)
return set()
def calculate_gild_for_explanations(explanations_dict, explainer_type, use_median=True):
    """Calculate Gaussian Inter-List Diversity (GILD) for a set of explanations.

    Each explanation is reduced to a feature set, pairwise Jaccard distances
    between the non-empty sets are passed through a Gaussian kernel, and the
    mean kernel distance over all pairs is returned.

    Args:
        explanations_dict: Mapping to one explanation per key. For "EXPGRS"
            each value is a list of rules and only the first rule is used.
        explainer_type: "EXPGRS", "Sliding Window" or "LORE4Groups"; any
            other value gives no feature sets.
        use_median: Base the kernel bandwidth on the median pairwise
            distance when True, on the minimum otherwise.

    Returns:
        The GILD value, or 0.0 when fewer than two non-empty feature sets
        are available.
    """
    if not explanations_dict or len(explanations_dict) < 2:
        return 0.0

    if explainer_type == "EXPGRS":
        candidate_sets = [
            _get_explanation_feature_set(None, explainer_type, details=rules[0])
            for rules in explanations_dict.values()
            if rules
        ]
    elif explainer_type in ("Sliding Window", "LORE4Groups"):
        candidate_sets = [
            _get_explanation_feature_set(payload, explainer_type)
            for payload in explanations_dict.values()
        ]
    else:
        candidate_sets = []

    # Empty feature sets carry no information and are dropped.
    feature_sets = [features for features in candidate_sets if features]
    if len(feature_sets) < 2:
        return 0.0

    # Pairwise Jaccard distances
    distances = []
    for left, right in combinations(feature_sets, 2):
        shared = len(left & right)
        total = len(left | right)
        distances.append(1.0 - (shared / total) if total > 0 else 1.0)

    pair_count = len(distances)
    reference_dist = np.median(distances) if use_median else min(distances)

    # Bandwidth: reference distance scaled by sqrt(2 * ln(pairs - 1)).
    if pair_count > 1:
        denominator = np.sqrt(2 * np.log(pair_count - 1))
    else:
        denominator = 1.0
    sigma = reference_dist / denominator if denominator > 0 else reference_dist
    if sigma == 0:
        # Avoid dividing by zero inside the kernel.
        sigma = 1e-9

    kernel_total = 0.0
    for dist in distances:
        kernel_total += np.sqrt(2 - 2 * np.exp(-(dist**2) / (2 * sigma**2)))

    return kernel_total / pair_count
+138
View File
@@ -0,0 +1,138 @@
from typing import List, Union, Optional
import numpy as np
from scipy import stats
class Scale:
    """
    A class for scaling numerical values using different methods.

    Methods:
        quantile: Scale values using quantile-based ranking.
        linear: Scale values linearly to a target range with outlier handling.
    """

    @staticmethod
    def quantile(
        raw_predictions: Union[List[float], np.ndarray],
        target_min: float = 1,
        target_max: float = 5,
    ) -> np.ndarray:
        """
        Scale raw predictions to the target range using quantile-based ranking.

        Values are replaced by their average rank, and ranks are spread evenly
        over [target_min, target_max]. A single value maps to the middle of
        the target range.

        Args:
            raw_predictions: The raw prediction values.
            target_min: Minimum of the target range (default: 1).
            target_max: Maximum of the target range (default: 5).

        Returns:
            numpy.ndarray: Scaled predictions.

        Raises:
            ValueError: If raw_predictions is empty.
        """
        if len(raw_predictions) == 0:
            raise ValueError("Raw predictions array is empty.")

        raw_predictions = np.asarray(raw_predictions)
        count = len(raw_predictions)

        if count == 1:
            scaled_predictions = np.array([(target_min + target_max) / 2])
        else:
            ranks = stats.rankdata(raw_predictions, method="average")
            scaled_predictions = target_min + (ranks - 1) * (
                target_max - target_min
            ) / (count - 1)

        return np.clip(scaled_predictions, target_min, target_max)

    @staticmethod
    def linear(
        raw_predictions: Union[List[float], np.ndarray],
        target_min: float = 1,
        target_max: float = 5,
        ref_min: Optional[float] = None,
        ref_max: Optional[float] = None,
        handle_outliers: bool = True,
    ) -> np.ndarray:
        """
        Scale raw predictions to the target range [target_min, target_max].

        Args:
            raw_predictions: The raw prediction values.
            target_min: Minimum of the target range (default: 1).
            target_max: Maximum of the target range (default: 5).
            ref_min: Reference minimum for raw predictions. If None, it is
                taken from the data, after IQR clipping when
                handle_outliers=True.
            ref_max: Reference maximum for raw predictions. If None, it is
                taken from the data, after IQR clipping when
                handle_outliers=True.
            handle_outliers: Whether to derive the data bounds from values
                clipped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (default: True).

        Returns:
            numpy.ndarray: Scaled predictions as floats. When the reference
            range is degenerate every value maps to the middle of the target
            range.

        Raises:
            ValueError: If raw_predictions is empty.
        """
        if len(raw_predictions) == 0:
            raise ValueError("Raw predictions array is empty.")

        raw_predictions = np.asarray(raw_predictions)
        midpoint = (target_min + target_max) / 2

        # A single value has no range of its own: it can only be scaled
        # against an explicit reference range.
        if len(raw_predictions) == 1:
            if ref_min is None or ref_max is None or ref_max == ref_min:
                return np.array([midpoint])
            scaled_value = target_min + (raw_predictions[0] - ref_min) * (
                target_max - target_min
            ) / (ref_max - ref_min)
            return np.array([np.clip(scaled_value, target_min, target_max)])

        bounds_source = raw_predictions
        if handle_outliers:
            q1, q3 = np.percentile(raw_predictions, [25, 75])
            iqr = q3 - q1
            bounds_source = np.clip(raw_predictions, q1 - 1.5 * iqr, q3 + 1.5 * iqr)

        # Explicit reference bounds win over bounds derived from the data.
        actual_ref_min = ref_min if ref_min is not None else np.min(bounds_source)
        actual_ref_max = ref_max if ref_max is not None else np.max(bounds_source)

        if actual_ref_max == actual_ref_min:
            # Build a float array explicitly: np.full_like would inherit an
            # integer input dtype and truncate a fractional midpoint.
            return np.full(raw_predictions.shape, midpoint, dtype=float)

        # The raw values are scaled, not the clipped ones, so outliers
        # saturate at the ends of the target range through the final clip.
        scaled_predictions = target_min + (raw_predictions - actual_ref_min) * (
            target_max - target_min
        ) / (actual_ref_max - actual_ref_min)
        return np.clip(scaled_predictions, target_min, target_max)
+90
View File
@@ -0,0 +1,90 @@
from typing import List, Optional, TypeVar, Generic, Iterator
T = TypeVar("T")


class SlidingWindow(Generic[T]):
    """Fixed-size window that slides over a sequence one position at a time.

    Windows can be pulled one by one with ``get_next_window`` or consumed by
    iterating over the object, which restarts from the first window.
    """

    def __init__(self, sequence: List[T], window_size: int):
        """Set up the window over ``sequence``.

        Args:
            sequence: The items to slide over.
            window_size: Number of items per window (at least 1).

        Raises:
            ValueError: If window_size is less than 1.
            TypeError: If sequence has no ``__iter__``.
        """
        if window_size < 1:
            raise ValueError("Window size must be at least 1")
        if not hasattr(sequence, "__iter__"):
            raise TypeError("Sequence must be iterable")

        self.sequence = sequence
        self.window_size = window_size
        self.index = 0
        # One past the last valid start position; not positive when the
        # sequence is shorter than one window.
        if sequence:
            self.max_index = len(sequence) - window_size + 1
        else:
            self.max_index = 0

    def get_next_window(self) -> Optional[List[T]]:
        """Return the window at the current position and move one step on.

        Returns:
            The next window, or None once every window has been returned.
        """
        start = self.index
        if start >= self.max_index:
            return None
        window = self.sequence[start : start + self.window_size]
        self.index = start + 1
        return window

    def reset(self) -> None:
        """Move back to the first window."""
        self.index = 0

    def has_next(self) -> bool:
        """Return True while at least one more window is available."""
        return self.max_index > self.index

    def __iter__(self) -> Iterator[List[T]]:
        """Restart from the first window and return self as the iterator."""
        self.reset()
        return self

    def __next__(self) -> List[T]:
        """Return the next window.

        Raises:
            StopIteration: When all windows have been returned.
        """
        window = self.get_next_window()
        if window is not None:
            return window
        raise StopIteration

    def __len__(self) -> int:
        """Return the number of complete windows in the sequence."""
        return self.max_index if self.max_index > 0 else 0
+631
View File
@@ -0,0 +1,631 @@
import operator
from typing import Any, Dict, List, Union, Optional
import numpy as np
import pandas as pd
from scipy.signal import (
find_peaks,
peak_widths,
)
from pygrex.data_reader import DataReader
class SlidingWindowRanker:
"""
Stratigi, M., Bikakis, N., Stefanidis, K.: Counterfactual explanations for group
recommendations. In: Proceedings of the 27th International Workshop on Design,
Optimization, Languages and Analytical Processing of Big Data (DOLAP 2025)
"""
def __init__(self, config: Dict[str, Any]):
"""
Initialize the SlidingWindowRanker.
Args:
config: Configuration parameters for the evaluator
"""
self.config = config
self.group_predictions: Optional[
Dict[Union[str, int], Dict[Union[str, int], float]]
] = None
self.top_recommendation: Optional[Union[str, int]] = None
def set_group_recommender_values(
self,
group_predictions: Dict[Union[str, int], Dict[Union[str, int], float]],
top_recommendation: Union[str, int],
) -> None:
"""
Set group recommender values.
Args:
group_predictions: Dictionary mapping user IDs to their item predictions
top_recommendation: List of top recommended items for the group
"""
self.group_predictions = group_predictions
self.top_recommendation = top_recommendation
def evaluate(self, data: DataReader) -> Dict[str, Any]:
    """
    Placeholder for evaluating the data; not implemented yet.

    Args:
        data: DataReader object containing dataset and transformation methods
            (currently unused)

    Returns:
        An empty dictionary until evaluation metrics are implemented
    """
    metrics: Dict[str, Any] = {}
    return metrics
def calculate_item_popularity_score(
    self, items: List[Union[str, int]], data: DataReader
) -> Dict[Union[str, int], float]:
    """
    Calculate the normalized popularity of each item based on the number of interactions received.

    Popularity is the number of dataset rows for the item. Counts are
    min-max normalized over ``items`` with 1% padding on each side, so
    scores fall strictly inside (0, 1). When all items have the same count
    every score is 0.

    Args:
        items: List of item IDs
        data: Data object exposing ``dataset`` and ``get_new_item_id``

    Returns:
        Dictionary with item IDs as keys and normalized popularity as values
    """
    # Count interactions per item in one pass over the dataset instead of
    # filtering the whole frame once per requested item.
    interactions_per_item = data.dataset["itemId"].value_counts().to_dict()

    popularity_counts = {}
    for item_id in items:
        internal_item_id = data.get_new_item_id(item_id)
        popularity_counts[item_id] = int(
            interactions_per_item.get(internal_item_id, 0)
        )

    if not popularity_counts:
        return {}

    min_count = min(popularity_counts.values())
    max_count = max(popularity_counts.values())

    # Widen the range by 2% in total (1% below the minimum, 1% above the
    # maximum) so the extreme items do not land exactly on 0 or 1.
    range_value = max_count - min_count
    padded_range = range_value + (range_value / 50)
    padded_min = min_count - (range_value / 100)
    if padded_range == 0:
        padded_range = 1  # Avoid division by zero

    return {
        item_id: (count - padded_min) / padded_range
        for item_id, count in popularity_counts.items()
    }
def calculate_relevance_mask(
self,
target_item_id: Union[str, int],
) -> Dict[Union[str, int], float]:
"""
Create a mapping between users and their prediction scores for a specific target item.
Args:
target_item_id :The ID of the item for which prediction scores are needed
Returns:
Dictionary mapping user IDs to their predicted scores for the target item
Note: Users without a prediction for the target item will have a value of 0
Examples
>>> user_preds = {'user1': {'item1': 4.5, 'item2': 3.2}, 'user2': {'item2': 2.8}}
>>> evaluator.set_group_recommender_values(user_preds,top_recommendation)
>>> evaluator.calculate_relevance_mask('item1')
{'user1': 4.5, 'user2': 0}
"""
if self.group_predictions is None:
raise ValueError(
"User predictions not set. Call set_group_recommender_values first."
)
individual_predictions = {}
for user_id, predictions in self.group_predictions.items():
# Get the prediction for the target item if it exists, otherwise default to 0
individual_predictions[user_id] = predictions.get(target_item_id, 0)
return individual_predictions
def calculate_relevance_score(
    self,
    item_id: Union[str, int],
    data: DataReader,
    prediction_scores: Dict[Union[str, int], float],
    members: List[Union[str, int]],
    rating_scale: tuple = (0, 5),  # Default rating scale
) -> float:
    """
    Calculate the normalized average prediction score for an item based on group members' predictions.

    Only members who have interacted with the item in the dataset and who
    have an entry in ``prediction_scores`` contribute to the average.

    Args:
        item_id: ID of the item to calculate relevance for
        data: DataReader object containing dataset and ID mapping methods
        prediction_scores: Dictionary mapping user IDs to a prediction score
        members: List of user IDs in the group
        rating_scale: Tuple indicating (min_rating, max_rating) for normalization

    Returns:
        Average score normalized against ``rating_scale`` with 1% padding
        on each side. Returns 0.0 if no member contributes a score or the
        rating scale has zero width.
    """
    internal_item_id = data.get_new_item_id(item_id)

    # The set of users who interacted with this item does not depend on the
    # member being checked, so it is selected once outside the loop.
    dataset = data.dataset
    item_user_ids = dataset.loc[dataset["itemId"] == internal_item_id, "userId"]

    total_score = 0.0
    valid_users_count = 0
    for user_id in members:
        # Only integer IDs are mapped to the internal format; any other ID
        # is compared against the dataset as given.
        internal_user_id = (
            data.get_new_user_id(int(user_id))
            if isinstance(user_id, (int, np.integer))
            else user_id
        )

        # Skip members who never interacted with the item.
        if not (item_user_ids == internal_user_id).any():
            continue

        if user_id in prediction_scores:
            total_score += prediction_scores[user_id]
            valid_users_count += 1

    if valid_users_count == 0:
        return 0.0

    average_score = total_score / valid_users_count

    # Normalize with 1% padding on each end of the rating scale.
    min_value, max_value = rating_scale
    range_value = max_value - min_value
    padded_range = range_value + (range_value / 50)
    padded_min = min_value - (range_value / 100)
    if padded_range == 0:
        return 0.0

    return float((average_score - padded_min) / padded_range)
def calculate_item_intensity_score(
    self, item_id: Union[str, int], members: List[Union[str, int]], data: DataReader
) -> float:
    """
    Calculate what proportion of group members have interacted with the specified item.

    Args:
        item_id: ID of the item to calculate interaction rate for
        members: List of user IDs in the group
        data: DataReader object containing dataset and ID mapping methods

    Returns:
        Proportion of group members who have interacted with the item (range [0,1])
        0 means no group members have interacted with the item
        1 means all group members have interacted with the item
        0.0 is also returned when ``data`` is None or ``members`` is empty
    """
    if data is None:
        print("Error: DataReader object is None. Cannot convert item_id.")
        return 0.0
    if not members:
        return 0.0  # Avoid division by zero if no members

    internal_item_id = data.get_new_item_id(item_id)
    internal_members = [data.get_new_user_id(user_id) for user_id in members]

    dataset = data.dataset
    member_rows = (dataset["itemId"] == internal_item_id) & dataset["userId"].isin(
        internal_members
    )
    # Count distinct users rather than rows, so a member with several
    # interactions on the item cannot push the proportion above 1.
    interacting_members = dataset.loc[member_rows, "userId"].nunique()

    return interacting_members / len(members)
def calculate_rating_score(
    self,
    item_id: Union[str, int],
    members: List[Union[str, int]],
    data: DataReader,
    rating_scale: tuple = (0, 5),
) -> float:
    """
    Calculate the normalized average rating given to an item by group members.

    Args:
        item_id: ID of the item to calculate average rating for
        members: List of user IDs in the group
        data: DataReader object containing dataset and ID mapping methods
        rating_scale: Tuple indicating (min_rating, max_rating) for normalization

    Returns:
        Average rating normalized against ``rating_scale`` with 1% padding
        on each side; 0 when ``data`` is None, the group is empty or the
        rating scale has zero width

    Notes:
        The rating sum is divided by the full group size, so members who
        have not rated the item pull the average down.
    """
    if data is None:
        print("Error: DataReader object is None. Cannot convert item_id.")
        return 0.0

    internal_item_id = data.get_new_item_id(item_id)
    internal_members = [data.get_new_user_id(member) for member in members]

    # Rows where one of the group members rated this item.
    is_item = data.dataset.itemId == internal_item_id
    is_member = data.dataset.userId.isin(internal_members)
    member_ratings = data.dataset[is_item & is_member]

    group_size = len(members)
    if group_size == 0:
        return 0  # Avoid division by zero if no members

    mean_rating = member_ratings["rating"].sum() / group_size

    # Pad the scale by 1% of its width at both ends before normalizing.
    low, high = rating_scale
    span = high - low
    padded_span = span + (span / 50)
    if padded_span == 0:
        return 0.0
    padded_low = low - (span / 100)

    return float((mean_rating - padded_low) / padded_span)
def calculate_trending_score(
    self,
    members: List[Union[str, int]],
    item_id: Union[str, int],
    data: Optional[DataReader] = None,
    peak_norm_min_height: float = 0.1,
    peak_norm_min_prominence: float = 0.05,
    peak_min_distance: int = 3,
    peak_width_rel_height: float = 0.5,
) -> tuple[float, Dict[Union[str, int], float], pd.DataFrame]:
    """
    Calculate how much of the group's rating activity on an item fell into its hype periods.

    The item's monthly rating counts are min-max normalized and searched for
    peaks. Each peak's width defines a hype period (start month to end
    month). A member's score is the share of their rating events on the item
    whose month lies inside any hype period; the group score averages the
    member scores, counting members without ratings as 0.

    Args:
        members: List of user IDs in the group
        item_id: ID of the item to calculate trending score for
        data: DataReader object containing dataset and ID mapping methods
        peak_norm_min_height: Minimum height of peaks in normalized data
        peak_norm_min_prominence: Minimum prominence of peaks in normalized data
        peak_min_distance: Minimum distance between peaks in months
        peak_width_rel_height: Relative height for peak width calculation

    Returns:
        tuple: (average_trending_score, individual_scores, hype_periods_for_item)
            average_trending_score: Average trending score across all group members
            individual_scores: Dictionary mapping the given user IDs to their scores
            hype_periods_for_item: DataFrame of detected hype periods, tagged
                with the ``item_id`` that was passed in
        On invalid input a message is printed and ``(0.0, {}, empty frame)``
        is returned. When the item has no ratings or no hype period, every
        member scores 0.0 and the frame is empty.
    """
    if not members:
        print("Error: No group members provided for trending score calculation.")
        return 0.0, {}, pd.DataFrame()

    if not isinstance(data, DataReader):
        if data is not None:
            print(
                f"Warning: data was provided but is not a DataReader object (type: {type(data)})."
            )
        print("Error: The DataFrame (_df) is empty. Cannot calculate score or plot.")
        return 0.0, {}, pd.DataFrame()

    _df = data.dataset.copy()
    if _df.empty:
        print("Error: The DataFrame (_df) is empty. Cannot calculate score or plot.")
        return 0.0, {}, pd.DataFrame()

    required_columns = ["userId", "itemId", "rating", "timestamp"]
    missing_columns = [col for col in required_columns if col not in _df.columns]
    if missing_columns:
        print(
            f"Error: Missing required columns in DataFrame: {', '.join(missing_columns)}"
        )
        return 0.0, {}, pd.DataFrame()

    # Derive the datetime and month columns unless usable ones already exist.
    # Any conversion failure is reported and treated as "no result".
    try:
        if "timestamp_dt" not in _df.columns or _df["timestamp_dt"].isnull().all():
            _df["timestamp_dt"] = pd.to_datetime(_df["timestamp"], unit="s")
        if "year_month" not in _df.columns or _df["year_month"].isnull().all():
            _df["year_month"] = _df["timestamp_dt"].dt.to_period("M")
    except Exception as e:
        print(f"Error during timestamp conversion or year-month extraction: {e}")
        return 0.0, {}, pd.DataFrame()

    internal_item_id = data.get_new_item_id(item_id)
    internal_members = [data.get_new_user_id(user_id) for user_id in members]
    no_trend_scores = {user_id: 0.0 for user_id in members}

    item_df = _df[_df["itemId"] == internal_item_id]
    if item_df.empty:
        return 0.0, no_trend_scores, pd.DataFrame()

    # Number of ratings the item received in each month.
    ratings_per_month = (
        item_df.groupby(["itemId", "year_month"], observed=False)
        .size()
        .reset_index(name="rating_count")
    )
    if ratings_per_month.empty:
        return 0.0, no_trend_scores, pd.DataFrame()

    group_sorted = ratings_per_month.sort_values("year_month").reset_index(drop=True)
    original_ratings = group_sorted["rating_count"].to_numpy()

    # Min-max normalize the monthly counts; a flat series becomes all zeros,
    # which also avoids dividing by zero.
    min_rating = np.min(original_ratings)
    max_rating = np.max(original_ratings)
    if max_rating > min_rating:
        normalized_ratings = (original_ratings - min_rating) / (
            max_rating - min_rating
        )
    else:
        normalized_ratings = np.zeros_like(original_ratings, dtype=float)

    peaks_indices, _ = find_peaks(
        normalized_ratings,
        height=peak_norm_min_height,
        distance=peak_min_distance,
        prominence=peak_norm_min_prominence,
    )
    if len(peaks_indices) == 0:
        return 0.0, no_trend_scores, pd.DataFrame()

    # The interpolated left/right crossing points of each peak, rounded and
    # clamped to valid row positions, delimit its hype period.
    _, _, left_ips, right_ips = peak_widths(
        normalized_ratings, peaks_indices, rel_height=peak_width_rel_height
    )
    last_row = len(group_sorted) - 1
    hype_periods_list = []
    for peak_idx, left_ip, right_ip in zip(peaks_indices, left_ips, right_ips):
        start_idx = max(0, int(round(left_ip)))
        end_idx = min(last_row, int(round(right_ip)))
        if start_idx > end_idx:
            continue
        hype_periods_list.append(
            {
                "itemId": item_id,
                "hype_start_month": group_sorted.iloc[start_idx]["year_month"],
                "hype_end_month": group_sorted.iloc[end_idx]["year_month"],
                "peak_month": group_sorted.iloc[peak_idx]["year_month"],
                "peak_rating_count_original": original_ratings[peak_idx],
                "peak_rating_count_normalized": normalized_ratings[peak_idx],
            }
        )

    if not hype_periods_list:
        return 0.0, no_trend_scores, pd.DataFrame()
    hype_periods_for_item = pd.DataFrame(hype_periods_list)

    # The returned frame is tagged with the caller's item_id, but the rating
    # rows carry the internal item ID. Join on a copy keyed by the internal
    # ID so the merge still matches when the two IDs differ.
    merge_periods = hype_periods_for_item.assign(itemId=internal_item_id)

    individual_scores = {}
    for member_id, internal_user_id in zip(members, internal_members):
        user_ratings = item_df[item_df["userId"] == internal_user_id]
        if user_ratings.empty:
            individual_scores[member_id] = 0.0
            continue

        # One row per (rating, hype period) pair.
        merged = pd.merge(user_ratings, merge_periods, on="itemId", how="left")
        merged["is_match"] = (
            (merged["year_month"] >= merged["hype_start_month"])
            & (merged["year_month"] <= merged["hype_end_month"])
            & merged["hype_start_month"].notna()
        )

        # A rating event is trending if it falls into at least one period.
        is_event_trending = merged.groupby(["userId", "itemId", "timestamp_dt"])[
            "is_match"
        ].any()
        total_unique_rating_events = len(is_event_trending)
        if total_unique_rating_events == 0:
            individual_scores[member_id] = 0.0
        else:
            individual_scores[member_id] = (
                is_event_trending.sum() / total_unique_rating_events
            )

    # Members without ratings for the item count as 0.0 in the average.
    average_trending_score = sum(
        individual_scores[user_id] for user_id in members
    ) / len(members)

    return average_trending_score, individual_scores, hype_periods_for_item
def generate_ranked_items(
    self,
    all_rated_items: List[Union[str, int]],
    data: DataReader,
    group_members: List[Union[str, int]],
    component_weights: Optional[Dict[str, float]] = None,
) -> tuple[List[Union[str, int]], Dict]:
    """
    Rank items for a group by a weighted sum of five score components.

    For every item the composite score is the weighted sum of:
    - Item popularity
    - Group preference intensity
    - Rating score
    - Relevance to the group
    - Trending score

    Args:
        all_rated_items: Items to rank. Each one must be a key of the mapping
            returned by ``calculate_item_popularity_score``.
        data: The DataReader object containing user-item interactions.
        group_members: List of user identifiers in the group.
        component_weights: Optional dictionary with weights for the components
            ("popularity", "intensity", "rating", "relevance", "trend").
            Any component missing from the dictionary falls back to the
            default weight of 1.0; unrelated keys are ignored.

    Returns:
        A tuple ``(ranked_item_ids, item_metric_details)``:
        - ``ranked_item_ids``: item IDs sorted by composite score, descending
          (ties keep the order in which the items were first seen).
        - ``item_metric_details``: maps each item ID to a dict with the keys
          "Popularity", "Intensity", "Rating", "Relevance", "Trend" and
          "Composite Score".

    Raises:
        ValueError: If ``self.group_predictions`` or ``self.top_recommendation``
            is None.
    """
    if self.group_predictions is None:
        raise ValueError(
            "User predictions not set. Call set_group_recommender_values first."
        )
    if self.top_recommendation is None:
        raise ValueError(
            "Top recommendation not set. Call set_group_recommender_values first."
        )

    # Start from the defaults and overlay whatever the caller supplied, so a
    # partial dictionary no longer fails with a KeyError inside the loop.
    default_weights = {
        "popularity": 1.0,
        "intensity": 1.0,
        "rating": 1.0,
        "relevance": 1.0,
        "trend": 1.0,
    }
    if component_weights is None:
        weights = default_weights
    else:
        weights = {**default_weights, **component_weights}

    item_scores = {}
    item_metric_details = {}

    # Both of these are independent of the individual item: compute them once.
    popularity_scores = self.calculate_item_popularity_score(all_rated_items, data)
    relevance_mask = self.calculate_relevance_mask(self.top_recommendation)

    for item_id in all_rated_items:
        # Calculate individual score components
        popularity_score = popularity_scores[item_id]
        intensity_score = self.calculate_item_intensity_score(
            item_id, group_members, data
        )
        rating_score = self.calculate_rating_score(item_id, group_members, data)
        relevance_score = self.calculate_relevance_score(
            item_id, data, relevance_mask, group_members
        )
        # Only the group-level score is needed here; the per-user scores and
        # the hype-period frame are discarded. The four trailing numbers are
        # fixed tuning arguments passed positionally; their meaning is defined
        # by calculate_trending_score's signature (not visible in this block).
        trending_score, _, _ = self.calculate_trending_score(
            group_members,
            item_id,
            data,
            0.3,
            0.2,
            9,
            0.6,
        )

        composite_score = (
            weights["popularity"] * popularity_score
            + weights["intensity"] * intensity_score
            + weights["rating"] * rating_score
            + weights["relevance"] * relevance_score
            + weights["trend"] * trending_score
        )

        item_metric_details[item_id] = {
            "Popularity": popularity_score,
            "Intensity": intensity_score,
            "Rating": rating_score,
            "Relevance": relevance_score,
            "Trend": trending_score,
            "Composite Score": composite_score,
        }
        item_scores[item_id] = composite_score

    # Sort items by score in descending order
    ranked_items = sorted(
        item_scores.items(), key=operator.itemgetter(1), reverse=True
    )

    # Return the sorted item IDs and the detailed metrics
    return [item_id for item_id, _ in ranked_items], item_metric_details
+55
View File
@@ -0,0 +1,55 @@
"""
Some handy functions for PyTorch model training ...
"""
import torch
from torch.optim import Optimizer
# Checkpoints
def save_checkpoint(model, model_dir):
    """Write the model's ``state_dict`` to ``model_dir`` using ``torch.save``.

    Args:
        model: Object exposing ``state_dict()``.
        model_dir: Destination handed straight to ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, model_dir)
def resume_checkpoint(model, model_dir, device_id):
    """Load a saved ``state_dict`` from ``model_dir`` into ``model`` in place.

    The checkpoint tensors are mapped to ``cuda:{device_id}`` when CUDA is
    available. On a machine without CUDA they are mapped to the CPU instead,
    so a checkpoint can still be restored there rather than failing inside
    ``torch.load``.

    Args:
        model: Module whose parameters are overwritten via ``load_state_dict``.
        model_dir: Checkpoint location handed straight to ``torch.load``.
        device_id: Index of the CUDA device used when CUDA is available.
    """
    device = f"cuda:{device_id}" if torch.cuda.is_available() else "cpu"
    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    state_dict = torch.load(model_dir, map_location=device)
    model.load_state_dict(state_dict)
# Hyper params
def use_cuda(enabled, device_id=0):
    """Select ``device_id`` as the current CUDA device when ``enabled`` is truthy.

    Does nothing when ``enabled`` is falsy.

    Args:
        enabled: Whether CUDA should be used.
        device_id: Index passed to ``torch.cuda.set_device``. Defaults to 0.

    Raises:
        AssertionError: If ``enabled`` is truthy but CUDA is not available.
    """
    if not enabled:
        return
    # Raised explicitly rather than with an ``assert`` statement so the check
    # is not stripped when Python runs with -O; the exception type and message
    # stay the same for existing callers.
    if not torch.cuda.is_available():
        raise AssertionError("CUDA is not available")
    torch.cuda.set_device(device_id)
def use_optimizer(
    optimizer_name: str,
    network: torch.nn.Module,
    learning_rate: float,
    momentum: float = 0,
    weight_decay: float = 0,
    alpha: float = 0.99,
) -> Optimizer:
    """Build a torch optimizer over ``network.parameters()``.

    Args:
        optimizer_name: One of "sgd", "adam" or "rmsprop".
        network: Module whose parameters are optimized.
        learning_rate: Passed as ``lr`` to the optimizer.
        momentum: Used by "sgd" and "rmsprop"; ignored by "adam".
        weight_decay: Passed to every supported optimizer.
        alpha: Used by "rmsprop" only.

    Returns:
        The constructed optimizer.

    Raises:
        ValueError: If ``optimizer_name`` is not one of the supported names.
    """
    if optimizer_name == "sgd":
        return torch.optim.SGD(
            network.parameters(),
            lr=learning_rate,
            momentum=momentum,
            weight_decay=weight_decay,
        )
    if optimizer_name == "adam":
        return torch.optim.Adam(
            network.parameters(), lr=learning_rate, weight_decay=weight_decay
        )
    if optimizer_name == "rmsprop":
        # weight_decay is forwarded here as well; it used to be dropped
        # silently for this optimizer only.
        return torch.optim.RMSprop(
            network.parameters(),
            lr=learning_rate,
            alpha=alpha,
            momentum=momentum,
            weight_decay=weight_decay,
        )
    raise ValueError(f"Optimizer '{optimizer_name}' is not supported")