public code v1
This commit is contained in:
@@ -0,0 +1,7 @@
|
||||
import yaml
|
||||
from box import Box
|
||||
|
||||
# Load the project configuration once, at import time.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default (relevant for non-ASCII values on Windows).
with open("configs/config.yml", "r", encoding="utf-8") as yml_file:
    full_cfg = yaml.safe_load(yml_file)

# Only the "base" section is exposed. It is shallow-copied so the Box wrapper
# does not share its top-level mapping with `full_cfg`.
# NOTE(review): default_box=True with default_box_attr=None presumably makes
# unknown keys resolve to None instead of raising - confirm against python-box.
cfg = Box({**full_cfg["base"]}, default_box=True, default_box_attr=None)
|
||||
@@ -0,0 +1,11 @@
|
||||
from .data_reader import DataReader
|
||||
from .user_item_rating_dataset import UserItemRatingDataset
|
||||
from .group_interaction_handler import GroupInteractionHandler
|
||||
from .user_item_dict import UserItemDict
|
||||
|
||||
# Public names re-exported by this package.
__all__ = ["DataReader", "UserItemRatingDataset", "GroupInteractionHandler", "UserItemDict"]
|
||||
@@ -0,0 +1,416 @@
|
||||
from typing import List, Optional, Union, cast
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import warnings
|
||||
|
||||
|
||||
class DataReader:
    """
    Load, validate and re-index a user-item rating table.

    A reader is built either from an in-memory DataFrame or from a delimited
    text file read with ``pandas.read_csv``. The table must contain the columns
    'userId', 'itemId' and 'rating'; other columns (for example 'timestamp')
    are carried along but are not validated by this class.
    """

    # Columns validated by the `dataset` setter.
    _REQUIRED_COLUMNS = ("userId", "itemId", "rating")

    def __init__(
        self,
        filepath_or_buffer: Optional[str] = None,
        sep: Optional[str] = None,
        names: Optional[List[str]] = None,
        skiprows: int = 0,
        dataframe: Optional[pd.DataFrame] = None,
    ) -> None:
        """
        Initialize the DataReader with either a DataFrame or file parameters.

        Args:
            filepath_or_buffer (Optional[str]): Path to the CSV file or buffer.
            sep (Optional[str]): Separator used in the CSV file.
            names (Optional[List[str]]): List of column names for the CSV file.
            skiprows (int, optional): Number of rows to skip in the CSV file. Defaults to 0.
            dataframe (Optional[pd.DataFrame], optional): A DataFrame to use directly. Defaults to None.

        Raises:
            ValueError: If neither `dataframe` nor a complete set of file parameters
                (`filepath_or_buffer`, `sep`, `names`) is provided, or if the data
                fails the validation done by the `dataset` setter.
            FileNotFoundError: If the file cannot be found when loading from file.
            pd.errors.ParserError: If the CSV file cannot be parsed when loading from file.

        Note:
            If `dataframe` is provided it takes precedence; file-related parameters
            are stored for reference but ignored, and a UserWarning is issued when
            any of them was passed.
        """
        has_file_params = bool(filepath_or_buffer and sep and names)
        if dataframe is None and not has_file_params:
            raise ValueError(
                "Must provide either a DataFrame or valid file parameters."
            )

        self.filepath_or_buffer = filepath_or_buffer
        self.sep = sep
        self.names = names
        self.skiprows = skiprows
        self._dataset: Optional[pd.DataFrame] = None
        self._raw_dataset: Optional[pd.DataFrame] = None
        self._num_user: Optional[int] = None
        self._num_item: Optional[int] = None
        self.original_user_id: Optional[pd.DataFrame] = None
        self.original_item_id: Optional[pd.DataFrame] = None
        self.new_user_id: Optional[pd.DataFrame] = None
        self.new_item_id: Optional[pd.DataFrame] = None

        if dataframe is not None:
            if any(param is not None for param in (filepath_or_buffer, sep, names)):
                warnings.warn(
                    "DataFrame provided; file parameters (filepath_or_buffer, sep, names) are ignored.",
                    UserWarning,
                )
            self.dataset = dataframe
            return

        # Eagerly load the file. Only the read itself is guarded so that
        # validation errors from the setter are not mistaken for I/O errors.
        try:
            loaded_df = pd.read_csv(
                filepath_or_buffer=self.filepath_or_buffer,
                sep=self.sep,
                names=self.names,
                skiprows=self.skiprows,
                engine="python",
            )
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"File not found: {self.filepath_or_buffer}"
            ) from e
        except pd.errors.ParserError as e:
            raise pd.errors.ParserError(f"Failed to parse CSV: {str(e)}") from e

        # The setter validates the frame and fills _raw_dataset/_num_user/_num_item.
        self.dataset = loaded_df

    @property
    def dataset(self) -> pd.DataFrame:
        """
        Get the dataset DataFrame.

        Raises:
            ValueError: If no dataset has been set.
        """
        if self._dataset is None:
            raise ValueError("Dataset is not loaded or is not valid.")
        return self._dataset

    @dataset.setter
    def dataset(self, new_data: pd.DataFrame) -> None:
        """
        Set the dataset and compute the number of unique users and items.

        Non-numeric 'userId', 'itemId' or 'rating' columns are converted with
        ``pd.to_numeric`` (a UserWarning is issued per column). The conversion
        is done on a copy, so the DataFrame passed in is never modified.

        Args:
            new_data (pd.DataFrame): The new dataset to set.

        Raises:
            ValueError: If the DataFrame is None, empty, lacks required columns,
                or contains invalid data types/missing values.
        """
        if new_data is None:
            raise ValueError("DataFrame cannot be None")
        if new_data.empty:
            raise ValueError("DataFrame cannot be empty")

        required = list(self._REQUIRED_COLUMNS)
        missing = [col for col in required if col not in new_data.columns]
        if missing:
            raise ValueError(f"DataFrame is missing required column(s): {missing}")

        non_numeric = [
            col for col in required
            if not pd.api.types.is_numeric_dtype(new_data[col])
        ]
        if non_numeric:
            # Work on a copy so the caller's frame keeps its original dtypes.
            new_data = new_data.copy()
            for col in non_numeric:
                warnings.warn(
                    f"Column '{col}' is not numeric. Attempting conversion.",
                    UserWarning,
                )
                try:
                    new_data[col] = pd.to_numeric(new_data[col])
                except (ValueError, TypeError) as e:
                    raise ValueError(
                        f"Column '{col}' cannot be converted to a numeric type."
                    ) from e

        if new_data[required].isnull().any().any():
            raise ValueError(
                "DataFrame contains missing values in essential columns (userId, itemId, rating)."
            )

        self._dataset = new_data
        self._raw_dataset = new_data.copy()
        self._num_user = int(self._dataset["userId"].nunique())
        self._num_item = int(self._dataset["itemId"].nunique())
        # Any previously built ID mappings describe the old dataset; drop them.
        self.original_user_id = None
        self.original_item_id = None
        self.new_user_id = None
        self.new_item_id = None

    def get_raw_dataset(self) -> pd.DataFrame:
        """
        Get the snapshot of the dataset taken when it was last set.

        The snapshot is a copy made by the `dataset` setter after validation,
        so it is unaffected by later re-indexing or in-place binarization.

        Returns:
            pd.DataFrame: The raw dataset.

        Raises:
            ValueError: If the raw dataset is not set.
        """
        if self._raw_dataset is None:
            raise ValueError(
                "Raw dataset is not set. Load data from file or set a DataFrame first."
            )
        return self._raw_dataset

    @staticmethod
    def _create_id_mapping(column: pd.Series, new_column_name: str) -> pd.DataFrame:
        """
        Create a mapping for consecutive IDs.

        IDs are numbered in order of first appearance in `column`.

        Args:
            column (pd.Series): The column to map.
            new_column_name (str): The name of the new column for consecutive IDs.

        Returns:
            pd.DataFrame: A DataFrame with the original and mapped IDs.

        Raises:
            ValueError: If the column is empty.
        """
        if column.empty:
            raise ValueError("Cannot create ID mapping for an empty column")
        unique_values = column.drop_duplicates().reset_index(drop=True)
        return pd.DataFrame(
            {column.name: unique_values, new_column_name: np.arange(len(unique_values))}
        )

    def make_consecutive_ids_in_dataset(self) -> None:
        """
        Map user and item IDs to consecutive integers starting from 0 in a deterministic way.

        The unique IDs of each column are sorted before numbering, so the same
        data always produces the same mapping. The internal dataset is replaced
        by a re-indexed copy and the forward/backward mappings are stored for
        the ``get_new_*`` / ``get_original_*`` lookups.

        Raises:
            ValueError: If no dataset has been set.
        """
        if self._dataset is None:
            raise ValueError("Dataset must be loaded or set before mapping IDs")

        dataset = self._dataset.copy()
        mappings = {}
        for column in ("userId", "itemId"):
            new_column = f"new_{column}"
            ordered_ids = sorted(dataset[column].unique())
            mapping = pd.DataFrame(
                {column: ordered_ids, new_column: range(len(ordered_ids))}
            )
            dataset[column] = (
                dataset[column].map(mapping.set_index(column)[new_column]).astype(int)
            )
            mappings[column] = mapping

        user_id_mapping = mappings["userId"]
        item_id_mapping = mappings["itemId"]

        # new -> original and original -> new lookup tables
        self.original_user_id = user_id_mapping.set_index("new_userId")
        self.original_item_id = item_id_mapping.set_index("new_itemId")
        self.new_user_id = user_id_mapping.set_index("userId")
        self.new_item_id = item_id_mapping.set_index("itemId")

        self._dataset = dataset
        # IDs now run 0..n-1, so the counts equal the mapping sizes (plain ints).
        self._num_user = len(user_id_mapping)
        self._num_item = len(item_id_mapping)

    def binarize(
        self, binary_threshold: float = 1, inplace: bool = True
    ) -> Optional[pd.DataFrame]:
        """
        Binarize ratings into 0 or 1 based on a threshold (implicit feedback).

        A rating becomes 1 when it is strictly greater than the threshold,
        otherwise 0.

        Args:
            binary_threshold (float, optional): Threshold for binarization. Defaults to 1.
            inplace (bool, optional): If True, modify the dataset in-place. If False, return a new DataFrame.
                Defaults to True.

        Returns:
            Optional[pd.DataFrame]: The binarized dataset if inplace=False, else None.

        Raises:
            ValueError: If the dataset is not set or binary_threshold is invalid.

        Example:
            Ratings [0.5, 2.0, 3.0] with threshold=1.0 -> [0, 1, 1]
        """
        if self._dataset is None:
            raise ValueError("Dataset must be loaded or set before binarization")
        if not isinstance(binary_threshold, (int, float, np.integer, np.floating)):
            raise ValueError("binary_threshold must be a number")

        dataset = self._dataset if inplace else self._dataset.copy()
        dataset["rating"] = (dataset["rating"] > binary_threshold).astype(int)

        if not inplace:
            return dataset
        self._dataset = dataset
        return None

    @property
    def num_user(self) -> int:
        """
        Get the number of unique users.

        Returns:
            int: Number of unique users.

        Raises:
            ValueError: If the dataset is not set.
        """
        if self._num_user is None:
            raise ValueError("Dataset must be loaded or set to compute num_user")
        return self._num_user

    @property
    def num_item(self) -> int:
        """
        Get the number of unique items.

        Returns:
            int: Number of unique items.

        Raises:
            ValueError: If the dataset is not set.
        """
        if self._num_item is None:
            raise ValueError("Dataset must be loaded or set to compute num_item")
        return self._num_item

    @staticmethod
    def _coerce_str_ids(ids):
        """Turn numeric strings (a scalar or list elements) into ints; leave other values untouched."""
        if isinstance(ids, str):
            return int(ids)
        if isinstance(ids, list):
            return [int(x) if isinstance(x, str) else x for x in ids]
        return ids

    @staticmethod
    def _lookup_ids(
        mapping: Optional[pd.DataFrame], keys, column: str, label: str
    ) -> Union[int, List[int]]:
        """Look up `keys` in the index of `mapping` and return the values of `column`."""
        if mapping is None:
            raise ValueError(
                "ID mapping not set. Call make_consecutive_ids_in_dataset first"
            )
        try:
            if isinstance(keys, (int, np.integer)):
                return int(mapping.loc[keys, column])  # type: ignore
            return cast(pd.Series, mapping.loc[keys, column]).tolist()
        except KeyError as e:
            raise ValueError(f"{label} ID(s) not found: {e}") from e

    def get_original_user_id(self, u: Union[int, List[int]]) -> Union[int, List[int]]:
        """
        Get the original user ID(s) from the new (consecutive) ID(s).

        Args:
            u (Union[int, List[int]]): New user ID(s).

        Returns:
            Union[int, List[int]]: Original user ID(s).

        Raises:
            ValueError: If ID mapping is not set or if any ID is not found.
        """
        return self._lookup_ids(self.original_user_id, u, "userId", "User")

    def get_original_item_id(self, i: Union[int, List[int]]) -> Union[int, List[int]]:
        """
        Get the original item ID(s) from the new (consecutive) ID(s).

        Args:
            i (Union[int, List[int]]): New item ID(s).

        Returns:
            Union[int, List[int]]: Original item ID(s).

        Raises:
            ValueError: If ID mapping is not set or if any ID is not found.
        """
        return self._lookup_ids(self.original_item_id, i, "itemId", "Item")

    def get_new_user_id(
        self, u: Union[Union[str, int], List[Union[str, int]]]
    ) -> Union[int, List[int]]:
        """
        Get the new (consecutive) user ID(s) from the original ID(s).

        String IDs are converted with ``int()`` first; lists may mix strings
        and integers.

        Args:
            u: Original user ID(s).

        Returns:
            New user ID(s).

        Raises:
            ValueError: If ID mapping is not set, a string is not an integer
                literal, or any ID is not found.
        """
        if self.new_user_id is None:
            raise ValueError(
                "ID mapping not set. Call make_consecutive_ids_in_dataset first"
            )
        return self._lookup_ids(
            self.new_user_id, self._coerce_str_ids(u), "new_userId", "User"
        )

    def get_new_item_id(
        self, i: Union[Union[str, int], List[Union[str, int]]]
    ) -> Union[int, List[int]]:
        """
        Get the new (consecutive) item ID(s) from the original ID(s).

        String IDs are converted with ``int()`` first; lists may mix strings
        and integers.

        Args:
            i: Original item ID(s).

        Returns:
            New item ID(s).

        Raises:
            ValueError: If ID mapping is not set, a string is not an integer
                literal, or any ID is not found.
        """
        if self.new_item_id is None:
            raise ValueError(
                "ID mapping not set. Call make_consecutive_ids_in_dataset first"
            )
        return self._lookup_ids(
            self.new_item_id, self._coerce_str_ids(i), "new_itemId", "Item"
        )
|
||||
@@ -0,0 +1,289 @@
|
||||
from typing import List, Optional, Union
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
|
||||
|
||||
class GroupInteractionHandler:
    """
    Helper for working with user groups on top of a rating dataset.

    It locates and reads group files, parses group IDs of the form
    "id1_id2_id3" into member lists, and answers item/interaction queries for
    a group against a DataReader-like object.
    """

    def __init__(self, filepath_or_buffer: Union[str, Path, List[Union[str, Path]]]):
        """
        Initialize the GroupInteractionHandler.

        Args:
            filepath_or_buffer: Path to directory containing group files or list of file paths.
                A directory is expanded to the files directly inside it (sorted,
                so lookups are deterministic); any other single path is kept as a
                one-element list.
        """
        if isinstance(filepath_or_buffer, (str, Path)):
            path = Path(filepath_or_buffer)
            if path.is_dir():
                # iterdir() yields entries in arbitrary order; sort for stable matching.
                self.filepath_or_buffer = sorted(
                    str(file) for file in path.iterdir() if file.is_file()
                )
            else:
                self.filepath_or_buffer = [str(path)]
        else:
            self.filepath_or_buffer = [str(Path(p)) for p in filepath_or_buffer]

    @staticmethod
    def _normalise_id(value):
        """Return integer-like values as a plain int; pass everything else (e.g. str) through."""
        return int(value) if isinstance(value, (int, np.integer)) else value

    def _get_group_filepath(self, filename: str) -> str:
        """
        Get a specific group file path by matching the filename.

        Paths whose file name contains `filename` are preferred. If there is
        none, any path containing `filename` anywhere is accepted, which keeps
        lookups by partial path working.

        Args:
            filename (str): The name of the file to search for.

        Returns:
            str: The matched file path (resolved).

        Raises:
            ValueError: Error: File does not exist
            ValueError: No file found containing '{filename}' in its name.
        """
        by_name = [p for p in self.filepath_or_buffer if filename in Path(p).name]
        candidates = by_name or [p for p in self.filepath_or_buffer if filename in p]
        if not candidates:
            raise ValueError(
                f"Error: No file found containing '{filename}' in its name."
            )

        path = Path(candidates[0]).resolve()
        if not path.exists():
            raise ValueError(f"Error: File does not exist: {path}")
        return str(path)

    def read_groups(self, filename: str) -> List[str]:
        """
        Method to read group IDs from a specified file.

        Each non-blank line of the file is one group ID; surrounding whitespace
        is stripped and blank lines are skipped.

        Args:
            filename (str): Name of the file containing group IDs.

        Returns:
            List[str]: List of group IDs.

        Raises:
            ValueError: If `filename` is empty or no matching file is found.
        """
        if not filename:
            raise ValueError("Groups path not specified in configuration")

        text = Path(self._get_group_filepath(filename)).read_text(encoding="utf-8")
        stripped = (line.strip() for line in text.splitlines())
        return [line for line in stripped if line]

    def parse_group_members(self, group: str) -> List[int]:
        """
        Parse group ID to get member IDs.

        Thin wrapper around `get_group_members`, so both methods agree on
        whitespace handling and on returning [] for an empty group string.

        Args:
            group: Group ID string

        Returns:
            List of member IDs
        """
        return self.get_group_members(group)

    def get_group_members(self, group: Union[List[Union[int, str]], str]) -> List[int]:
        """
        Get group members from a group ID string or list.

        Args:
            group: Group ID string in format "id1_id2_id3" or list of IDs

        Returns:
            List of member IDs as integers ([] for an empty/blank string)

        Raises:
            ValueError: If any member ID cannot be converted to an integer
            TypeError: If group is neither a string nor a list
        """
        if isinstance(group, list):
            return [int(member) for member in group]

        if not isinstance(group, str):
            raise TypeError(f"Expected string or list, got {type(group).__name__}")

        group = group.strip()
        if not group:
            return []

        try:
            return [int(member) for member in group.split("_")]
        except ValueError as e:
            raise ValueError(f"Invalid member ID in group: {str(e)}") from e

    def create_modified_dataset(
        self,
        original_data: Union[pd.DataFrame, "DataReader"],
        group_ids: List[Union[int, str]],
        item_ids: List[Union[int, str]],
        data: Optional["DataReader"] = None,
    ) -> pd.DataFrame:
        """
        Creates a modified dataset by removing interactions between specified groups and items.

        A row is removed only when its user is in `group_ids` AND its item is
        in `item_ids` (both given as original IDs).

        Args:
            original_data: Either a pandas DataFrame or a DataReader object containing the dataset
            group_ids: List of (original) user IDs whose interactions may be removed
            item_ids: List of (original) item IDs to consider for removal
            data: DataReader used for ID translation when original_data is a DataFrame

        Returns:
            pd.DataFrame: A pandas DataFrame with the specified interactions removed

        Raises:
            ValueError: If input data types are incorrect, or an ID cannot be translated
        """
        if isinstance(original_data, DataReader):
            data_reader = original_data
            dataset = original_data.dataset
        elif isinstance(original_data, pd.DataFrame) and isinstance(data, DataReader):
            data_reader = data
            dataset = original_data
        else:
            raise ValueError(
                "Either original_data must be a DataReader or data must be provided as a DataReader"
            )

        # Translate original IDs to the consecutive IDs used inside the dataset.
        new_group_ids = [
            data_reader.get_new_user_id(self._normalise_id(g)) for g in group_ids
        ]
        new_item_ids = [
            data_reader.get_new_item_id(self._normalise_id(i)) for i in item_ids
        ]

        drop = dataset.itemId.isin(new_item_ids) & dataset.userId.isin(new_group_ids)
        return dataset[~drop]

    def get_rated_items_by_all_group_members(
        self, group: List[Union[int, str]], original_data: "DataReader"
    ) -> np.ndarray:
        """
        Get all items rated by any member of the group.

        Despite the method name, this is the union over the members, not the
        intersection; see `get_common_rated_items` for the latter.

        Args:
            group: List of (original) user IDs
            original_data: Data object with mapping methods

        Returns:
            np.ndarray: Array of original item IDs rated by any group member
        """
        new_group = [
            original_data.get_new_user_id(self._normalise_id(g)) for g in group
        ]

        dataset = original_data.dataset
        group_items = dataset[dataset.userId.isin(new_group)]["itemId"].unique()

        original_ids = original_data.get_original_item_id(group_items.tolist())
        return np.array(original_ids)

    def get_common_rated_items(
        self, group: List[Union[int, str]], original_data: "DataReader"
    ) -> np.ndarray:
        """
        Get items rated by all members of the group (intersection of rated items).

        Args:
            group: List of (original) user IDs
            original_data: DataReader object with mapping methods

        Returns:
            np.ndarray: Array of original item IDs rated by all group members,
                ordered by their internal (consecutive) item ID; empty when the
                group is empty or the members share no item.
        """
        new_group = [
            original_data.get_new_user_id(self._normalise_id(g)) for g in group
        ]
        if not new_group:
            return np.array([])

        dataset = original_data.dataset
        # Restrict to the group's rows once, then split per member.
        group_rows = dataset[dataset.userId.isin(new_group)]
        items_per_member = [
            set(group_rows.loc[group_rows.userId == user_id, "itemId"])
            for user_id in new_group
        ]

        common_items = set.intersection(*items_per_member)
        if not common_items:
            return np.array([])

        # Sets are unordered; sort so repeated calls return the same sequence.
        ordered = [int(item) for item in sorted(common_items)]
        return np.array(original_data.get_original_item_id(ordered))

    def get_items_for_group_recommendation(
        self, data: pd.DataFrame, item_ids: np.ndarray, group: List[int]
    ) -> np.ndarray:
        """
        Get items for group recommendation (those not interacted with by any group member).

        No ID translation is done here: `group` and `item_ids` must use the
        same ID space as `data`.

        Args:
            data: DataFrame with interaction data
            item_ids: Array of all item IDs
            group: List of group member IDs

        Returns:
            Sorted, de-duplicated array of item IDs not interacted with by any group member
        """
        item_ids_group = data.loc[data.userId.isin(group), "itemId"]
        return np.setdiff1d(item_ids, item_ids_group)

    def get_group_preferences(
        self, group: List[Union[int, str]], data_reader: "DataReader"
    ) -> pd.DataFrame:
        """
        Get all preferences (ratings) by all members of the group.

        Args:
            group: List of (original) user IDs
            data_reader: DataReader object with the dataset

        Returns:
            pd.DataFrame: Copy of the dataset rows belonging to the group members
        """
        new_group = [
            data_reader.get_new_user_id(self._normalise_id(g)) for g in group
        ]

        dataset = data_reader.dataset
        return dataset[dataset.userId.isin(new_group)].copy()
|
||||
@@ -0,0 +1,36 @@
|
||||
from torch.utils.data import Dataset
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
|
||||
class UserItemDict(Dataset):
    """Pytorch Dataset that serves one dense rating vector per user."""

    def __init__(self, data, expl_matrix, expl):
        """
        args:

            data: DataFrame with 'userId', 'itemId' and 'rating' columns
            expl_matrix: matrix indexed as [user, :] that is added to the
                rating vector when `expl` is truthy
            expl: flag switching the addition of `expl_matrix` on or off
        """
        self.expl = expl
        self.expl_matrix = expl_matrix
        self.n_users = data.userId.nunique()
        self.n_items = data.itemId.nunique()
        # userId -> the items that user rated and the matching ratings
        self.users_dict = {
            user_id: {"items": list(rows.itemId), "rating": list(rows.rating)}
            for user_id, rows in data.groupby("userId")
        }

    def __getitem__(self, index):
        entry = self.users_dict[index]
        # Item IDs are used directly as positions in a vector of length n_items.
        dense = np.zeros(self.n_items)
        dense[entry["items"]] = entry["rating"]
        vector = torch.tensor(dense)
        if not self.expl:
            return vector
        return vector + self.expl_matrix[index, :]

    def __len__(self):
        return self.n_users
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
class UserItemRatingDataset(Dataset):
    """Pytorch Dataset over three parallel tensors: users, items and ratings."""

    def __init__(self, user_tensor, item_tensor, target_tensor):
        """
        args:

            user_tensor: torch.Tensor of user ids, one entry per interaction
            item_tensor: torch.Tensor of item ids, aligned with `user_tensor`
            target_tensor: torch.Tensor, the corresponding rating for <user, item> pair
        """
        self.user_tensor, self.item_tensor, self.target_tensor = (
            user_tensor,
            item_tensor,
            target_tensor,
        )

    def __getitem__(self, index):
        # One (user, item, rating) triple, taken at the same position of each tensor.
        columns = (self.user_tensor, self.item_tensor, self.target_tensor)
        return tuple(column[index] for column in columns)

    def __len__(self):
        # The dataset length is the size of the first dimension of the user tensor.
        return self.user_tensor.size(0)
|
||||
@@ -0,0 +1,15 @@
|
||||
from .splitter import Splitter
|
||||
from .model_evaluator import ModelEvaluator
|
||||
from .explainer_evaluator import ExplanationEvaluator
|
||||
from .evaluation_pipelines import (
|
||||
run_evaluation_with_proper_split,
|
||||
run_leave_one_out_evaluation,
|
||||
)
|
||||
|
||||
# Public names re-exported by this package.
__all__ = [
    "Splitter", "ModelEvaluator", "ExplanationEvaluator",
    "run_evaluation_with_proper_split", "run_leave_one_out_evaluation",
]
|
||||
@@ -0,0 +1,251 @@
|
||||
import time
|
||||
from typing import Dict
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
from pygrex.evaluator import Splitter, ModelEvaluator
|
||||
|
||||
|
||||
def run_leave_one_out_evaluation(
    data_reader: DataReader, model, top_n: int = 10
) -> Dict:
    """
    Run a leave-one-out evaluation of `model` and report ranking metrics.

    Steps: split with ``Splitter.split_leave_n_out(n=1)``, drop test rows whose
    user or item does not occur in the training data, fit the model on the
    training reader, generate top-N recommendations and score them with
    ``ModelEvaluator``. Progress and timings are printed.

    Args:
        data_reader: Reader holding the full dataset.
        model: Object exposing ``fit(train_reader)``.
        top_n: Length of the recommendation lists. Defaults to 10.

    Returns:
        Dict with the keys "Hit Ratio", "NDCG" and "evaluation_time" (seconds).
    """
    print("Starting leave-one-out evaluation...")
    started_at = time.time()

    # 1. Leave-one-out split: n=1 as passed to the splitter
    train_dr, test_df = Splitter.split_leave_n_out(data_reader, n=1)
    print(f"Split completed: {len(test_df)} test interactions")

    known_users = set(train_dr.dataset["userId"].unique())
    known_items = set(train_dr.dataset["itemId"].unique())

    original_test_len = len(test_df)
    user_is_known = test_df["userId"].isin(known_users)
    item_is_known = test_df["itemId"].isin(known_items)
    test_df = test_df[user_is_known & item_is_known]
    print(
        f"Filtered test set: {len(test_df)} interactions remaining from {original_test_len}"
    )

    # 2. Fit on the training part only
    print("Training model on reduced dataset...")
    fit_started_at = time.time()
    model.fit(train_dr)
    train_time = time.time() - fit_started_at
    print(f"Model training completed in {train_time:.2f} seconds")

    # 3. Produce the top-N lists for the test users
    print("Generating recommendations...")
    rec_started_at = time.time()
    recommendations = generate_recommendations_batch(model, train_dr, test_df, top_n)
    rec_time = time.time() - rec_started_at
    print(f"Recommendations generated in {rec_time:.2f} seconds")

    # 4. Score the lists against the held-out interactions
    evaluator = ModelEvaluator(test_df, top_n=top_n)
    metrics = {
        "Hit Ratio": evaluator.cal_hit_ratio(recommendations),
        "NDCG": evaluator.cal_ndcg(recommendations),  # standard NDCG, not eNDCG
    }

    total_time = time.time() - started_at
    print(f"Total evaluation time: {total_time:.2f} seconds")

    metrics["evaluation_time"] = total_time
    return metrics
|
||||
|
||||
|
||||
def generate_recommendations_batch(
    model,
    train_dr: DataReader,
    test_df: pd.DataFrame,
    top_n: int,
    max_candidates: int = 10000,
) -> pd.DataFrame:
    """
    Generate recommendations in batch mode for efficiency.

    For every user in `test_df`, the candidates are the training items the user
    has not interacted with in the training data. When there are more than
    `max_candidates` candidates they are randomly sub-sampled (unseeded
    ``np.random.choice``), but the user's own held-out items from `test_df` are
    always kept: dropping them would make a hit impossible and bias the
    metrics downward.

    Args:
        model: Recommender. ``recommend`` / ``predict_batch`` are used when
            present, otherwise per-item prediction is used.
        train_dr: Reader whose ``dataset`` holds the training interactions.
        test_df: Held-out interactions; must have a 'userId' column. An
            'itemId' column, when present, marks the items protected from
            sub-sampling.
        top_n: Number of recommendations per user.
        max_candidates: Upper bound on candidates scored per user. Defaults to 10000.

    Returns:
        DataFrame with columns: ['userId', 'itemId', 'rank', 'score']. Users for
        which generation raised are reported on stdout and skipped.
    """
    columns = ["userId", "itemId", "rank", "score"]

    train_df = train_dr.dataset
    all_items = set(train_df["itemId"].unique())

    test_users = test_df["userId"].unique()
    print(f"Generating recommendations for {len(test_users)} users...")

    # Build user -> seen items in one pass instead of filtering the whole
    # training frame once per user.
    seen_by_user: dict = {}
    relevant = train_df[train_df["userId"].isin(test_users)]
    for uid, iid in zip(relevant["userId"], relevant["itemId"]):
        seen_by_user.setdefault(uid, set()).add(iid)

    # user -> held-out items, used only to protect them from sub-sampling.
    held_out_by_user: dict = {}
    if "itemId" in test_df.columns:
        for uid, iid in zip(test_df["userId"], test_df["itemId"]):
            held_out_by_user.setdefault(uid, set()).add(iid)

    use_efficient = hasattr(model, "predict_batch") or hasattr(model, "recommend")
    recommendations = []

    for i, user_id in enumerate(test_users):
        if i % 100 == 0:  # Progress indicator
            print(f"Processing user {i}/{len(test_users)}")

        # Candidate items (unseen items)
        candidate_items = list(all_items - seen_by_user.get(user_id, set()))

        if len(candidate_items) > max_candidates:
            protected = held_out_by_user.get(user_id, set())
            kept = [c for c in candidate_items if c in protected]
            pool = [c for c in candidate_items if c not in protected]
            n_sampled = max(max_candidates - len(kept), 0)
            sampled = (
                np.random.choice(pool, n_sampled, replace=False).tolist()
                if n_sampled and pool
                else []
            )
            candidate_items = kept + sampled

        # Deliberately best-effort: one failing user must not abort the whole run.
        try:
            if use_efficient:
                user_recs = generate_recommendations_efficient(
                    model, user_id, candidate_items, top_n
                )
            else:
                # Fall back to individual predictions (slower)
                user_recs = generate_recommendations_individual(
                    model, user_id, candidate_items, top_n
                )
            recommendations.extend(user_recs)
        except Exception as e:
            print(f"Error generating recommendations for user {user_id}: {e}")
            continue

    if recommendations:
        return pd.DataFrame(recommendations, columns=columns)
    # Return empty DataFrame with correct structure
    return pd.DataFrame(columns=columns)
|
||||
|
||||
|
||||
def generate_recommendations_efficient(
    model, user_id: int, candidate_items: list, top_n: int
) -> list:
    """
    Build one user's top-N list through the fastest interface the model exposes.

    ``model.recommend`` is used when present, otherwise ``model.predict_batch``.
    If the model offers neither, or the chosen call raises, the work is handed
    to ``generate_recommendations_individual``.

    Returns:
        A list of ``(user_id, item_id, rank, score)`` tuples, ranks starting at 1.
    """
    if hasattr(model, "recommend"):
        try:
            # The model ranks the candidates itself; only the 1-based rank is added.
            ranked = model.recommend(user_id, candidate_items, top_n)
            return [
                (user_id, item_id, position, score)
                for position, (item_id, score) in enumerate(ranked, 1)
            ]
        except Exception:
            # Any failure drops through to the per-item fallback at the bottom.
            pass
    elif hasattr(model, "predict_batch"):
        try:
            pairs = [(user_id, item_id) for item_id in candidate_items]
            batch_scores = model.predict_batch(pairs)

            # Highest score first, then keep the first top_n entries.
            ordered = sorted(
                zip(candidate_items, batch_scores),
                key=lambda pair: pair[1],
                reverse=True,
            )
            return [
                (user_id, item_id, position, score)
                for position, (item_id, score) in enumerate(ordered[:top_n], 1)
            ]
        except Exception:
            # Any failure drops through to the per-item fallback at the bottom.
            pass

    return generate_recommendations_individual(
        model, user_id, candidate_items, top_n
    )
|
||||
|
||||
|
||||
def generate_recommendations_individual(
    model, user_id: int, candidate_items: list, top_n: int
) -> list:
    """
    Score candidates one at a time with ``model.predict`` and keep the top N.

    This is the slow path that works with any model exposing
    ``predict(user_id, item_id)``.

    Args:
        model: Object with a ``predict(user_id, item_id)`` method.
        user_id: User to score the candidates for.
        candidate_items: Item ids to score; any iterable is accepted.
        top_n: Maximum number of recommendations to return. Values below
            zero are treated as zero.

    Returns:
        A list of ``(user_id, item_id, rank, score)`` tuples ordered by
        descending score, ranks starting at 1.
    """
    scored = []
    for item_id in candidate_items:
        try:
            scored.append((item_id, model.predict(user_id, item_id)))
        except Exception as e:
            # Best effort: report the failure and leave this item out.
            print(f"Prediction error for user {user_id}, item {item_id}: {e}")

    # list.sort is stable, so equally scored items keep their candidate order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # A negative slice bound would drop items from the tail instead of
    # returning nothing, so clamp it.
    limit = max(top_n, 0)
    return [
        (user_id, item_id, rank, score)
        for rank, (item_id, score) in enumerate(scored[:limit], 1)
    ]
|
||||
|
||||
|
||||
def run_evaluation_with_proper_split(
    data_reader: DataReader, model, test_size: float = 0.2, top_n: int = 10
) -> Dict:
    """
    Evaluate ``model`` on a random per-user hold-out instead of leave-one-out.

    The split is made with ``Splitter.split_leave_n_out(..., frac=test_size)``,
    the model is fitted on the training part, and Hit Ratio and NDCG are
    computed on the held-out part.

    Returns:
        A dict with the keys ``"Hit Ratio"``, ``"NDCG"``, ``"evaluation_time"``
        (seconds), ``"test_interactions"`` and ``"total_recommendations"``.
    """
    print(f"Starting evaluation with {test_size * 100}% test split...")
    started_at = time.time()

    # Step 1: hold out a fraction of every user's interactions.
    train_dr, test_df = Splitter.split_leave_n_out(data_reader, frac=test_size)
    print(f"Split completed: {len(test_df)} test interactions")

    # Step 2: drop test rows whose user or item never appears in training.
    known_users = set(train_dr.dataset["userId"].unique())
    known_items = set(train_dr.dataset["itemId"].unique())
    original_test_len = len(test_df)
    keep_row = test_df["userId"].isin(known_users) & test_df["itemId"].isin(
        known_items
    )
    test_df = test_df[keep_row]
    print(
        f"Filtered test set: {len(test_df)} interactions remaining from {original_test_len}"
    )

    # Step 3: fit the model on the training part only.
    print("Training model...")
    model.fit(train_dr)

    # Step 4: produce a top-N list for every test user.
    print("Generating recommendations...")
    recommendations = generate_recommendations_batch(model, train_dr, test_df, top_n)

    # Step 5: score the lists against the held-out interactions.
    evaluator = ModelEvaluator(test_df, top_n=top_n)
    hit_ratio = evaluator.cal_hit_ratio(recommendations)
    ndcg = evaluator.cal_ndcg(recommendations)

    total_time = time.time() - started_at
    print(f"Evaluation completed in {total_time:.2f} seconds")

    summary = {
        "Hit Ratio": hit_ratio,
        "NDCG": ndcg,
        "evaluation_time": total_time,
        "test_interactions": len(test_df),
        "total_recommendations": len(recommendations),
    }
    return summary
|
||||
@@ -0,0 +1,68 @@
|
||||
from typing import Dict, Any
|
||||
|
||||
from pygrex.utils import calculate_gild_for_explanations
|
||||
|
||||
|
||||
class ExplanationEvaluator:
    """
    Single entry point for scoring the output of any explanation method.

    Given the result dictionary produced by an explainer, it reports a fixed
    set of quality metrics: Fidelity and Diversity (GILD).
    """

    def __init__(self):
        """Create the evaluator; it keeps no state between calls."""

    def evaluate(
        self, explanation_results: Dict[str, Any], explainer_type: str
    ) -> Dict[str, float]:
        """
        Compute every supported metric for one explanation result.

        Args:
            explanation_results: The dictionary returned by an explainer's
                `find_explanation` method.
            explainer_type: A string identifier for the explainer used
                (e.g., "LORE4Groups", "EXPGRS").

        Returns:
            A dictionary with the keys ``"fidelity"`` and ``"gild"``; both are
            0.0 when ``explanation_results`` is empty.
        """
        if explanation_results:
            return {
                "fidelity": self._calculate_fidelity(explanation_results),
                "gild": self._calculate_gild(explanation_results, explainer_type),
            }
        return {"fidelity": 0.0, "gild": 0.0}

    def _calculate_fidelity(self, explanation_results: Dict[str, Any]) -> float:
        """
        Read the fidelity score out of the explanation results.

        The value is taken from the ``"fidelity"`` key as the explainer stored
        it; 0.0 is returned when the key is absent.
        """
        return explanation_results.get("fidelity", 0.0)

    def _calculate_gild(
        self, explanation_results: Dict[str, Any], explainer_type: str
    ) -> float:
        """
        Compute the Gaussian Inter-List Diversity (GILD) of the explanations.

        The ``"details"`` entry of the results is passed to
        ``calculate_gild_for_explanations``; a missing or empty entry yields 0.0
        without calling the helper.
        """
        details = explanation_results.get("details", {})
        if details:
            return calculate_gild_for_explanations(details, explainer_type)
        return 0.0
|
||||
@@ -0,0 +1,179 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class ModelEvaluator:
    """
    Top-N ranking metrics (hit ratio and nDCG) against a held-out test set.

    ``test_set`` is a dataframe with at least the columns ``userId`` and
    ``itemId``; every row counts as one positive item for that user. The
    rows may be in any order.
    """

    disc_functions = ["log", "linear"]

    def __init__(self, test_set, top_n: int = 10, discount_function: str = "log"):
        """
        :param test_set: dataframe, columns include ['userId', 'itemId']
        :param top_n: only recommendations with rank <= top_n are evaluated
        :param discount_function: one of ``disc_functions``
        :raises AssertionError: if ``discount_function`` is not supported
        """
        self.test_set = test_set
        self._top_n = top_n
        self._check_discount_function(discount_function)
        self._discount_function = discount_function
        self.num_users = self.test_set.userId.nunique()

    @classmethod
    def _check_discount_function(cls, discount_function: str) -> None:
        """Reject unknown discount names; raised explicitly so ``-O`` cannot strip it."""
        if discount_function not in cls.disc_functions:
            raise AssertionError("Wrong Discount Function.")

    @property
    def top_n(self):
        return self._top_n

    @top_n.setter
    def top_n(self, top_n: int):
        self._top_n = top_n

    @property
    def discount_function(self):
        return self._discount_function

    @discount_function.setter
    def discount_function(self, discount_function: str):
        self._check_discount_function(discount_function)
        self._discount_function = discount_function

    def cal_hit_ratio(self, recommendations):
        """
        Hit Ratio
        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: mean over test users of (hits in the top-N) / (positives in the test set).
        """
        test_in_top_n = self.get_hits(recommendations)
        # count hits per user
        hits_per_user = self.count_positives(test_in_top_n)
        # Right-join onto the per-user positive counts so that users without
        # any hit are kept; their hit count comes back as NA.
        hits_per_user = hits_per_user.merge(
            self.count_positives(self.test_set),
            on="userId",
            suffixes=("_true", ""),
            how="right",
        )
        hits_per_user = hits_per_user.fillna(0)
        # hit rate per user, then the average over users
        hit_rate = hits_per_user.positive_true / hits_per_user.positive
        return hit_rate.mean()

    def get_hits(self, recommendations):
        """
        Find which items in the test set have a hit on the recommendations.
        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe with the top-N recommendations that also occur in the test set.
        """
        top_n_recommendations = self.filter_to_top_n(recommendations)
        # The default inner join keeps only (userId, itemId) pairs present in both frames.
        return pd.merge(top_n_recommendations, self.test_set, on=["userId", "itemId"])

    def filter_to_top_n(self, dataset):
        """
        if rank > top_n, we do not use it for evaluation
        :param dataset: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe, columns = ['userId', 'itemId', 'rank']
        """
        return dataset[dataset["rank"] <= self.top_n]

    def cal_ndcg(self, recommendations):
        r"""
        Normalized Discounted Cumulative Gain at N (nDCG@N): the ratio of the
        Discounted Cumulative Gain (DCG) of the recommended list to the ideal
        DCG (IDCG), averaged over the users of the test set.

            nDCG = \frac{DCG}{IDCG}

        Every hit has a constant gain of 1, discounted by the configured
        discount function (see ``cal_dcg``). Users without any hit contribute 0.

        Ref: Y. Wang, L. Wang, Y. Li, D. He, T.-Y. Liu, and W. Chen.
        A theoretical analysis of NDCG type ranking measures.
        :param recommendations: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: nDCG
        """
        hits = self.get_hits(recommendations)

        DCG = self.cal_dcg(hits)
        iDCG = self.cal_idcg()

        # Left-join on the ideal DCG so test users without hits are kept (dcg -> 0).
        nDCG = iDCG.merge(DCG, on="userId", how="left")
        nDCG = nDCG.fillna(0)
        nDCG["ndcg"] = nDCG["dcg"] / nDCG["idcg"]

        return nDCG["ndcg"].mean()

    def cal_dcg(self, hits):
        """
        Discounted Cumulative Gain
        :param hits: dataframe, columns = ['userId', 'itemId', 'rank']
        :return: dataframe, columns = ['userId', 'dcg'], one row per user in ``hits``
        """
        # todo: the gain so far is set to a constant.
        if self.discount_function == "log":
            # log(2) / log(rank + 1) == 1 / log2(rank + 1)
            discounted_gain = np.log(2) / np.log(hits["rank"] + 1)
        else:
            # "linear": the only other value the validation lets through
            discounted_gain = 1 / hits["rank"]

        # assign() works on a copy, so the caller's frame is left untouched.
        # reset_index() takes the user ids from the groupby result itself, which
        # keeps every dcg value attached to the user it was computed for.
        per_user = (
            hits.assign(discounted_gain=discounted_gain)
            .groupby("userId")["discounted_gain"]
            .sum()
        )
        return per_user.rename("dcg").reset_index()

    def cal_idcg(self):
        """
        the Ideal DCG, is the DCG for the best ranking possible (i.e. all true positives were recommended first).
        :return: dataframe, columns = ['userId', 'idcg']
        """
        # Fake an ideal ranking: each user's test items take ranks 1..k.
        # cumcount() numbers the rows within each user wherever they sit in
        # the frame, so the test set does not have to be sorted by userId.
        test_ideal_ranking = self.test_set.copy()
        test_ideal_ranking["rank"] = test_ideal_ranking.groupby("userId").cumcount() + 1
        # Filter to have at most top-N items.
        test_ideal_ranking = self.filter_to_top_n(test_ideal_ranking)
        idcg = self.cal_dcg(test_ideal_ranking)
        return idcg.rename(columns={"dcg": "idcg"})

    @staticmethod
    def count_positives(dataset):
        """
        Returns the positives count.
        :param dataset: dataframe, columns include ['userId', 'itemId']
        :return: dataframe, columns = ['userId', 'positive']
        """
        # reset_index() takes userId from the groupby result, so each count
        # stays paired with its own user regardless of row order.
        positives_per_user = dataset.groupby("userId")["itemId"].count()
        return positives_per_user.rename("positive").reset_index()
|
||||
|
||||
|
||||
# if __name__ == '__main__':
|
||||
## recoms = pd.DataFrame({
|
||||
# 'userId': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||||
# 'itemId': [1, 2, 3, 4, 1, 2, 2, 3, 4],
|
||||
# 'rank': [1, 2, 3, 1, 2, 3, 1, 2, 3]
|
||||
# })
|
||||
|
||||
# test = pd.DataFrame({
|
||||
# 'userId': [1, 1, 2, 3],
|
||||
# 'itemId': [1, 4, 1, 5]
|
||||
# })
|
||||
|
||||
# eval = Evaluator(test_set=test, top_n=2)
|
||||
|
||||
# assert eval.num_users == 3, 'number of users'
|
||||
# assert eval.top_n == 2, 'number of top n'
|
||||
# eval.top_n = 3
|
||||
# assert eval.top_n == 3, 'changing of top n'
|
||||
|
||||
# print(eval.cal_hit_ratio(recoms))
|
||||
# print(eval.cal_ndcg(recoms))
|
||||
@@ -0,0 +1,169 @@
|
||||
import sys
|
||||
import random
|
||||
import pandas as pd
|
||||
import copy
|
||||
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
|
||||
|
||||
def fix_data_reader_mappings(source: DataReader, target: DataReader):
    """
    Copy the user/item counts and the ID mappings from ``source`` onto ``target``.

    ``target`` is modified in place and also returned.
    """
    carried_over = (
        "_num_user",
        "_num_item",
        # original <-> new ID mappings
        "original_user_id",
        "original_item_id",
        "new_user_id",
        "new_item_id",
    )
    for attribute in carried_over:
        setattr(target, attribute, getattr(source, attribute))
    return target
|
||||
|
||||
|
||||
class Splitter:
|
||||
"""
|
||||
Super Splitting Class.
|
||||
args:
|
||||
data: DataReader object, which contains in its dataset attribute 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def split_leave_latest_out(data: DataReader, n_latest: int = 1):
|
||||
"""
|
||||
Leave N latest interactions out train/test split.
|
||||
Ref:
|
||||
Campos, Pedro G., Fernando Díez, and Iván Cantador. "Time-aware recommender systems: a comprehensive survey and
|
||||
analysis of existing evaluation protocols." User Modeling and User-Adapted Interaction 24.1-2 (2014): 67-119.
|
||||
:param data:
|
||||
:param n_latest: int, number of latest interactions to be in the the test set.
|
||||
:returns train as DataReader, test as data.frames
|
||||
"""
|
||||
|
||||
# group items by suer id and rank them by timestamp
|
||||
rank_latest = data.dataset.groupby(["userId"])["timestamp"].rank(
|
||||
method="first", ascending=False
|
||||
)
|
||||
|
||||
# keep in test items that are ranked higher than n_latest
|
||||
test = data.dataset[rank_latest <= n_latest]
|
||||
# keep in train the rest
|
||||
train = DataReader(dataframe=data.dataset.copy())
|
||||
train.dataset = data.dataset[rank_latest > n_latest]
|
||||
|
||||
train = fix_data_reader_mappings(data, train)
|
||||
|
||||
return train, test
|
||||
|
||||
@staticmethod
|
||||
def split_leave_n_out(data: DataReader, n: int = 1, frac: float | None = None):
|
||||
"""
|
||||
Leave N latest interactions out train/test split.
|
||||
Ref:
|
||||
Shani, Guy, and Asela Gunawardana. "Evaluating recommendation systems." Recommender systems handbook. Springer,
|
||||
Boston, MA, 2011. 257-297.
|
||||
:param data:
|
||||
:param n int, number of interactions to be in the the test set.
|
||||
:param frac float, fraction.
|
||||
:returns dataframe train and test
|
||||
"""
|
||||
min_nr_ratings_user = min(data.dataset["userId"].value_counts())
|
||||
|
||||
if min_nr_ratings_user < n:
|
||||
sys.exit(
|
||||
"split_leave_n_out: There are users with less ratings than n (required number of interactions "
|
||||
"in the test set)."
|
||||
)
|
||||
|
||||
if frac is not None and frac > 1:
|
||||
sys.exit("f (i.e.) fraction should be smaller than 1.")
|
||||
|
||||
# group items by user id and extraxt a random number of items per user
|
||||
grouped = data.dataset.groupby(["userId"])
|
||||
if frac is not None:
|
||||
test = grouped.sample(frac=frac)
|
||||
else:
|
||||
test = grouped.sample(n=n)
|
||||
|
||||
test = test.reset_index(drop=True)
|
||||
train_pd = pd.merge(
|
||||
data.dataset,
|
||||
test,
|
||||
on=list(data.dataset.columns),
|
||||
how="outer",
|
||||
indicator=True,
|
||||
)
|
||||
train_pd = train_pd[train_pd["_merge"] == "left_only"]
|
||||
train_pd = train_pd.drop(columns="_merge")
|
||||
|
||||
train = copy.deepcopy(data)
|
||||
train.dataset = train_pd
|
||||
train = fix_data_reader_mappings(data, train)
|
||||
assert test.shape[0] + train_pd.shape[0] == data.dataset.shape[0]
|
||||
|
||||
return train, test
|
||||
|
||||
def rel_plus_n(
|
||||
self,
|
||||
data,
|
||||
negative_sample_size: int = 99,
|
||||
splitting: str = "latest",
|
||||
n: int = 1,
|
||||
):
|
||||
"""
|
||||
RelPlusN: We build the users test set by extracting one relevant random item ($HR_u$) from the entire set of
|
||||
rated items. Then a set of random items with unknown relevance ($NR_u$), is extracted for each user $u$, where $u$
|
||||
had no previous interaction with these items. Finally, for each item $i$ in $HR_u$, the algorithm requests a ranking
|
||||
of the top-$N$ items from the set $ {i} cup NR_u$, on which the evaluation is performed. The evaluation metrics
|
||||
are averaged over all the items in $HR_u$ and later over all the users. In the following, all experiments have been
|
||||
conducted according to this protocol.
|
||||
Ref:
|
||||
- Paolo Cremonesi, Yehuda Koren, and Roberto Turrin. 2010. Performance of Recommender Algorithms on Top-n
|
||||
Recommendation Tasks. InProceedings ofthe Fourth ACM Conference on Recommender Systems (RecSys ’10).
|
||||
- Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. 2017. Neural Collaborative
|
||||
Filtering. In Proceedings of the 26th InternationalConference on World Wide Web (WWW ’17).
|
||||
:param data
|
||||
:param negative_sample_size how many negative items to compute
|
||||
:param splitting either latest for leave n latest out, or n for leave n out
|
||||
:param n how many to leave out
|
||||
|
||||
"""
|
||||
|
||||
if splitting == "latest":
|
||||
train, test = self.split_leave_latest_out(data, n)
|
||||
elif splitting == "n":
|
||||
train, test = self.split_leave_n_out(data, n)
|
||||
else:
|
||||
sys.exit('splitting can be either "latest" or "n". ')
|
||||
|
||||
neg_sample = self.sample_negative(data, negative_sample_size)
|
||||
|
||||
return train, pd.concat([test, neg_sample], ignore_index=True)
|
||||
|
||||
@staticmethod
|
||||
def sample_negative(data, negative_sample_size):
|
||||
"""return all negative items"""
|
||||
|
||||
item_catalogue = set(data.dataset["itemId"])
|
||||
|
||||
interact_status = (
|
||||
data.dataset.groupby("userId")["itemId"]
|
||||
.apply(set)
|
||||
.reset_index()
|
||||
.rename(columns={"itemId": "interacted_items"})
|
||||
)
|
||||
interact_status["negative_items"] = interact_status["interacted_items"].apply(
|
||||
lambda x: item_catalogue - x
|
||||
)
|
||||
interact_status["negative_samples"] = interact_status["negative_items"].apply(
|
||||
lambda x: random.sample(x, negative_sample_size)
|
||||
)
|
||||
interact_status = interact_status[["userId", "negative_samples"]]
|
||||
|
||||
userId = []
|
||||
itemId = []
|
||||
for row in interact_status.itertuples():
|
||||
for i in range(negative_sample_size):
|
||||
userId.append(int(row.userId))
|
||||
itemId.append(int(row.negative_samples[i]))
|
||||
|
||||
return pd.DataFrame.from_dict({"userId": userId, "itemId": itemId})
|
||||
@@ -0,0 +1,18 @@
|
||||
from .individual.model_based_emf import EMFExplainer
|
||||
from .individual.model_based_als_explain import ALSExplainer
|
||||
from .individual.post_hoc_association_rules import ARPostHocExplainer
|
||||
from .individual.post_hoc_knn import KNNPostHocExplainer
|
||||
from .groups.rule_based_group_rec_explainer import RuleBasedGroupRecExplainer
|
||||
from .groups.sliding_window_explainer import SlidingWindowExplainer
|
||||
from .groups.lore4groups_explainer import LORE4GroupsExplainer
|
||||
|
||||
|
||||
__all__ = [
|
||||
"EMFExplainer",
|
||||
"ALSExplainer",
|
||||
"ARPostHocExplainer",
|
||||
"KNNPostHocExplainer",
|
||||
"RuleBasedGroupRecExplainer",
|
||||
"SlidingWindowExplainer",
|
||||
"LORE4GroupsExplainer",
|
||||
]
|
||||
@@ -0,0 +1,10 @@
|
||||
from .rule_based_group_rec_explainer import RuleBasedGroupRecExplainer
|
||||
from .sliding_window_explainer import SlidingWindowExplainer
|
||||
from .lore4groups_explainer import LORE4GroupsExplainer
|
||||
|
||||
|
||||
__all__ = [
|
||||
"RuleBasedGroupRecExplainer",
|
||||
"SlidingWindowExplainer",
|
||||
"LORE4GroupsExplainer",
|
||||
]
|
||||
@@ -0,0 +1,731 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
import logging
|
||||
import traceback
|
||||
from collections import Counter
|
||||
from typing import Dict, Set, List, Optional, Any, Tuple, Union
|
||||
from sklearn.tree import DecisionTreeClassifier, _tree
|
||||
|
||||
ItemId = Union[str, int]
|
||||
UserId = Union[str, int]
|
||||
FactualRule = List[str]
|
||||
CounterfactualSet = List[List[str]]
|
||||
Explanation = Tuple[Optional[FactualRule], Optional[CounterfactualSet]]
|
||||
|
||||
|
||||
class LORE4GroupsExplainer:
|
||||
"""
|
||||
Enhanced LORE4Groups explainer that incorporates genre information
|
||||
and stores decision trees for visualization
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
item_profiles: Dict[str, Set[str]],
|
||||
item_label_matrix: pd.DataFrame,
|
||||
config: Dict,
|
||||
genre_profiles: Optional[Dict[str, Set[str]]] = None,
|
||||
):
|
||||
self.item_profiles = {str(k): v for k, v in item_profiles.items()}
|
||||
self.item_label_matrix = item_label_matrix
|
||||
self.params = config["explainer"]["lore4groups"]
|
||||
|
||||
# NEW: Store genre information
|
||||
self.genre_profiles = (
|
||||
{str(k): v for k, v in genre_profiles.items()} if genre_profiles else {}
|
||||
)
|
||||
|
||||
all_columns = item_label_matrix.columns.tolist()
|
||||
self.all_labels = [col for col in all_columns if col != "like"]
|
||||
|
||||
# Add 'like' back for target variable access (but not as feature)
|
||||
if "like" in all_columns:
|
||||
self.all_labels.append("like")
|
||||
|
||||
def _enhanced_jaccard_similarity(self, item1_id: ItemId, item2_id: ItemId) -> float:
    """
    Jaccard similarity of two items over their tags and genres combined.

    Genres are folded into the tag set as ``genre_<lower-cased name>``.
    Returns 0.0 when either item ends up with no features at all.
    """
    key1, key2 = str(item1_id), str(item2_id)

    genre_tokens1 = {
        f"genre_{g.lower()}" for g in self.genre_profiles.get(key1, set())
    }
    genre_tokens2 = {
        f"genre_{g.lower()}" for g in self.genre_profiles.get(key2, set())
    }

    features1 = self.item_profiles.get(key1, set()).union(genre_tokens1)
    features2 = self.item_profiles.get(key2, set()).union(genre_tokens2)

    if not (features1 and features2):
        return 0.0

    shared = len(features1.intersection(features2))
    total = len(features1.union(features2))
    if total > 0:
        return shared / total
    return 0.0
|
||||
|
||||
def _jaccard_similarity(self, item1_id: ItemId, item2_id: ItemId) -> float:
    """Tag-only Jaccard similarity (the original variant, kept for compatibility)."""
    first_tags = self.item_profiles.get(str(item1_id), set())
    second_tags = self.item_profiles.get(str(item2_id), set())

    # An item without tags is treated as similar to nothing.
    if not (first_tags and second_tags):
        return 0.0

    total = len(first_tags.union(second_tags))
    if total > 0:
        return len(first_tags.intersection(second_tags)) / total
    return 0.0
|
||||
|
||||
def _get_enhanced_similar_examples(
    self,
    user_id_consecutive: UserId,
    target_item_id: ItemId,
    user_hist: Set[ItemId],
    dataset: pd.DataFrame,
    model=None,
    data_reader=None,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Build the local training examples for one (user, target item) pair.

    Steps:
      1. Score every item in ``user_hist`` against ``target_item_id`` with
         ``_enhanced_jaccard_similarity`` and keep the most similar ones.
      2. Collect the user's ratings for those items from ``dataset``; items
         without a rating get a predicted one when both ``model`` and
         ``data_reader`` are supplied.
      3. Derive a binary ``like`` label from the ratings.
      4. Encode every example as 0/1 tag and genre features.

    Returns:
        ``(examples, metadata)``. ``examples`` has the columns ``movie_id``,
        ``rating``, ``like`` plus one column per feature. ``metadata`` carries
        ``feature_labels``, ``genre_features``, ``similarity_scores``,
        ``target_item_genres``, ``rating_threshold`` and ``threshold_info``.
        An empty DataFrame and an empty dict are returned whenever no usable
        two-class example set can be built.
    """

    # 1. Find all similar items using enhanced similarity
    similarities = [
        (seen_id, self._enhanced_jaccard_similarity(target_item_id, seen_id))
        for seen_id in user_hist
    ]
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

    # Keep at most n_similar_for_tree items, and only those reaching the
    # similarity threshold (0.0 when the parameter is not configured).
    sim_th = self.params.get("similarity_threshold", 0.0)
    top_similar_items_str = {
        item[0]
        for item in similarities[: self.params["n_similar_for_tree"]]
        if item[1] >= sim_th
    }

    if not top_similar_items_str:
        return pd.DataFrame(), {}

    # 2. Build the local dataset
    # NOTE(review): despite the "_str" name, the elements are whatever
    # user_hist contains; they must be convertible with int() — confirm caller.
    top_similar_items_int = [int(i) for i in top_similar_items_str]

    # Get existing ratings for similar items
    local_df = dataset[
        (dataset["userId"] == user_id_consecutive)
        & (dataset["itemId"].isin(top_similar_items_int))
    ].copy()

    rated_items = set(local_df["itemId"])
    items_to_predict = [
        item for item in top_similar_items_int if item not in rated_items
    ]

    # Add predictions for unrated items
    if model and data_reader and items_to_predict:
        try:
            # The model is queried with original IDs, so the IDs used here
            # are translated back through the data reader first.
            orig_user_id = data_reader.get_original_user_id(
                int(user_id_consecutive)
            )
            predicted_ratings = []

            for item_id_consecutive in items_to_predict:
                orig_item_id = data_reader.get_original_item_id(
                    int(item_id_consecutive)
                )
                pred = model.predict(orig_user_id, orig_item_id)
                predicted_ratings.append(
                    {
                        "userId": user_id_consecutive,
                        "itemId": item_id_consecutive,
                        "rating": float(pred),
                    }
                )

            if predicted_ratings:
                pred_df = pd.DataFrame(predicted_ratings)
                local_df = pd.concat([local_df, pred_df], ignore_index=True)

        except Exception:
            # Best effort: a failure anywhere above discards all predictions
            # (they are only appended at the end); the traceback is printed
            # and processing continues with the existing ratings.
            traceback.print_exc()

    # Check minimum samples requirement
    if len(local_df) < 2:
        return pd.DataFrame(), {}

    # 3. Apply thresholding with fallbacks
    rating_threshold = self.params["rating_threshold_for_like"]

    # Records which threshold was finally used so it can be reported.
    threshold_info = {
        "was_overridden": False,
        "original_threshold": rating_threshold,
        "final_threshold": rating_threshold,
    }

    local_df["like"] = (local_df["rating"] >= rating_threshold).astype(int)

    # Apply fallback thresholds if needed
    like_counts = local_df["like"].value_counts()

    if len(like_counts) < 2:
        # Only one class with the configured threshold:
        # try the mean rating of the local examples instead.
        mean_rating = local_df["rating"].mean()
        local_df["like"] = (local_df["rating"] >= mean_rating).astype(int)
        threshold_info["was_overridden"] = True
        threshold_info["final_threshold"] = mean_rating
        like_counts = local_df["like"].value_counts()
        if len(like_counts) < 2:
            # Still a single class (e.g. all ratings equal): give up.
            return pd.DataFrame(), {}

    # Check for severe imbalance (>90% one class)
    min_class_ratio = like_counts.min() / len(local_df)
    if min_class_ratio < 0.1:
        # Only rejected when the minority class has a single example.
        if like_counts.min() < 2:
            return pd.DataFrame(), {}

    # 4. Construct the enhanced feature matrix (including genres)
    feature_labels = [label for label in self.all_labels if label != "like"]

    examples = []
    genre_features_used = set()

    for idx, row in local_df.iterrows():
        item_id = str(int(row["itemId"]))
        tags = self.item_profiles.get(item_id, set())
        genres = self.genre_profiles.get(item_id, set())

        # Create base example with target variables
        example = {
            "movie_id": item_id,
            "rating": row["rating"],
            "like": int(row["like"]),
        }

        # Add tag features (excluding 'like')
        for label in feature_labels:
            example[label] = 1 if label in tags else 0

        # Add genre features dynamically
        for genre in genres:
            genre_feature = f"genre_{genre.lower()}"
            example[genre_feature] = 1
            genre_features_used.add(genre_feature)

            # Also add to feature_labels if not already there
            if genre_feature not in feature_labels:
                feature_labels.append(genre_feature)

        examples.append(example)

    # Ensure all examples have all genre features
    # (a genre first seen on a later row is missing from earlier examples)
    for example in examples:
        for genre_feature in genre_features_used:
            if genre_feature not in example:
                example[genre_feature] = 0

    final_df = pd.DataFrame(examples)

    # Final validation
    if final_df["like"].nunique() < 2:
        return pd.DataFrame(), {}

    # Prepare metadata for visualization
    metadata = {
        "feature_labels": [label for label in feature_labels if label != "like"],
        "genre_features": list(genre_features_used),
        "similarity_scores": dict(similarities[:5]),  # Top 5 similarities
        "target_item_genres": self.genre_profiles.get(str(target_item_id), set()),
        "rating_threshold": threshold_info["final_threshold"],
        "threshold_info": threshold_info,
    }

    return final_df, metadata
|
||||
|
||||
def _get_factual_path_for_item(
    self,
    clf: DecisionTreeClassifier,
    x_item: pd.DataFrame,
    metadata: Dict[str, Any],
) -> Optional[List[str]]:
    """
    Turn the route one item takes through the decision tree into rules.

    Each split node on the route yields one rule: ``<feature> <= <threshold>``
    for the left branch, ``<feature> > <threshold>`` for the right branch.
    Features named ``genre_*`` are phrased as "Has genre" / "Does NOT have
    genre" instead.

    Returns:
        The list of rules, or ``None`` when ``metadata`` carries no
        ``feature_labels`` or the route contains no split.
    """
    feature_labels = metadata.get("feature_labels", [])
    if not feature_labels:
        return None

    # Node ids visited by the first (only) row of x_item, root first.
    path_matrix = clf.decision_path(x_item)
    start = path_matrix.indptr[0]  # type: ignore
    stop = path_matrix.indptr[1]  # type: ignore
    visited = path_matrix.indices[start:stop]  # type: ignore

    tree = clf.tree_
    rules = []

    # Pair every node with its successor; the last node is the leaf and
    # therefore only ever appears as a successor.
    for parent, child in zip(visited[:-1], visited[1:]):
        if tree.feature[parent] == _tree.TREE_UNDEFINED:  # type: ignore
            continue

        feature_name = feature_labels[tree.feature[parent]]  # type: ignore
        # Left child <=> the "<= threshold" test was true for this item.
        went_left = child == tree.children_left[parent]  # type: ignore

        if feature_name.startswith("genre_"):
            genre_name = feature_name.replace("genre_", "").title()
            if went_left:
                rules.append(f"Does NOT have genre: `{genre_name}`")
            else:
                rules.append(f"Has genre: `{genre_name}`")
        else:
            threshold = tree.threshold[parent]  # type: ignore
            comparison = "<=" if went_left else ">"
            rules.append(f"{feature_name} {comparison} {threshold:.2f}")

    if rules:
        return rules
    return None
|
||||
|
||||
def _train_enhanced_decision_tree(
|
||||
self,
|
||||
user_id_consecutive: UserId,
|
||||
item_id: ItemId,
|
||||
user_hist: Set[ItemId],
|
||||
dataset: pd.DataFrame,
|
||||
model=None,
|
||||
data_reader=None,
|
||||
) -> Tuple[Optional[DecisionTreeClassifier], Dict[str, Any]]:
|
||||
"""Enhanced tree training that returns both classifier and metadata"""
|
||||
|
||||
df_examples, metadata = self._get_enhanced_similar_examples(
|
||||
user_id_consecutive, item_id, user_hist, dataset, model, data_reader
|
||||
)
|
||||
|
||||
if df_examples.empty:
|
||||
return None, {}
|
||||
|
||||
like_counts = df_examples["like"].value_counts()
|
||||
|
||||
if len(like_counts) < 2 or like_counts.min() < 2:
|
||||
return None, {}
|
||||
|
||||
feature_labels = metadata.get("feature_labels", [])
|
||||
X = df_examples[feature_labels]
|
||||
y = df_examples["like"]
|
||||
|
||||
# Verify feature matrix has variance
|
||||
feature_variances = X.var()
|
||||
if (feature_variances == 0).all():
|
||||
return None, {}
|
||||
|
||||
clf = DecisionTreeClassifier(
|
||||
max_depth=5, # Slightly deeper to accommodate genre features
|
||||
min_samples_split=max(4, len(df_examples) // 4),
|
||||
min_samples_leaf=2,
|
||||
random_state=42,
|
||||
class_weight="balanced",
|
||||
)
|
||||
|
||||
try:
|
||||
clf.fit(X, y)
|
||||
|
||||
# Enhanced feature importance analysis
|
||||
feature_importance = list(zip(feature_labels, clf.feature_importances_))
|
||||
important_features = [
|
||||
(f, imp) for f, imp in feature_importance if imp > 0.001
|
||||
]
|
||||
genre_important_features = [
|
||||
(f, imp) for f, imp in important_features if f.startswith("genre_")
|
||||
]
|
||||
|
||||
# Add classifier and feature info to metadata
|
||||
metadata.update(
|
||||
{
|
||||
"classifier": clf,
|
||||
"feature_importance": dict(feature_importance),
|
||||
"important_features": important_features,
|
||||
"genre_important_features": genre_important_features,
|
||||
"training_data_size": len(df_examples),
|
||||
"class_distribution": like_counts.to_dict(),
|
||||
}
|
||||
)
|
||||
|
||||
return clf, metadata
|
||||
|
||||
except Exception as _:
|
||||
return None, {}
|
||||
|
||||
def _get_enhanced_explanation_path(
|
||||
self,
|
||||
clf: DecisionTreeClassifier,
|
||||
x_item: pd.DataFrame,
|
||||
metadata: Dict[str, Any],
|
||||
) -> Optional[List[str]]:
|
||||
"""Enhanced explanation path that provides better rule descriptions"""
|
||||
|
||||
if 1 not in clf.classes_:
|
||||
return None
|
||||
|
||||
leaf_id = clf.apply(x_item)[0] # type: ignore
|
||||
class_index = np.where(clf.classes_ == 1)[0]
|
||||
if not class_index.size or clf.tree_.value[leaf_id][0][class_index[0]] == 0: # type: ignore
|
||||
return None
|
||||
|
||||
node_indicator = clf.decision_path(x_item)
|
||||
node_index = node_indicator.indices[ # type: ignore
|
||||
node_indicator.indptr[0] : node_indicator.indptr[ # type: ignore
|
||||
1
|
||||
]
|
||||
]
|
||||
|
||||
rules = []
|
||||
feature_labels = metadata.get("feature_labels", [])
|
||||
|
||||
for i in range(len(node_index) - 1): # Exclude leaf node
|
||||
node_id = node_index[i]
|
||||
next_node_id = node_index[i + 1]
|
||||
|
||||
if clf.tree_.feature[node_id] != _tree.TREE_UNDEFINED: # type: ignore
|
||||
feature_name = feature_labels[clf.tree_.feature[node_id]] # type: ignore
|
||||
threshold = clf.tree_.threshold[node_id] # type: ignore
|
||||
|
||||
# Enhanced rule formatting based on feature type
|
||||
if feature_name.startswith("genre_"):
|
||||
genre_name = feature_name.replace("genre_", "").title()
|
||||
if next_node_id == clf.tree_.children_left[node_id]: # type: ignore
|
||||
rules.append(f"Does NOT have genre: `{genre_name}`")
|
||||
else:
|
||||
rules.append(f"Has genre: `{genre_name}`")
|
||||
else:
|
||||
# Regular tag features
|
||||
if next_node_id == clf.tree_.children_left[node_id]: # type: ignore
|
||||
rules.append(f"{feature_name} <= {threshold}")
|
||||
else:
|
||||
rules.append(f"{feature_name} > {threshold}")
|
||||
|
||||
return rules
|
||||
|
||||
def _generate_enhanced_individual_explanation(
|
||||
self, clf: DecisionTreeClassifier, item_id: ItemId, metadata: Dict[str, Any]
|
||||
) -> Optional[Explanation]:
|
||||
"""Enhanced individual explanation generation"""
|
||||
|
||||
if str(item_id) not in self.item_label_matrix.index:
|
||||
return None
|
||||
|
||||
x_item_full = self.item_label_matrix.loc[[str(item_id)]]
|
||||
feature_labels = metadata.get("feature_labels", [])
|
||||
|
||||
try:
|
||||
# For genre features, we need to dynamically add them to the item
|
||||
item_genres = self.genre_profiles.get(str(item_id), set())
|
||||
|
||||
# Create enhanced item representation
|
||||
enhanced_item_data = x_item_full.copy()
|
||||
|
||||
# Add genre features
|
||||
for genre in item_genres:
|
||||
genre_feature = f"genre_{genre.lower()}"
|
||||
if genre_feature in feature_labels:
|
||||
enhanced_item_data[genre_feature] = 1
|
||||
|
||||
# Ensure all genre features exist (set to 0 if not present)
|
||||
for feature in feature_labels:
|
||||
if (
|
||||
feature.startswith("genre_")
|
||||
and feature not in enhanced_item_data.columns
|
||||
):
|
||||
enhanced_item_data[feature] = 0
|
||||
|
||||
# Select only the features used in training
|
||||
x_item = enhanced_item_data[feature_labels]
|
||||
|
||||
except KeyError as _:
|
||||
return None
|
||||
# Get enhanced factual rule
|
||||
# factual_rule = self._get_enhanced_explanation_path(clf, x_item, metadata)
|
||||
factual_rule = self._get_factual_path_for_item(clf, x_item, metadata)
|
||||
|
||||
if not factual_rule:
|
||||
return None
|
||||
|
||||
# Get counterfactuals (reuse existing method)
|
||||
counterfactual_set = self._get_counterfactual_paths(clf, x_item)
|
||||
if not counterfactual_set:
|
||||
return None
|
||||
|
||||
return (factual_rule, counterfactual_set)
|
||||
|
||||
def _get_counterfactual_paths(
|
||||
self, clf: DecisionTreeClassifier, x_item: pd.DataFrame
|
||||
) -> Optional[CounterfactualSet]:
|
||||
"""Original counterfactual path method (kept for compatibility)"""
|
||||
tree = clf.tree_
|
||||
paths = []
|
||||
|
||||
def find_paths(node_id, current_path):
|
||||
if tree.feature[node_id] == _tree.TREE_UNDEFINED: # type: ignore
|
||||
class_index = np.where(clf.classes_ == 0)[0]
|
||||
if class_index.size and tree.value[node_id][0][class_index[0]] > 0:
|
||||
paths.append(list(current_path))
|
||||
return
|
||||
feature_idx = tree.feature[node_id] # type: ignore
|
||||
threshold = tree.threshold[node_id] # type: ignore
|
||||
current_path.append((feature_idx, "<=", threshold))
|
||||
find_paths(tree.children_left[node_id], current_path) # type: ignore
|
||||
current_path.pop()
|
||||
current_path.append((feature_idx, ">", threshold))
|
||||
find_paths(tree.children_right[node_id], current_path) # type: ignore
|
||||
current_path.pop()
|
||||
|
||||
find_paths(0, [])
|
||||
if not paths:
|
||||
return None
|
||||
|
||||
min_nf = float("inf")
|
||||
counterfactuals = []
|
||||
for path in paths:
|
||||
nf = 0
|
||||
for feature_idx, op, threshold in path:
|
||||
if feature_idx < len(x_item.columns):
|
||||
item_val = x_item.iloc[0, feature_idx]
|
||||
if not (
|
||||
(op == "<=" and item_val <= threshold)
|
||||
or (op == ">" and item_val > threshold)
|
||||
):
|
||||
nf += 1
|
||||
if nf < min_nf:
|
||||
min_nf = nf
|
||||
counterfactuals = [path]
|
||||
elif nf == min_nf:
|
||||
counterfactuals.append(path)
|
||||
|
||||
# Enhanced counterfactual formatting
|
||||
formatted_counterfactuals = []
|
||||
for cf_path in counterfactuals:
|
||||
formatted_path = []
|
||||
for idx, op, _ in cf_path:
|
||||
if idx < len(x_item.columns):
|
||||
feature_name = x_item.columns[idx]
|
||||
if feature_name.startswith("genre_"):
|
||||
genre_name = feature_name.replace("genre_", "").title()
|
||||
if op == "<=":
|
||||
formatted_path.append(
|
||||
f"Does NOT have genre: `{genre_name}`"
|
||||
)
|
||||
else:
|
||||
formatted_path.append(f"Has genre: `{genre_name}`")
|
||||
else:
|
||||
formatted_path.append(f"{feature_name} {op} 0.5")
|
||||
if formatted_path:
|
||||
formatted_counterfactuals.append(formatted_path)
|
||||
|
||||
return formatted_counterfactuals if formatted_counterfactuals else None
|
||||
|
||||
def _aggregate_factual_rules(
|
||||
self, individual_explanations: Dict[UserId, List[str]], total_group_size: int
|
||||
) -> Dict[str, List[str]]:
|
||||
"""
|
||||
Aggregates individual factual rules into a group consensus by finding
|
||||
the rules supported by a majority of members.
|
||||
"""
|
||||
|
||||
# Flatten the list of all rules from all users into a single list
|
||||
all_rules_flat = [
|
||||
rule
|
||||
for rules_list in individual_explanations.values()
|
||||
for rule in rules_list
|
||||
]
|
||||
|
||||
if not all_rules_flat:
|
||||
return {"unanimous": [], "majority": [], "minority": []}
|
||||
|
||||
# Count the occurrences of each rule
|
||||
rule_counts = Counter(all_rules_flat)
|
||||
|
||||
majority_threshold = (total_group_size // 2) + 1 if total_group_size > 1 else 1
|
||||
minority_threshold = 1
|
||||
cleaned_rules_set = self._clean_contradictory_rules(set(rule_counts.keys()))
|
||||
categorized_rules = {"unanimous": [], "majority": [], "minority": []}
|
||||
|
||||
for rule in sorted(list(cleaned_rules_set)):
|
||||
count = rule_counts[rule]
|
||||
rule_with_support = f"{rule} ({count}/{total_group_size} members)"
|
||||
|
||||
if count == total_group_size:
|
||||
categorized_rules["unanimous"].append(rule_with_support)
|
||||
elif count >= majority_threshold:
|
||||
categorized_rules["majority"].append(rule_with_support)
|
||||
elif count >= minority_threshold:
|
||||
categorized_rules["minority"].append(rule_with_support)
|
||||
|
||||
return categorized_rules
|
||||
|
||||
def _clean_contradictory_rules(self, rules_set: Set[str]) -> Set[str]:
|
||||
"""Enhanced contradiction cleaning that handles genre rules"""
|
||||
conditions_by_attr = {}
|
||||
|
||||
for rule in rules_set:
|
||||
# Handle genre rules
|
||||
if "Has genre:" in rule or "Does NOT have genre:" in rule:
|
||||
genre_match = re.search(r"`([^`]+)`", rule)
|
||||
if genre_match:
|
||||
genre = genre_match.group(1)
|
||||
attr = f"genre_{genre}"
|
||||
op = "has" if "Has genre:" in rule else "not_has"
|
||||
conditions_by_attr.setdefault(attr, set()).add(op)
|
||||
else:
|
||||
# Handle regular rules
|
||||
match = re.match(r"(.+?)\s*([<>]=?)\s*(\d+\.?\d*)", rule)
|
||||
if match:
|
||||
attr, op, _ = match.groups()
|
||||
conditions_by_attr.setdefault(attr.strip(), set()).add(op)
|
||||
|
||||
# Find contradictory attributes
|
||||
invalid_attrs = set()
|
||||
for attr, ops in conditions_by_attr.items():
|
||||
if attr.startswith("genre_"):
|
||||
# Genre contradiction: has and not_has same genre
|
||||
if "has" in ops and "not_has" in ops:
|
||||
invalid_attrs.add(attr)
|
||||
else:
|
||||
# Numerical contradiction: <= and >
|
||||
if any(op in ops for op in ["<=", "<"]) and any(
|
||||
op in ops for op in [">", ">="]
|
||||
):
|
||||
invalid_attrs.add(attr)
|
||||
|
||||
# Remove contradictory rules
|
||||
clean_rules = set()
|
||||
for rule in rules_set:
|
||||
is_invalid = False
|
||||
for invalid_attr in invalid_attrs:
|
||||
if invalid_attr.startswith("genre_"):
|
||||
genre = invalid_attr.replace("genre_", "")
|
||||
if f"`{genre}`" in rule:
|
||||
is_invalid = True
|
||||
break
|
||||
else:
|
||||
if invalid_attr in rule:
|
||||
is_invalid = True
|
||||
break
|
||||
|
||||
if not is_invalid:
|
||||
clean_rules.add(rule)
|
||||
|
||||
return clean_rules
|
||||
|
||||
def find_explanation(
    self,
    recommended_items: List[ItemId],
    members: List[UserId],
    user_hist: Dict[UserId, Set[ItemId]],
    dataset: pd.DataFrame,
    model=None,
    data_reader=None,
) -> Dict[str, Any]:
    """Explain each recommended item for the group and report fidelity.

    For every item a decision tree is trained per member; each member's
    factual rules and counterfactuals are collected and the factual rules are
    aggregated into a group rule set.

    Args:
        recommended_items: Items to explain.
        members: Group member ids.
        user_hist: Maps a member id to the items in that member's history.
        dataset: Interaction data passed on to tree training.
        model: Optional model passed on to tree training.
        data_reader: Object providing ``get_new_user_id``; required.

    Returns:
        ``{"fidelity": float, "details": {item_id: explanation}}`` where
        fidelity is the share of recommended items that received an
        explanation.

    Raises:
        ValueError: If ``data_reader`` is ``None``.
    """
    if data_reader is None:
        raise ValueError(
            "A 'data_reader' object must be provided to find explanations."
        )

    if not recommended_items:
        return {"fidelity": 0.0, "details": {}}

    details = {}
    explained = 0

    for item_id in recommended_items:
        rules_by_member = {}
        counterfactuals_by_member = {}
        first_path = None
        threshold_info = None
        # (classifier, metadata) of the first member with a full explanation;
        # exposed for visualisation.
        viz_source = None

        for user_id in members:
            clf, metadata = self._train_enhanced_decision_tree(
                data_reader.get_new_user_id(user_id),
                item_id,
                user_hist.get(user_id, set()),
                dataset,
                model,
                data_reader,
            )
            if not (clf and metadata):
                continue

            if threshold_info is None and "threshold_info" in metadata:
                threshold_info = metadata["threshold_info"]

            explanation = self._generate_enhanced_individual_explanation(
                clf, item_id, metadata
            )
            if not explanation:
                continue

            factual, counterfactual = explanation
            rules_by_member[user_id] = factual
            counterfactuals_by_member[user_id] = counterfactual

            if first_path is None:
                first_path = factual
            if viz_source is None:
                viz_source = (clf, metadata)

        factual_set = self._aggregate_factual_rules(rules_by_member, len(members))

        # NOTE(review): ``_aggregate_factual_rules`` appears to always return
        # a non-empty dict, which would make ``factual_set`` always truthy
        # here - confirm whether "has at least one rule" was intended.
        if first_path and factual_set:
            explained += 1

            entry = {
                "decision_path": first_path,
                "group_factual_rule": factual_set,
                "individual_counterfactuals": counterfactuals_by_member,
            }
            if threshold_info:
                entry["threshold_info"] = threshold_info

            if viz_source is not None:
                viz_clf, viz_metadata = viz_source
                entry["decision_tree"] = viz_clf
                entry["feature_names"] = viz_metadata.get("feature_labels", [])
                entry["tree_metadata"] = viz_metadata
                entry["item_genres"] = self.genre_profiles.get(str(item_id), set())

            details[item_id] = entry

    fidelity = explained / len(recommended_items)

    logging.info(
        f"Enhanced fidelity for {members}: {fidelity:.3f} ({explained}/{len(recommended_items)})"
    )

    return {"fidelity": fidelity, "details": details}
|
||||
@@ -0,0 +1,314 @@
|
||||
"""Rule-based group recommendation explainer module."""
|
||||
|
||||
from typing import Dict, List, Optional, Set, Union
|
||||
import logging
|
||||
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
from pygrex.utils.association_rules import AssociationRules
|
||||
|
||||
# Type aliases for better readability

# Identifier of a recommendable item; either a string or an integer.
ItemId = Union[str, int]
# Identifier of a group member; either a string or an integer.
MemberId = Union[str, int]
# Maps each member to the set of items that member has interacted with.
UserHistory = Dict[MemberId, Set[ItemId]]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RuleBasedGroupRecExplainer:
    """
    Explain group recommendations with association rules.

    A recommended item counts as explained when at least one rule that has
    the item in its consequents is supported by the group members'
    interaction histories. Two notions of support are offered: the basic one
    (``find_explanation``) and a stricter group-level one
    (``compute_group_fidelity_advanced``).

    Histories are compared with rule antecedents in string form, so
    antecedent items are expected to be strings.
    """

    def __init__(
        self,
        rules: AssociationRules,
        data: DataReader,
        pool_recommendations: Optional[Union[List[ItemId], ItemId]] = None,
        members: Optional[List[MemberId]] = None,
        user_history: Optional[UserHistory] = None,
        min_members_threshold: int = 1,
    ) -> None:
        """
        Initialize the RuleBasedGroupRecExplainer.

        Args:
            rules: The association rules used for explanations. They are
                accessed like a table with ``antecedents`` and ``consequents``
                columns.
            data: Data reader whose ``get_new_item_id`` maps a recommended
                item id to the id used inside the rules.
            pool_recommendations: A list of item IDs to explain, or a single item ID.
            members: A list of member IDs in the group.
            user_history: A dictionary mapping member IDs to sets of item IDs
                they have interacted with.
            min_members_threshold: Minimum number of members that must satisfy
                the rule for it to be considered valid.

        Raises:
            ValueError: If min_members_threshold is less than 1.
        """
        if min_members_threshold < 1:
            raise ValueError("min_members_threshold must be at least 1")

        self.rules = rules
        self.members = members or []
        self.min_members_threshold = min_members_threshold
        self.user_history = user_history or {}
        self.data = data

        # Normalize pool_recommendations to always be a list
        self.pool_recommendations = self._normalize_recommendations(
            pool_recommendations
        )

    def _normalize_recommendations(
        self, recommendations: Optional[Union[List[ItemId], ItemId]]
    ) -> List[ItemId]:
        """
        Normalize recommendations input to a list format.

        Args:
            recommendations: Single item ID, collection of item IDs, or None.

        Returns:
            List of item IDs. A list argument is returned as-is; any other
            iterable (tuple, set, array, ...) is copied into a list; a
            scalar - including non-``int`` scalars such as NumPy integers -
            is wrapped in a one-element list.
        """
        if recommendations is None:
            return []

        if isinstance(recommendations, list):
            return recommendations

        # Strings are iterable but represent a single id.
        is_scalar = isinstance(recommendations, (str, bytes)) or not hasattr(
            recommendations, "__iter__"
        )
        if is_scalar:
            return [recommendations]

        return list(recommendations)

    def _history_as_strings(self, member: MemberId) -> Set[str]:
        """Return the member's history with every item converted to ``str``."""
        return {str(item) for item in self.user_history.get(member, set())}

    def _is_rule_satisfied_by_member(
        self, member: MemberId, antecedent: Set[ItemId]
    ) -> bool:
        """
        Check if a member satisfies the rule's antecedent.

        Args:
            member: The member ID to check.
            antecedent: The set of items that form the rule's antecedent.

        Returns:
            True if the member's history contains all items in the antecedent.
        """
        return self._history_as_strings(member).issuperset(antecedent)

    def _count_satisfied_members(self, antecedent: Set[ItemId]) -> int:
        """
        Count how many members satisfy the given antecedent.

        Args:
            antecedent: The set of items that form the rule's antecedent.

        Returns:
            Number of members whose history satisfies the antecedent.
        """
        return sum(
            1
            for member in self.members
            if self._is_rule_satisfied_by_member(member, antecedent)
        )

    def _find_applicable_rules(self, item_id: ItemId):
        """
        Find rules that have the given item in their consequents.

        Args:
            item_id: The item ID to find rules for.

        Returns:
            DataFrame containing applicable rules.
        """
        # Rules are looked up by the id returned from the data reader.
        target = str(self.data.get_new_item_id(item_id))  # type: ignore

        return self.rules[  # type: ignore
            self.rules["consequents"].apply(lambda consequents: target in consequents)  # type: ignore
        ]

    def _iter_fired_rules(self, item_id: ItemId):
        """Yield ``(rule, satisfied_count)`` for each rule meeting the member threshold."""
        for _, rule in self._find_applicable_rules(item_id).iterrows():
            satisfied_count = self._count_satisfied_members(rule["antecedents"])
            if satisfied_count >= self.min_members_threshold:
                yield rule, satisfied_count

    def find_explanation(self) -> float:
        """
        Generate explanations for the group recommendations based on the rules.

        Returns:
            The fidelity of the explanations, which is the ratio of explained
            recommendations to total recommendations in the pool.
        """
        if not self.pool_recommendations:
            logger.warning("No recommendations to explain")
            return 0.0

        total_recommendations = len(self.pool_recommendations)
        explained_count = sum(
            1 for item_id in self.pool_recommendations if self._can_explain_item(item_id)
        )

        fidelity = explained_count / total_recommendations
        logger.info(
            f"Explained {explained_count}/{total_recommendations} recommendations "
            f"(fidelity: {fidelity:.3f})"
        )

        return fidelity

    def _can_explain_item(self, item_id: ItemId) -> bool:
        """
        Check if an item can be explained by any rule.

        Args:
            item_id: The item ID to check.

        Returns:
            True if at least one rule can explain the item.
        """
        # The first fired rule is enough; stop searching as soon as one appears.
        for _ in self._iter_fired_rules(item_id):
            logger.debug(f"Rule fired for item {item_id}")
            return True

        return False

    def get_explanation_details(self) -> Dict[ItemId, List[Dict]]:
        """
        Get detailed explanations for each recommendation.

        Returns:
            Dictionary mapping item IDs to lists of applicable rule details.
            Each detail holds the rule's ``antecedent`` and ``consequent``,
            the number of ``satisfied_members``, and the rule's
            ``confidence`` and ``support`` (``"N/A"`` when absent).
        """
        explanations = {}

        for item_id in self.pool_recommendations:
            explanations[item_id] = [
                {
                    "antecedent": rule["antecedents"],
                    "consequent": rule["consequents"],
                    "satisfied_members": satisfied_count,
                    "confidence": rule.get("confidence", "N/A"),
                    "support": rule.get("support", "N/A"),
                }
                for rule, satisfied_count in self._iter_fired_rules(item_id)
            ]

        return explanations

    def compute_group_fidelity_advanced(self) -> float:
        """
        Compute group fidelity using advanced conditions.

        This method implements a more sophisticated fidelity calculation where:
        - Condition 1: Each member of the group must have seen at least one item from the antecedent
        - Condition 2: Each item in the antecedent must have been seen by at least one member

        Returns:
            The fidelity score as a float between 0 and 1.
        """
        if not self.pool_recommendations:
            logger.warning("No recommendations to explain")
            return 0.0

        if not self.members:
            logger.warning("No group members defined")
            return 0.0

        total_recommendations = len(self.pool_recommendations)

        # Distinct members, and the union of everything any of them has seen.
        members_set = set(self.members)
        all_seen_items = set()
        for member in members_set:
            all_seen_items.update(self._history_as_strings(member))

        explained_count = sum(
            1
            for item_id in self.pool_recommendations
            if self._can_explain_item_advanced(item_id, members_set, all_seen_items)
        )

        fidelity = explained_count / total_recommendations
        logger.info(
            f"Advanced explanation: {explained_count}/{total_recommendations} recommendations "
            f"(fidelity: {fidelity:.3f})"
        )

        return fidelity

    def _can_explain_item_advanced(
        self, item_id: ItemId, members_set: Set[MemberId], all_seen_items: Set[str]
    ) -> bool:
        """
        Check if an item can be explained using advanced conditions.

        Args:
            item_id: The item ID to check.
            members_set: Set of group member IDs.
            all_seen_items: Set of all items seen by any group member.

        Returns:
            True if the item can be explained by at least one rule satisfying both conditions.
        """
        applicable_rules = self._find_applicable_rules(item_id)

        for _, rule in applicable_rules.iterrows():
            antecedent = rule["antecedents"]

            # Condition 1: Each member must have seen at least one item from the antecedent
            cond1 = all(
                self._member_has_antecedent_item(member, antecedent)
                for member in members_set
            )

            # Condition 2: Each item in the antecedent must have been seen by at least one member
            cond2 = antecedent.issubset(all_seen_items)

            if cond1 and cond2:
                logger.debug(f"Advanced rule fired for item {item_id}")
                return True

        return False

    def _member_has_antecedent_item(
        self, member: MemberId, antecedent: Set[ItemId]
    ) -> bool:
        """
        Check if a member has seen at least one item from the antecedent.

        Args:
            member: The member ID to check.
            antecedent: The set of items in the rule's antecedent.

        Returns:
            True if the member has seen at least one item from the antecedent.
        """
        return not antecedent.isdisjoint(self._history_as_strings(member))
|
||||
@@ -0,0 +1,434 @@
|
||||
import itertools
|
||||
from typing import Dict, List, Sequence, Union
|
||||
|
||||
from pygrex.data_reader import DataReader, GroupInteractionHandler
|
||||
from pygrex.models import RecommenderModel
|
||||
from pygrex.recommender import GroupRecommender
|
||||
from pygrex.utils import SlidingWindowRanker, SlidingWindow, AggregationStrategy
|
||||
|
||||
|
||||
class SlidingWindowExplainer:
|
||||
"""
|
||||
Stratigi, M., Bikakis, N., Stefanidis, K.: Counterfactual explanations for group
|
||||
recommendations. In: Proceedings of the 27th International Workshop on Design,
|
||||
Optimization, Languages and Analytical Processing of Big Data (DOLAP 2025).
|
||||
|
||||
A class that uses a sliding window approach to find counterfactual explanations
|
||||
for group recommendation systems.
|
||||
|
||||
This class helps identify which items, if removed from the group's interaction history,
|
||||
would cause a specific target item to no longer appear in the group recommendations.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    config,
    data: DataReader,
    group_handler: GroupInteractionHandler,
    members: List[Union[str, int]],
    target_item: Union[str, int],
    model: RecommenderModel,
    aggregation_strategy: AggregationStrategy = AggregationStrategy.AVG_PREDICTIONS,
    window_size: int = 3,
):
    """
    Initialize the SlidingWindowExplainer.

    Args:
        config: Configuration object with model parameters
        data: DataReader object containing the dataset
        group_handler: Object that handles group data modifications
        members: List of user IDs in the group
        target_item: The item ID for which explanation is sought
        model: Recommender model to use for predictions
        aggregation_strategy: Strategy to aggregate individual recommendations
        window_size: Size of the sliding window
    """
    self.cfg = config
    self.data = data
    self.group_handler = group_handler
    self.members = members
    self.target_item = target_item
    self.model = model
    self.aggregation_strategy = aggregation_strategy
    self.window_size = window_size

    # Assigned later through ``set_sliding_window``; defined here so the
    # attribute always exists on a freshly built explainer.
    self.sliding_window = None

    # Results tracking
    self.explanations_found: Dict[int, Dict] = {}
    self.calls = 0  # window-removal tests performed so far
    self.max_calls = 1000  # upper bound on window-removal tests per search
    self.item_metrics = {}
|
||||
|
||||
def set_sliding_window(self, sliding_window) -> None:
    """Attach a sliding window object to this explainer.

    Args:
        sliding_window: The object to store as ``self.sliding_window``.
    """
    setattr(self, "sliding_window", sliding_window)
|
||||
|
||||
def set_item_metrics(self, metrics: Dict[Union[str, int], Dict[str, float]]) -> None:
    """Replace the stored per-item metric scores.

    Args:
        metrics: Maps an item id to that item's metric name/score pairs.
    """
    setattr(self, "item_metrics", metrics)
|
||||
|
||||
def find_explanation(
    self,
    items_rated_by_group: List[Union[str, int]],
    group_predictions: Dict,
    top_recommendation: Union[str, int],
    ranking_weights: Dict[str, float],
) -> Dict[int, Dict]:
    """
    Search for a counterfactual explanation of ``self.target_item``.

    The group's rated items are ranked, then scanned window by window. The
    first window whose removal drops the target item from the group's
    recommendations is handed to ``self._find_minimal_subset`` and the search
    stops.

    Args:
        items_rated_by_group: All items rated by any member of the group.
        group_predictions: The original individual predictions from the recommender.
        top_recommendation: The original top recommended item.
        ranking_weights: The weights from the UI for each ranking component.

    Returns:
        ``self.explanations_found``, the dictionary of found explanations.
    """
    # NOTE(review): the call counter is reset here but
    # ``self.explanations_found`` is not - confirm whether results are meant
    # to accumulate across calls.
    self.calls = 0

    item_ranker = SlidingWindowRanker(config={})
    item_ranker.set_group_recommender_values(group_predictions, top_recommendation)
    ranked_items, self.item_metrics = item_ranker.generate_ranked_items(
        all_rated_items=items_rated_by_group,
        data=self.data,
        group_members=self.members,
        component_weights=ranking_weights,
    )

    window_source = SlidingWindow(
        sequence=ranked_items, window_size=self.window_size
    )

    explanation_found = False
    while True:
        candidate_window = window_source.get_next_window()

        # Stop when the windows run out, after the first hit, or once the
        # call budget is spent.
        if candidate_window is None:
            break
        if explanation_found or self.calls >= self.max_calls:
            break

        self.calls += 1

        # Does removing this window push the target out of the recommendations?
        if self._test_window_removal(candidate_window, self.target_item):
            explanation_found = True
            # Look for minimal subsets within this window
            self._find_minimal_subset(candidate_window, self.target_item)

    if not explanation_found:
        print("Explanation could not be found")

    return self.explanations_found
|
||||
|
||||
def _test_window_removal(
|
||||
self, item_ids: List[Union[str, int]], original_group_rec: Union[str, int]
|
||||
) -> bool:
|
||||
"""
|
||||
Test if removing the given items affects the group recommendation.
|
||||
|
||||
Args:
|
||||
item_ids: List of item IDs to remove from group interactions
|
||||
original_group_rec: The original recommendation to compare against
|
||||
|
||||
Returns:
|
||||
bool: True if removing these items changes recommendations, False otherwise
|
||||
"""
|
||||
|
||||
# Get new recommendations after removing items
|
||||
group_recommendation = self._get_recommendations_after_removal(item_ids)
|
||||
|
||||
# Check if target item is still in recommendations
|
||||
|
||||
return original_group_rec not in group_recommendation
|
||||
|
||||
def _get_recommendations_after_removal(
    self, item_ids: List[Union[str, int]], top_n: int = 10
) -> Sequence[Union[str, int]]:
    """
    Recompute the group's recommendations as if some items had never been interacted with.

    Args:
        item_ids: Item IDs to remove from the group's interactions
        top_n: How many recommendations to request from the recommender

    Returns:
        The recommendation list produced by the retrained recommender, or an
        empty list when the recommender does not return a list
    """
    # Interaction data without the group's interactions on `item_ids`
    reduced_dataset = self.group_handler.create_modified_dataset(
        original_data=self.data.dataset,
        group_ids=self.members,
        item_ids=item_ids,
        data=self.data,
    )

    # Wrap the reduced data in a fresh reader and refit the model on it
    retrained_data = self._create_data_reader_and_prepare(reduced_dataset)
    retrained_model = self._retrain_model(retrained_data)

    # Ask a new group recommender, bound to the retrained model, for the top items
    recommender = GroupRecommender(retrained_data)
    recommender.setup_recommendation(
        retrained_model,
        self.members,
        retrained_data,
        aggregation_strategy=self.aggregation_strategy,
    )
    result = recommender.get_group_recommendations(top_n)

    return result if isinstance(result, list) else []
|
||||
|
||||
def _create_data_reader_and_prepare(self, changed_data):
    """
    Wrap a modified interaction table in a prepared DataReader.

    Args:
        changed_data: DataFrame holding the modified dataset

    Returns:
        DataReader: a new reader built from `changed_data`, with consecutive
        IDs assigned and ratings binarized at threshold 1
    """
    prepared_reader = DataReader(
        dataframe=changed_data,
        skiprows=0,
        names=None,
        sep=None,
        filepath_or_buffer=None,
    )

    # NOTE: an earlier variant dropped the first row at this point
    # (dataset.iloc[1:].reset_index(drop=True)); that step is disabled.

    prepared_reader.make_consecutive_ids_in_dataset()
    prepared_reader.binarize(binary_threshold=1)
    return prepared_reader
|
||||
|
||||
def _retrain_model(self, data):
    """
    Refit the explainer's model on the given data.

    Args:
        data: Prepared DataReader object with the modified dataset

    Returns:
        The same model object held in `self.model`, after `fit(data)` was called on it
    """
    # The model is refitted in place; no copy is made.
    model = self.model
    model.fit(data)
    return model
|
||||
|
||||
def _find_minimal_subset(
    self, big_window: List[Union[str, int]], original_group_rec: Union[str, int]
) -> None:
    """
    Find the smallest subset of a window that acts as a counterfactual explanation.

    Subsets are tried in order of increasing size (all 1-item subsets, then all
    2-item subsets, ...). The search stops at the first subset whose removal
    makes `original_group_rec` disappear from the recommendations, or when the
    call budget is exhausted. A found subset is passed to `_record_explanation`.

    Args:
        big_window: List of item IDs to search within
        original_group_rec: The original recommendation to compare against

    Note:
        The budget test is `self.calls > self.max_calls` and runs before the
        call is counted, so the counter may end one above `max_calls`.
    """
    # Lazily enumerate every non-empty subset, smallest first.
    candidate_subsets = itertools.chain.from_iterable(
        itertools.combinations(big_window, length)
        for length in range(1, len(big_window) + 1)
    )

    for item_combo in candidate_subsets:
        if self.calls > self.max_calls:
            return

        subset_items = list(item_combo)
        self.calls += 1

        new_recommendations = self._get_recommendations_after_removal(subset_items)
        if original_group_rec in new_recommendations:
            continue

        # The retrained recommender may return nothing at all; in that case
        # there is no replacement item to report.
        replacement = new_recommendations[0] if len(new_recommendations) > 0 else None
        self._record_explanation(subset_items, original_group_rec, replacement)
        return
|
||||
|
||||
def _record_explanation(
    self,
    explanation_items: List[Union[str, int]],
    original_rec: Union[str, int],
    new_rec: Union[str, int],
) -> None:
    """
    Print a found explanation and store it under the current call count.

    The entry is written to `self.explanations_found[self.calls]` with the keys
    "items", "new_rec" and "metrics"; "metrics" maps each explanation item to
    its entry in `self.item_metrics` (an empty dict when the item has none).

    Args:
        explanation_items: Items that form the counterfactual explanation
        original_rec: Original recommendation
        new_rec: New top recommendation after removing explanation items
    """
    print(
        f"If the group had not interacted with these items {explanation_items},\n"
        f"the item of interest {original_rec} would not have appeared on the recommendation list;\n"
        f"instead, {new_rec} would have been recommended."
    )

    # The item/user intensity scores and the explanation size used to be
    # computed here but were never stored or printed, so they are no longer
    # calculated on every recorded explanation.
    explanation_metrics = {
        item: self.item_metrics.get(item, {}) for item in explanation_items
    }

    self.explanations_found[self.calls] = {
        "items": explanation_items,
        "new_rec": new_rec,
        "metrics": explanation_metrics,
    }
|
||||
|
||||
def _calculate_item_intensity(self, items: List[Union[str, int]]) -> List[float]:
    """
    Average item intensity of an explanation for this explainer's group.

    Delegates to `_calculate_average_item_intensity_score` using the group
    members and data held on the instance.

    Args:
        items: List of item IDs in the explanation

    Returns:
        List with one average intensity score per item
    """
    return self._calculate_average_item_intensity_score(
        explanation=items,
        members=self.members,
        data=self.data,
    )
|
||||
|
||||
def _calculate_user_intensity(self, items: List[Union[str, int]]) -> List[float]:
    """
    User intensity of an explanation for this explainer's group.

    Delegates to `_calculate_user_intensity_score` using the group members and
    data held on the instance.

    Args:
        items: List of item IDs in the explanation

    Returns:
        List with one intensity score per group member
    """
    return self._calculate_user_intensity_score(
        explanation_items=items,
        members=self.members,
        data=self.data,
    )
|
||||
|
||||
@staticmethod
def _calculate_average_item_intensity_score(
    explanation: List[Union[str, int]],
    members: List[Union[str, int]],
    data: DataReader,
) -> List[float]:
    """
    Calculate the average item intensity for a counterfactual explanation.

    Average item intensity is the number of dataset rows that link an
    explanation item to any group member, divided by the group size.

    Args:
        explanation: The counterfactual explanation items.
        members: User IDs of the group members.
        data: DataReader object containing the dataset and ID mapping methods.

    Returns:
        list: Average intensity for each item in the explanation, in the same
        order. All zeros when `members` is empty.
    """
    group_size = len(members)
    if group_size == 0:
        # No members means no interactions; avoid dividing by zero.
        return [0.0 for _ in explanation]

    # Convert user IDs to internal representation. The mapper may hand back a
    # list; the first entry is used, and an empty list (no mapping) is skipped.
    internal_group_ids = []
    for user_id in members:
        new_user_id = data.get_new_user_id(user_id)
        if isinstance(new_user_id, list):
            if new_user_id:
                internal_group_ids.append(int(new_user_id[0]))
        else:
            internal_group_ids.append(int(new_user_id))

    # The membership filter does not depend on the item, so apply it once.
    dataset = data.dataset
    group_rows = dataset[dataset.userId.isin(internal_group_ids)]

    item_intensities = []
    for item_id in explanation:
        internal_item_id = data.get_new_item_id(item_id)
        interactions_count = int((group_rows.itemId == internal_item_id).sum())
        item_intensities.append(interactions_count / group_size)

    return item_intensities
|
||||
|
||||
@staticmethod
def _calculate_user_intensity_score(
    explanation_items: List[Union[str, int]],
    members: List[Union[str, int]],
    data: DataReader,
) -> List[float]:
    """
    Calculate each user's interaction intensity with the items of an explanation.

    Intensity is the number of dataset rows linking the user to any explanation
    item, divided by the number of explanation items.

    Args:
        explanation_items: List of item IDs in the explanation
        members: List of user IDs to calculate intensity for
        data: DataReader object containing the dataset and ID mapping methods

    Returns:
        List of interaction intensities, one per user, in the same order as
        `members`. All zeros when `explanation_items` is empty. A user whose
        ID maps to an empty list gets 0.0.

    Notes:
        The value stays within [0, 1] only if the dataset holds at most one
        row per (user, item) pair.
    """
    num_explanation_items = len(explanation_items)
    if num_explanation_items == 0:
        # Nothing to interact with; avoid dividing by zero.
        return [0.0 for _ in members]

    # Convert external item IDs to internal IDs
    internal_item_ids = [
        data.get_new_item_id(item_id) for item_id in explanation_items
    ]

    # The item filter does not depend on the user, so apply it once.
    dataset = data.dataset
    explanation_rows = dataset[dataset.itemId.isin(internal_item_ids)]

    user_intensities = []
    for member in members:
        internal_user_id = data.get_new_user_id(member)

        # Same normalisation as the item-intensity helper: the mapper may
        # return a list, of which the first entry is the internal ID.
        if isinstance(internal_user_id, list):
            if not internal_user_id:
                user_intensities.append(0.0)
                continue
            internal_user_id = internal_user_id[0]

        user_interactions_count = int(
            (explanation_rows.userId == internal_user_id).sum()
        )
        user_intensities.append(user_interactions_count / num_explanation_items)

    return user_intensities
|
||||
@@ -0,0 +1,11 @@
|
||||
from .model_based_emf import EMFExplainer
|
||||
from .model_based_als_explain import ALSExplainer
|
||||
from .post_hoc_association_rules import ARPostHocExplainer
|
||||
from .post_hoc_knn import KNNPostHocExplainer
|
||||
|
||||
__all__ = [
|
||||
"EMFExplainer",
|
||||
"ALSExplainer",
|
||||
"ARPostHocExplainer",
|
||||
"KNNPostHocExplainer",
|
||||
]
|
||||
@@ -0,0 +1,49 @@
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
class Explainer(ABC):
    """
    Base class for explainers that attach an explanation to each recommendation.

    Subclasses implement `explain_recommendation_to_user`; this class supplies
    the loop over the recommendation table and access to a user's items.
    """

    def __init__(self, model, recommendations, data):
        """
        :param model: the recommender model being explained
        :param recommendations: table with `userId` and `itemId` columns
        :param data: object exposing `dataset`, `num_item` and `num_user`
        """
        self.model = model
        self.recommendations = recommendations
        self.dataset = data.dataset
        self.num_items = data.num_item
        self.num_users = data.num_user
        # Interactions grouped per user, used by get_user_items
        self.users = self.dataset.groupby(by="userId")

    def explain_recommendations(self):
        """
        Explain every row of the recommendation table.

        :return: the recommendation table itself, with an added
            "explanations" column holding one explanation per row
        """
        n_rows = self.recommendations.shape[0]
        collected = []

        with tqdm(total=n_rows, desc="Computing explanations: ") as pbar:
            for _, rec in self.recommendations.iterrows():
                user, item = int(rec.userId), int(rec.itemId)
                collected.append(self.explain_recommendation_to_user(user, item))
                pbar.update()

        self.recommendations["explanations"] = collected
        return self.recommendations

    def get_user_items(self, user_id):
        """
        Item ids found in the dataset rows of a user.
        :param user_id: the user
        :return: array of item ids
        """
        user_rows = self.users.get_group(user_id)
        return user_rows.itemId.values

    @abstractmethod
    def explain_recommendation_to_user(
        self, user_id: int, item_id: int
    ) -> Dict[str, Any]:
        """
        Generates an explanation for a single user-item recommendation.
        This method must be implemented by any subclass.
        """
        raise NotImplementedError
|
||||
@@ -0,0 +1,51 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from .explainer import Explainer
|
||||
|
||||
|
||||
class ALSExplainer(Explainer):
    """Explains an ALS recommendation through the user's own item interactions."""

    def __init__(self, model, recommendations, data, number_of_contributions=10):
        """
        :param model: model exposing `item_embedding()`, `reg_term` and `latent_dim`
        :param recommendations: table with `userId` and `itemId` columns
        :param data: object exposing `dataset`, `num_item` and `num_user`
        :param number_of_contributions: how many top contributing items to return
        """
        super().__init__(model, recommendations, data)
        self.number_of_contributions = number_of_contributions

    def explain_recommendation_to_user(self, user_id: int, item_id: int):
        """
        Measuring the contribution of each item to the recommendation.

        :param user_id: the user who received the recommendation
        :param item_id: the recommended item
        :return: dict with two pandas Series, "item" and "contribution", holding
            the `number_of_contributions` previously interacted items with the
            highest contribution, in descending order of contribution.
        """
        # Look these up once; the original recomputed them several times.
        user_items = self.get_user_items(user_id)
        item_factors = np.asarray(self.model.item_embedding())

        # Y^T C_u Y with a 0/1 confidence diagonal only involves the rows of
        # the items the user interacted with, so select those rows instead of
        # building a num_items x num_items diagonal matrix.
        interacted = np.zeros(self.num_items, dtype=bool)
        interacted[user_items] = True
        user_rows = item_factors[interacted].astype(np.float64)

        gram = user_rows.T @ user_rows
        gram = gram + self.model.reg_term * np.eye(self.model.latent_dim)

        if len(user_items) > 1:
            weight_mtr = np.linalg.inv(gram)
        else:
            weight_mtr = np.linalg.pinv(gram)

        # Similarity of each interacted item to the recommended one under the
        # user-specific weighting: y_j^T W_u y_rec
        sim_to_rec_id = item_factors[user_items] @ (
            weight_mtr @ item_factors[item_id, :]
        )

        contribution = pd.DataFrame(
            {"item": user_items, "contribution": sim_to_rec_id}
        ).sort_values(by=["contribution"], ascending=False)

        top = contribution.iloc[: self.number_of_contributions]
        return {
            "item": top["item"],
            "contribution": top["contribution"],
        }
|
||||
@@ -0,0 +1,28 @@
|
||||
from .explainer import Explainer
|
||||
|
||||
|
||||
class EMFExplainer(Explainer):
    """Explains a recommendation by how similar users rated the recommended item."""

    def __init__(self, model, recommendations, data):
        """
        :param model: model exposing `sim_users`, indexable by user id
        :param recommendations: table with `userId` and `itemId` columns
        :param data: object exposing `dataset`, `num_item` and `num_user`
        """
        super().__init__(model, recommendations, data)

    def explain_recommendation_to_user(self, user_id: int, item_id: int):
        """
        Count the ratings that the user's similar users gave to the item.

        :param user_id: the user who received the recommendation
        :param item_id: recommendation
        :return: dict mapping each rating value to the number of similar users'
            rows on `item_id` with that rating (empty when there are none).
        """
        ratings_on_item = self.dataset[self.dataset.itemId == item_id]
        similar_users = self.model.sim_users[user_id]
        neighbour_ratings = ratings_on_item[
            ratings_on_item.userId.isin(similar_users)
        ]

        counts_per_rating = neighbour_ratings.groupby(by="rating").count()

        # Take the first count column by position explicitly; plain `row[0]`
        # on a label-indexed Series is deprecated positional access.
        return {
            rating: counts.iloc[0] for rating, counts in counts_per_rating.iterrows()
        }
|
||||
@@ -0,0 +1,79 @@
|
||||
from typing import Any, Dict
|
||||
from mlxtend.preprocessing import TransactionEncoder
|
||||
from mlxtend.frequent_patterns import apriori, association_rules
|
||||
import pandas as pd
|
||||
|
||||
from .explainer import Explainer
|
||||
|
||||
|
||||
class ARPostHocExplainer(Explainer):
    """Post-hoc explainer based on association rules mined from user item sets."""

    def __init__(
        self,
        model,
        recommendations,
        data,
        min_support=0.1,
        max_len=2,
        metric="lift",
        min_threshold=0.1,
        min_confidence=0.1,
        min_lift=0.1,
    ):
        """
        :param model: the recommender model being explained
        :param recommendations: table with `userId` and `itemId` columns
        :param data: object exposing `dataset`, `num_item` and `num_user`
        :param min_support: minimum support passed to `apriori`
        :param max_len: maximum itemset length passed to `apriori`
        :param metric: metric passed to `association_rules`
        :param min_threshold: threshold on `metric` passed to `association_rules`
        :param min_confidence: rules must have confidence strictly above this
        :param min_lift: rules must have lift strictly above this
        """
        super().__init__(model, recommendations, data)
        self.AR = None
        self.min_support = min_support
        self.max_len = max_len
        self.metric = metric
        self.min_threshold = min_threshold
        self.min_confidence = min_confidence
        self.min_lift = min_lift

        # Filled lazily by compute_association_rules
        self.rules: pd.DataFrame | None = None

    def get_rules_for_getting(self, item_id: int) -> pd.DataFrame:
        """Return the mined rules whose consequent is `item_id`, mining them on first use."""
        if self.rules is None:
            self.compute_association_rules()

        if self.rules is None:
            return pd.DataFrame()
        return self.rules[self.rules.consequents == item_id]

    def compute_association_rules(self):
        """
        Mine association rules from the dataset and store them in `self.rules`.

        The stored frame has the columns "consequents", "antecedents" and
        "confidence". Only the first element of each antecedent/consequent set
        is kept, which is lossless for the default `max_len=2`.
        """
        # One transaction per user, built in a single pass over the dataset
        # rather than one full scan per user.
        item_sets = (
            self.dataset.groupby("userId", sort=False)["itemId"].apply(list).tolist()
        )

        te = TransactionEncoder()
        te_ary = te.fit(item_sets).transform(item_sets)

        # The te_ary object is a NumPy array, which is a valid input for a DataFrame.
        # Pylance may raise a false positive here due to incomplete type stubs for mlxtend.
        df = pd.DataFrame(te_ary.astype(bool), columns=te.columns_)  # type: ignore

        frequent_itemsets = apriori(
            df, min_support=self.min_support, use_colnames=True, max_len=self.max_len
        )

        # Honour the configured metric (previously hard-coded to "lift").
        rules = association_rules(
            frequent_itemsets, metric=self.metric, min_threshold=self.min_threshold
        )

        # Copy the filtered rows so the column assignments below write to an
        # independent frame instead of a slice of `rules`.
        rules = rules[
            (rules["confidence"] > self.min_confidence)
            & (rules["lift"] > self.min_lift)
        ].copy()

        rules["consequents"] = rules["consequents"].apply(lambda x: list(x)[0])
        rules["antecedents"] = rules["antecedents"].apply(lambda x: list(x)[0])

        self.rules = rules[["consequents", "antecedents", "confidence"]]

    def explain_recommendation_to_user(
        self, user_id: int, item_id: int
    ) -> Dict[str, Any]:
        """
        Explain a recommendation by the user's items that imply it.

        :return: dict with key "antecedents": the set of items of the user that
            appear as antecedent of a rule whose consequent is `item_id`.
        """
        user_ratings = self.get_user_items(user_id)
        rules = self.get_rules_for_getting(item_id)
        explanations = rules[rules.antecedents.isin(user_ratings)]

        return {"antecedents": set(explanations.antecedents)}
|
||||
@@ -0,0 +1,46 @@
|
||||
from scipy import sparse
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
import numpy as np
|
||||
from typing import Dict, Any
|
||||
|
||||
from .explainer import Explainer
|
||||
|
||||
|
||||
class KNNPostHocExplainer(Explainer):
    """Post-hoc explainer based on the nearest-neighbour items of a recommendation."""

    def __init__(self, model, recommendations, data, knn=10):
        """
        :param model: the recommender model being explained
        :param recommendations: table with `userId` and `itemId` columns
        :param data: object exposing `dataset`, `num_item` and `num_user`
        :param knn: number of neighbours kept per item
        """
        super().__init__(model, recommendations, data)

        self.knn = knn
        # Starts empty (never None) and is filled on first lookup
        self.knn_items_dict: Dict[int, np.ndarray] = {}

    def get_nn_for_getting(self, item_id: int) -> np.ndarray:
        """Return the neighbour ids of `item_id`, or an empty array when unknown."""
        neighbours_missing = not self.knn_items_dict
        if neighbours_missing:
            self.compute_knn_items_for_all_items()

        fallback = np.array([])
        return self.knn_items_dict.get(item_id, fallback)

    def compute_knn_items_for_all_items(self):
        """
        Fill `knn_items_dict` with the `knn` most similar items of every item.

        Similarity is the cosine similarity between the items' rating rows of
        an items x users matrix built from the dataset.
        """
        interactions = np.zeros((self.num_items, self.num_users))
        # Uses the dataset's itemId / userId values directly as row / column indices
        interactions[self.dataset.itemId, self.dataset.userId] = self.dataset.rating

        sim_matrix = cosine_similarity(sparse.csr_matrix(interactions))
        # A value below every similarity, so an item never ranks as its own neighbour
        below_all = sim_matrix.min() - 1

        for item in range(self.num_items):
            sim_matrix[item, item] = below_all
            ranked = (-sim_matrix[item, :]).argsort()
            self.knn_items_dict[item] = ranked[: self.knn]

    def explain_recommendation_to_user(
        self, user_id: int, item_id: int
    ) -> Dict[str, Any]:
        """
        Explain a recommendation by the user's items that neighbour it.

        :return: dict with key "explanations": the set of items that are both
            neighbours of `item_id` and among the user's items.
        """
        neighbour_items = self.get_nn_for_getting(item_id)
        own_items = self.get_user_items(user_id)
        return {"explanations": set(neighbour_items) & set(own_items)}
|
||||
@@ -0,0 +1,23 @@
|
||||
from .als_model import ALS
|
||||
from .bpr_model import BPR
|
||||
from .gmf_model import GMFModel
|
||||
from .emf_model import EMFModel
|
||||
from .autoencoder_model import ExplAutoencoderTorch
|
||||
from .mlp_model import MLPModel
|
||||
from .emf_model import PyTorchModel
|
||||
from .knn_basic_model import KNNBasic
|
||||
from .svd_model import SVD
|
||||
from .recommender_model import RecommenderModel
|
||||
|
||||
__all__ = [
|
||||
"ALS",
|
||||
"BPR",
|
||||
"GMFModel",
|
||||
"EMFModel",
|
||||
"PyTorchModel",
|
||||
"MLPModel",
|
||||
"ExplAutoencoderTorch",
|
||||
"KNNBasic",
|
||||
"SVD",
|
||||
"RecommenderModel",
|
||||
]
|
||||
@@ -0,0 +1,31 @@
|
||||
import implicit
|
||||
|
||||
from .mf_implicit_model import MFImplicitModel
|
||||
|
||||
|
||||
class ALS(MFImplicitModel):
    """Matrix-factorization model backed by `implicit`'s AlternatingLeastSquares."""

    def __init__(
        self,
        latent_dim,
        reg_term,
        epochs,
        random_state=42,
        num_users=None,
        num_items=None,
        **kwargs,
    ):
        """
        :param latent_dim: number of latent factors
        :param reg_term: regularization strength
        :param epochs: number of ALS iterations
        :param random_state: seed handed to the underlying model
        :param num_users: forwarded to the base class
        :param num_items: forwarded to the base class
        :param kwargs: accepted and ignored
        """
        # ALS has no learning rate, so the base class receives None for it.
        super().__init__(
            latent_dim=latent_dim,
            reg_term=reg_term,
            epochs=epochs,
            learning_rate=None,
            num_users=num_users,
            num_items=num_items,
        )

        # Built from the attributes stored by the base-class constructor
        als_options = {
            "factors": self.latent_dim,
            "regularization": self.reg_term,
            "iterations": self.epochs,
            "random_state": random_state,
        }
        self.model = implicit.als.AlternatingLeastSquares(**als_options)
|
||||
@@ -0,0 +1,223 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim
|
||||
from scipy import sparse
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from typing import Optional, Union, List
|
||||
|
||||
from pygrex.utils.torch_utils import use_cuda, use_optimizer
|
||||
from pygrex.data_reader import UserItemDict, DataReader
|
||||
from .recommender_model import RecommenderModel
|
||||
|
||||
|
||||
class ExplAutoencoderTorch(RecommenderModel, nn.Module):
    """
    One-hidden-layer autoencoder over per-user rating vectors.

    `fit` builds the layers from the data, computes a neighbour-based
    explainability matrix and trains with MSE reconstruction loss;
    `predict` reads reconstructed ratings for user/item ids.
    """

    def __init__(
        self,
        hidden_layer_features: int,
        learning_rate: float,
        positive_threshold: float,
        weight_decay: float,
        epochs: int,
        knn: int,
        cuda: bool,
        optimizer_name: str,
        expl: bool,
        device_id: Optional[int] = None,
    ):
        """
        Args:
            hidden_layer_features: Width of the hidden (code) layer.
            learning_rate: Learning rate handed to the optimizer.
            positive_threshold: Minimum rating counted in the explainability matrix.
            weight_decay: Weight decay handed to the optimizer.
            epochs: Number of training epochs.
            knn: Number of similar users kept per user.
            cuda: Whether to run on the GPU.
            optimizer_name: One of "sgd", "adam", "rmsprop".
            expl: Flag forwarded to UserItemDict.
            device_id: CUDA device index; 0 is used when omitted.

        Raises:
            ValueError: If `optimizer_name` is not one of the supported names.
        """
        super().__init__()
        if optimizer_name not in ["sgd", "adam", "rmsprop"]:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Wrong optimizer.")
        if cuda:
            use_cuda(True, device_id if device_id is not None else 0)

        self.positive_threshold = positive_threshold
        self.weight_decay = weight_decay
        self.knn = knn
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.use_gpu = cuda
        self.optimizer_name = optimizer_name
        self.hidden_layer_features = hidden_layer_features
        self.expl = expl

        self.dataset = None
        self.data = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer: Optional[torch.optim.Optimizer] = None

        self.explainability_matrix = None
        self.sim_users = {}
        # Set by instance_a_train_loader; None until the model is fitted
        self.user_item_dict = None

        self.criterion = nn.MSELoss()

    def fit(self, data: DataReader):
        """Build the layers for `data`, compute explainability and train for `self.epochs` epochs."""
        self.data = data
        self.dataset = data.dataset
        num_items = self.data.num_item

        self.encoder_hidden_layer = nn.Linear(
            in_features=num_items, out_features=self.hidden_layer_features
        )

        self.decoder_output_layer = nn.Linear(
            in_features=self.hidden_layer_features, out_features=num_items
        )

        if self.use_gpu:
            # Inputs are moved to the GPU in train_single_user/predict, so the
            # freshly created layers must live there too. Done before the
            # optimizer is built so it tracks the moved parameters.
            # NOTE(review): assumes use_cuda() only selects the device and
            # does not move this module - confirm.
            self.cuda()

        self.compute_explainability()
        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )

        assert isinstance(optimizer, torch.optim.Optimizer)
        self.optimizer = optimizer

        with tqdm(total=self.epochs) as progress:
            train_loader = self.instance_a_train_loader()
            for epoch in range(self.epochs):
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def compute_explainability(self):
        """
        Fill `sim_users` and `explainability_matrix` from the dataset.

        For each user the `knn` most cosine-similar users are stored; the
        explainability entry of (user, item) is the sum of those neighbours'
        ratings on the item that are >= `positive_threshold`, min-max scaled
        per item column and stored as a torch tensor.
        """
        assert self.dataset is not None
        assert self.data is not None
        ds = self.dataset.pivot(index="userId", columns="itemId", values="rating")
        ds = ds.fillna(0)
        ds = sparse.csr_matrix(ds)
        sim_matrix = cosine_similarity(ds)
        # A value below every similarity, so a user is never their own neighbour
        min_val = sim_matrix.min() - 1

        for i in range(self.data.num_user):
            sim_matrix[i, i] = min_val

            knn_to_user_i = (-sim_matrix[i, :]).argsort()[: self.knn]
            self.sim_users[i] = knn_to_user_i

        self.explainability_matrix = np.zeros((self.data.num_user, self.data.num_item))

        filter_dataset_on_threshold = self.dataset[
            self.dataset["rating"] >= self.positive_threshold
        ]

        for i in range(self.data.num_user):
            knn_to_user_i = self.sim_users[i]

            rated_items_by_sim_users = filter_dataset_on_threshold[
                filter_dataset_on_threshold["userId"].isin(knn_to_user_i)
            ]

            sim_scores = rated_items_by_sim_users.groupby(by="itemId")
            sim_scores = sim_scores["rating"].sum()
            sim_scores = sim_scores.reset_index()

            # Cast to int so the ids are valid array indices even when the
            # column is stored as float (same as the EMF model does).
            self.explainability_matrix[i, sim_scores.itemId.astype(int)] = (
                sim_scores.rating.to_list()
            )

        self.explainability_matrix = MinMaxScaler().fit_transform(
            self.explainability_matrix
        )

        self.explainability_matrix = torch.from_numpy(self.explainability_matrix)

    def instance_a_train_loader(self):
        """instance train loader for one training epoch"""
        assert self.dataset is not None
        assert self.explainability_matrix is not None
        self.user_item_dict = UserItemDict(
            self.dataset, self.explainability_matrix, self.expl
        )
        return DataLoader(self.user_item_dict, shuffle=True)

    def train_an_epoch(self, train_loader):
        """Train on every batch of `train_loader` and return the mean batch loss (0.0 if empty)."""
        self.train()
        batches = 0
        total_loss = 0.0
        for batch in train_loader:
            assert isinstance(batch[0], torch.Tensor)
            rating = batch[0].float()
            total_loss += self.train_single_user(rating)
            batches += 1
        # An empty loader would otherwise divide by zero.
        return total_loss / batches if batches else 0.0

    def train_single_user(self, ratings):
        """Run one optimisation step on a batch of rating vectors and return the loss value."""
        if self.use_gpu:
            ratings = ratings.cuda()

        assert self.optimizer is not None
        self.optimizer.zero_grad()
        ratings_pred = self(ratings)
        loss = self.criterion(ratings_pred, ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def forward(self, user_adjusted_ratings):
        """Encode and decode rating vectors; both layers are followed by ReLU."""
        activation = self.encoder_hidden_layer(user_adjusted_ratings)
        code = torch.relu(activation)
        activation = self.decoder_output_layer(code)
        reconstructed_ratings = torch.relu(activation)
        return reconstructed_ratings

    @staticmethod
    def _coerce_ids(ids):
        """Turn a scalar / sequence / numpy array of ids into an int or a list of ints."""
        if isinstance(ids, np.ndarray):
            ids = ids.tolist()
        if isinstance(ids, (list, tuple)):
            return [int(value) for value in ids]
        return int(ids)

    def predict(
        self, user_id: Union[int, List[int], str], item_id: Union[int, List[int], str]
    ) -> list:
        """
        Predict reconstructed ratings for the given users and items.

        Args:
            user_id: A single id or a list of ids (ints, numeric strings,
                numpy integers or a numpy array).
            item_id: Same forms as `user_id`.

        Returns:
            A single value for one user and one item, a list over items for
            one user and several items, otherwise a nested list indexed by
            user then item.

        Raises:
            ValueError: If an id cannot be converted to an integer.
            RuntimeError: If the model has not been fitted yet.
        """
        try:
            user_id = self._coerce_ids(user_id)
            item_id = self._coerce_ids(item_id)
        except (ValueError, TypeError) as err:
            raise ValueError(
                "User and item IDs must be integers or strings that can be converted to integers."
            ) from err

        single_user = isinstance(user_id, int)
        single_item = isinstance(item_id, int)

        if single_user:
            user_id = [user_id]
        if single_item:
            item_id = [item_id]

        if self.user_item_dict is None:
            raise RuntimeError("The model has not been fitted yet.")

        with torch.no_grad():
            # One rating vector per requested user, stacked into a batch
            rating = torch.stack([self.user_item_dict[uid] for uid in user_id])
            rating = rating.float()
            if self.use_gpu:
                rating = rating.cuda()
            pred = self.forward(rating).cpu()
            predictions = pred[:, item_id].tolist()

        # Flatten the nested list if it contains only one user's predictions
        if single_user and single_item:
            return (
                predictions[0][0]
                if isinstance(predictions[0], list)
                else predictions[0]
            )
        elif single_user:
            return predictions[0]
        return predictions
|
||||
@@ -0,0 +1,25 @@
|
||||
import implicit
|
||||
|
||||
from .mf_implicit_model import MFImplicitModel
|
||||
|
||||
|
||||
class BPR(MFImplicitModel):
    """Matrix-factorization model backed by `implicit`'s BayesianPersonalizedRanking."""

    def __init__(self, latent_dim, reg_term, learning_rate, epochs, **kwargs):
        """
        :param latent_dim: number of latent factors
        :param reg_term: regularization strength
        :param learning_rate: learning rate of the BPR optimisation
        :param epochs: number of training iterations
        :param kwargs: accepted and ignored
        """
        super().__init__(
            latent_dim=latent_dim,
            reg_term=reg_term,
            learning_rate=learning_rate,
            epochs=epochs,
        )

        # Built from the attributes stored by the base-class constructor
        bpr_options = {
            "factors": self.latent_dim,
            "learning_rate": self.learning_rate,
            "regularization": self.reg_term,
            "iterations": self.epochs,
        }
        self.model = implicit.bpr.BayesianPersonalizedRanking(**bpr_options)
|
||||
@@ -0,0 +1,391 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from scipy import sparse
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm.auto import tqdm
|
||||
from typing import Union
|
||||
|
||||
from pygrex.data_reader import UserItemRatingDataset, DataReader
|
||||
from pygrex.utils import EMFLoss
|
||||
from .py_torch_model import PyTorchModel
|
||||
from .recommender_model import RecommenderModel
|
||||
|
||||
|
||||
class EMFModel(RecommenderModel):
|
||||
def __init__(
    self,
    learning_rate: float,
    reg_term: float,
    expl_reg_term: float,
    positive_threshold: float,
    latent_dim: int,
    epochs: int,
    knn: int,
):
    """
    Store the hyper-parameters and reset all fitted state.

    Args:
        learning_rate: Step size of the embedding updates.
        reg_term: Regularization weight on the embeddings.
        expl_reg_term: Weight of the explainability term.
        positive_threshold: Minimum rating counted in the explainability matrix.
        latent_dim: Size of the user and item embeddings.
        epochs: Number of passes over the dataset in `fit`.
        knn: Number of similar users kept per user.
    """
    # Hyper-parameters
    self.learning_rate = learning_rate
    self.reg_term = reg_term
    self.expl_reg_term = expl_reg_term
    self.positive_threshold = positive_threshold
    self.latent_dim = latent_dim
    self.epochs = epochs
    self.knn = knn

    # State populated by fit()
    self.data = None
    self.dataset = None
    self.embedding_user = None
    self.embedding_item = None
    self.optimizer = None
    self.explainability_matrix = None
    self.sim_users = {}

    self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)
    self.criterion = EMFLoss()
|
||||
|
||||
def fit(self, data: DataReader) -> None:
    """
    Train user and item embeddings with per-rating stochastic gradient steps.

    Embeddings are initialised uniformly in [0, 0.5 / latent_dim), the
    explainability matrix is computed, and then for `self.epochs` epochs the
    dataset is shuffled and every row triggers one update of the row's user
    and item embedding.

    Args:
        data: DataReader providing `dataset`, `num_user` and `num_item`.
    """
    self.data = data
    self.dataset = data.dataset

    assert self.data is not None
    num_users = self.data.num_user
    num_items = self.data.num_item

    self.embedding_user = np.random.uniform(
        low=0, high=0.5 / self.latent_dim, size=(num_users, self.latent_dim)
    )

    self.embedding_item = np.random.uniform(
        low=0, high=0.5 / self.latent_dim, size=(num_items, self.latent_dim)
    )

    self.compute_explainability()

    with tqdm(total=self.epochs) as progress:
        assert self.dataset is not None
        for epoch in range(self.epochs):
            # Reshuffle the rows each epoch; the shuffled frame replaces self.dataset
            self.dataset = self.dataset.sample(frac=1)
            loss = []
            for _, row in self.dataset.iterrows():
                user_id = int(row.userId)
                item_id = int(row.itemId)

                p_ui = self.predict(user_id, item_id)

                # Prediction error for this rating
                e_ui = row.rating - p_ui

                loss.append(e_ui**2)

                assert self.embedding_item is not None
                assert self.embedding_user is not None
                # User step: error term, minus L2 regularization, minus the
                # explainability term.
                delta_u = 2 * e_ui * self.embedding_item[item_id, :]
                delta_u -= self.reg_term * self.embedding_user[user_id, :]
                # NOTE(review): subtracting sign(q_i - p_u) * weight means the
                # step adds sign(p_u - q_i) * weight, i.e. for positive weights
                # it moves p_u away from q_i - confirm this sign is intended.
                temp = np.sign(
                    self.embedding_item[item_id, :]
                    - self.embedding_user[user_id, :]
                )
                assert self.explainability_matrix is not None
                temp *= (
                    self.expl_reg_term
                    * self.explainability_matrix[user_id, item_id]
                )
                delta_u -= temp

                # Item step, mirrored; computed before either embedding is
                # updated, so both steps use the pre-update values.
                delta_v = 2 * e_ui * self.embedding_user[user_id, :]
                delta_v -= self.reg_term * self.embedding_item[item_id, :]
                temp = np.sign(
                    self.embedding_user[user_id, :]
                    - self.embedding_item[item_id, :]
                )
                assert self.explainability_matrix is not None
                temp *= (
                    self.expl_reg_term
                    * self.explainability_matrix[user_id, item_id]
                )
                delta_v -= temp

                self.embedding_user[user_id, :] += self.learning_rate * delta_u
                self.embedding_item[item_id, :] += self.learning_rate * delta_v

            # NOTE(review): indentation is not visible in the reviewed source;
            # the two progress calls are placed once per epoch - confirm.
            progress.update(1)

            # Mean squared error over the rows seen in this epoch
            progress.set_postfix({"MSE": sum(loss) / len(loss)})
|
||||
|
||||
def compute_explainability(self):
    """
    Build the neighbourhood-based explainability matrix.

    For every user the ``self.knn`` most cosine-similar users are stored in
    ``self.sim_users``. Entry ``(u, i)`` of ``self.explainability_matrix`` is
    the sum of the ratings ``>= self.positive_threshold`` that those
    neighbours gave to item ``i``. The matrix is finally passed through
    ``MinMaxScaler().fit_transform``.

    User and item ids are used directly as row/column indices, so they must
    lie in ``[0, self.data.num_user)`` and ``[0, self.data.num_item)``.
    """
    assert self.dataset is not None
    assert self.data is not None
    num_users = self.data.num_user
    num_items = self.data.num_item

    # Build the user x item matrix with an explicit shape instead of a dense
    # pivot. A pivot only has rows for users that actually appear in the
    # dataset, so row positions would stop matching user ids (and
    # ``sim_matrix[i, i]`` could go out of range) as soon as one user has no
    # rating. Duplicate (user, item) pairs are summed here, where pivot raised.
    ratings_matrix = sparse.csr_matrix(
        (
            self.dataset["rating"].to_numpy(dtype=float),
            (
                self.dataset["userId"].to_numpy(dtype=int),
                self.dataset["itemId"].to_numpy(dtype=int),
            ),
        ),
        shape=(num_users, num_items),
    )
    sim_matrix = cosine_similarity(ratings_matrix)

    # A value below every similarity ensures a user is ranked last among
    # their own neighbour candidates.
    min_val = sim_matrix.min() - 1
    np.fill_diagonal(sim_matrix, min_val)

    explainability = np.zeros((num_users, num_items))
    positive_ratings = self.dataset[
        self.dataset["rating"] >= self.positive_threshold
    ]

    for user in range(num_users):
        # Negating the row turns the ascending argsort into "most similar first".
        neighbours = (-sim_matrix[user, :]).argsort()[: self.knn]
        self.sim_users[user] = neighbours

        neighbour_ratings = positive_ratings[
            positive_ratings["userId"].isin(neighbours)
        ]
        item_scores = neighbour_ratings.groupby(by="itemId")["rating"].sum()
        explainability[user, item_scores.index.to_numpy(dtype=int)] = (
            item_scores.to_numpy()
        )

    self.explainability_matrix = MinMaxScaler().fit_transform(explainability)
|
||||
|
||||
def predict(
    self, user_id: Union[int, str], item_id: Union[int, str]
) -> Union[float, list]:
    """
    Score user/item pairs as the dot product of their embeddings.

    Two scalars give a single score. If either argument is a list (NumPy
    arrays are first converted with ``tolist()``), every user is scored
    against every item. The nested result is flattened to one dimension when
    there is exactly one user or exactly one item, and is returned as a plain
    Python list.

    Args:
        user_id: A user index, or a list/array of user indices.
        item_id: An item index, or a list/array of item indices.

    Returns:
        A single score for scalar input, otherwise a (possibly nested) list.
    """
    users = user_id.tolist() if isinstance(user_id, np.ndarray) else user_id
    items = item_id.tolist() if isinstance(item_id, np.ndarray) else item_id

    # Scalar path: neither argument is a list.
    if not isinstance(users, list) and not isinstance(items, list):
        assert self.embedding_user is not None
        assert self.embedding_item is not None
        user_vector = self.embedding_user[int(user_id), :]
        item_vector = self.embedding_item[int(item_id), :]
        return np.dot(user_vector, item_vector)

    # List path: promote a lone scalar so both sides can be iterated.
    if not isinstance(users, list):
        users = [users]
    if not isinstance(items, list):
        items = [items]

    score_rows = []
    for user in users:
        assert self.embedding_user is not None
        assert self.embedding_item is not None
        row = []
        for item in items:
            row.append(
                np.dot(
                    self.embedding_user[int(user), :],
                    self.embedding_item[int(item), :],
                )
            )
        score_rows.append(row)

    scores = np.array(score_rows)
    if len(users) == 1 or len(items) == 1:
        scores = scores.flatten()
    return scores.tolist()
|
||||
|
||||
def user_embedding(self):
    """Return the user embedding matrix held in ``self.embedding_user``."""
    user_factors = self.embedding_user
    return user_factors
|
||||
|
||||
def item_embedding(self):
    """Return the item embedding matrix held in ``self.embedding_item``."""
    item_factors = self.embedding_item
    return item_factors
|
||||
|
||||
|
||||
class EMFTorchModel(PyTorchModel):
    """
    Explainable matrix factorisation trained with PyTorch.

    A rating is predicted by a linear layer applied to the element-wise
    product of a user and an item embedding. Training minimises ``EMFLoss``,
    which additionally receives the neighbourhood-based explainability score
    of every (user, item) pair in the batch.
    """

    def __init__(
        self,
        learning_rate: float,
        reg_term: float,
        expl_reg_term: float,
        positive_threshold: float,
        momentum: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        knn: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        """
        Store the hyper-parameters and create the output layer and loss.

        Args:
            learning_rate: Step size handed to the SGD optimizer.
            reg_term: Regularisation weight forwarded to the loss.
            expl_reg_term: Explainability weight forwarded to the loss.
            positive_threshold: Minimum rating that counts towards explainability.
            momentum: SGD momentum.
            weight_decay: SGD weight decay.
            latent_dim: Embedding size.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            knn: Number of neighbours used for explainability.
            cuda: Forwarded to ``PyTorchModel``.
            optimizer_name: Forwarded to ``PyTorchModel``.
            device_id: Forwarded to ``PyTorchModel``.
        """
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )

        self.reg_term = reg_term
        self.expl_reg_term = expl_reg_term
        self.positive_threshold = positive_threshold
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.knn = knn

        # Filled by compute_explainability().
        self.explainability_matrix = None
        self.sim_users = {}

        self.affine_output = nn.Linear(in_features=self.latent_dim, out_features=1)

        self.criterion = EMFLoss()

    def fit(self, data: DataReader) -> None:
        """
        Create the embeddings, compute explainability and run the training loop.

        Args:
            data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.
        """
        self.data = data
        self.dataset = data.dataset

        assert self.data is not None
        num_users = self.data.num_user
        num_items = self.data.num_item

        self.embedding_user = nn.Embedding(
            num_embeddings=num_users, embedding_dim=self.latent_dim
        )

        self.embedding_item = nn.Embedding(
            num_embeddings=num_items, embedding_dim=self.latent_dim
        )

        self.compute_explainability()

        # The optimizer is created after the embeddings so that
        # self.parameters() already contains them.
        # NOTE(review): SGD is always used here; ``optimizer_name`` is not consulted.
        self.optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
        )

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                train_loader = self.instance_a_train_loader(self.batch_size)
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def compute_explainability(self):
        """
        Build the neighbourhood-based explainability matrix as a torch tensor.

        For every user the ``self.knn`` most cosine-similar users are stored in
        ``self.sim_users``. Entry ``(u, i)`` of the matrix is the sum of the
        ratings ``>= self.positive_threshold`` that those neighbours gave to
        item ``i``. The matrix is passed through ``MinMaxScaler().fit_transform``
        and converted with ``torch.from_numpy``.

        User and item ids are used directly as row/column indices, so they must
        lie in ``[0, num_user)`` and ``[0, num_item)``.
        """
        assert self.dataset is not None
        assert self.data is not None
        num_users = self.data.num_user
        num_items = self.data.num_item

        # Explicit shape instead of a dense pivot: a pivot only has rows for
        # users present in the dataset, so row positions would stop matching
        # user ids as soon as one user has no rating. Duplicate (user, item)
        # pairs are summed here, where pivot raised.
        ratings_matrix = sparse.csr_matrix(
            (
                self.dataset["rating"].to_numpy(dtype=float),
                (
                    self.dataset["userId"].to_numpy(dtype=int),
                    self.dataset["itemId"].to_numpy(dtype=int),
                ),
            ),
            shape=(num_users, num_items),
        )
        sim_matrix = cosine_similarity(ratings_matrix)

        # A value below every similarity ensures a user is ranked last among
        # their own neighbour candidates.
        min_val = sim_matrix.min() - 1
        np.fill_diagonal(sim_matrix, min_val)

        explainability = np.zeros((num_users, num_items))
        positive_ratings = self.dataset[
            self.dataset["rating"] >= self.positive_threshold
        ]

        for user in range(num_users):
            # Negating the row turns the ascending argsort into "most similar first".
            neighbours = (-sim_matrix[user, :]).argsort()[: self.knn]
            self.sim_users[user] = neighbours

            neighbour_ratings = positive_ratings[
                positive_ratings["userId"].isin(neighbours)
            ]
            item_scores = neighbour_ratings.groupby(by="itemId")["rating"].sum()
            explainability[user, item_scores.index.to_numpy(dtype=int)] = (
                item_scores.to_numpy()
            )

        explainability = MinMaxScaler().fit_transform(explainability)
        self.explainability_matrix = torch.from_numpy(explainability)

    def instance_a_train_loader(self, batch_size):
        """
        Wrap the rating table in a shuffling ``DataLoader``.

        Args:
            batch_size: Number of (user, item, rating) triples per batch.

        Returns:
            A ``DataLoader`` over a ``UserItemRatingDataset``.
        """
        assert self.dataset is not None
        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(self.dataset.userId.values),
            item_tensor=torch.LongTensor(self.dataset.itemId.values),
            target_tensor=torch.FloatTensor(self.dataset.rating.values),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """
        Train on every batch of ``train_loader`` and return the mean batch loss.

        Returns:
            The average of the per-batch losses, or NaN when the loader
            yields no batch (previously a ZeroDivisionError).
        """
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            rating = rating.float()
            total_loss += self.train_single_batch(user, item, rating)
            cnt += 1
        if cnt == 0:
            return float("nan")
        return total_loss / cnt

    def train_single_batch(self, users, items, ratings):
        """
        Run one optimisation step and return the batch loss as a Python float.

        Args:
            users: Batch of user indices.
            items: Batch of item indices.
            ratings: Batch of target ratings.
        """
        # presumably ``self.cuda`` is the boolean given to the constructor
        # (set by PyTorchModel) — TODO confirm.
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()

        assert self.optimizer is not None
        self.optimizer.zero_grad()

        ratings_pred = self(users, items)

        assert self.embedding_user is not None
        user_embeddings = self.embedding_user(users)
        assert self.embedding_item is not None
        item_embeddings = self.embedding_item(items)

        assert self.explainability_matrix is not None
        # The explainability tensor comes from torch.from_numpy and therefore
        # lives on the CPU: index it with CPU indices, then move the gathered
        # values next to the predictions. On CPU both calls are no-ops.
        expl = self.explainability_matrix[users.cpu(), items.cpu()].to(
            ratings_pred.device
        )
        loss = self.criterion(
            ratings_pred=ratings_pred,
            ratings=ratings,
            u=user_embeddings,
            v=item_embeddings,
            reg_term=self.reg_term,
            expl=expl,
            expl_reg_term=self.expl_reg_term,
        )
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def forward(self, user_indices, item_indices):
        """
        Predict ratings for the given index tensors.

        The user and item embeddings are multiplied element-wise and mapped to
        a single value per pair by ``self.affine_output``.
        """
        assert self.embedding_user is not None
        user_embeddings = self.embedding_user(user_indices)
        assert self.embedding_item is not None
        item_embeddings = self.embedding_item(item_indices)
        element_product = torch.mul(user_embeddings, item_embeddings)
        rating = self.affine_output(element_product)
        return rating
|
||||
@@ -0,0 +1,165 @@
|
||||
import random
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.optim import Optimizer
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from pygrex.data_reader import DataReader, UserItemRatingDataset
|
||||
from pygrex.utils.torch_utils import use_optimizer
|
||||
from .py_torch_model import PyTorchModel
|
||||
|
||||
|
||||
class GMFModel(PyTorchModel):
    """
    Generalised matrix factorisation with negative sampling.

    The element-wise product of a user and an item embedding is mapped to one
    logit by a linear layer and squashed with a sigmoid. Training uses
    ``nn.BCELoss`` on the observed ratings plus sampled unobserved items whose
    target is 0.
    """

    def __init__(
        self,
        learning_rate: float,
        weight_decay: float,
        latent_dim: int,
        epochs: int,
        num_negative: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id=None,
    ):
        """
        Store the hyper-parameters and create the output layer and loss.

        Args:
            learning_rate: Optimizer step size.
            weight_decay: Optimizer weight decay.
            latent_dim: Embedding size.
            epochs: Number of training epochs.
            num_negative: Negative items sampled per observed rating.
            batch_size: Mini-batch size.
            cuda: Forwarded to ``PyTorchModel``.
            optimizer_name: Name handed to ``use_optimizer``.
            device_id: Forwarded to ``PyTorchModel``.
        """
        super().__init__(
            learning_rate=learning_rate,
            latent_dim=latent_dim,
            epochs=epochs,
            batch_size=batch_size,
            cuda=cuda,
            optimizer_name=optimizer_name,
            device_id=device_id,
        )

        self.negative_sample_size = num_negative
        self.weight_decay = weight_decay
        self.optimizer: Optimizer | None = None

        self.affine_output = torch.nn.Linear(
            in_features=self.latent_dim, out_features=1
        )
        self.logistic = torch.nn.Sigmoid()

        self.criterion = nn.BCELoss()

    def fit(self, data: DataReader):
        """
        Create the embeddings and the optimizer, then run the training loop.

        Args:
            data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.

        Raises:
            TypeError: If ``use_optimizer`` does not return an ``Optimizer``.
        """
        dataset = data.dataset

        num_users = data.num_user
        num_items = data.num_item

        self.embedding_user = torch.nn.Embedding(
            num_embeddings=num_users, embedding_dim=self.latent_dim
        )

        self.embedding_item = torch.nn.Embedding(
            num_embeddings=num_items, embedding_dim=self.latent_dim
        )

        # The optimizer must be built AFTER the embeddings are attached: an
        # optimizer only updates the parameters it was constructed with, so
        # creating it first would leave both embedding tables untrained.
        optimizer = use_optimizer(
            network=self,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            optimizer_name=self.optimizer_name,
        )

        if not isinstance(optimizer, Optimizer):
            raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
        self.optimizer = optimizer

        self.negatives = self._sample_negative(dataset)

        with tqdm(total=self.epochs) as progress:
            for _ in range(self.epochs):
                # Rebuilt every epoch so that fresh negatives are drawn.
                train_loader = self.instance_a_train_loader(
                    dataset, self.negative_sample_size, self.batch_size
                )
                loss = self.train_an_epoch(train_loader)
                progress.update(1)
                progress.set_postfix({"loss": loss})

    def instance_a_train_loader(self, dataset, num_negatives, batch_size):
        """
        Build the training loader for one epoch.

        Every observed rating is followed by up to ``num_negatives`` items the
        user never interacted with, each labelled 0.

        Args:
            dataset: Frame with ``userId``, ``itemId`` and ``rating`` columns.
            num_negatives: Negatives to draw per observed rating.
            batch_size: Mini-batch size of the returned loader.

        Returns:
            A shuffling ``DataLoader`` over a ``UserItemRatingDataset``.
        """
        # Turn each user's negative set into a list once, rather than once per
        # rating row (random.sample needs a sequence).
        pools = self.negatives[["userId", "negative_items"]].copy()
        pools["negative_items"] = pools["negative_items"].map(list)
        train_ratings = pd.merge(dataset, pools, on="userId")

        users, items, ratings = [], [], []
        for user, item, rating, pool in zip(
            train_ratings["userId"].tolist(),
            train_ratings["itemId"].tolist(),
            train_ratings["rating"].tolist(),
            train_ratings["negative_items"].tolist(),
        ):
            users.append(user)
            items.append(item)
            ratings.append(rating)
            # Cap the sample size: random.sample raises ValueError when asked
            # for more elements than the pool holds.
            for neg_item in random.sample(pool, min(num_negatives, len(pool))):
                users.append(user)
                items.append(neg_item)
                ratings.append(float(0))  # negative samples get 0 rating

        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    def train_an_epoch(self, train_loader):
        """
        Train on every batch of ``train_loader`` and return the mean batch loss.

        Returns:
            The average of the per-batch losses, or NaN when the loader
            yields no batch (previously a ZeroDivisionError).
        """
        self.train()
        cnt = 0
        total_loss = 0
        for batch in train_loader:
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            rating = rating.float()
            total_loss += self.train_single_batch(user, item, rating)
            cnt += 1
        if cnt == 0:
            return float("nan")
        return total_loss / cnt

    def train_single_batch(self, users, items, ratings):
        """
        Run one optimisation step and return the batch loss as a Python float.

        Raises:
            RuntimeError: If called before ``fit()`` created the optimizer.
        """
        if self.cuda is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()

        if self.optimizer is None:
            raise RuntimeError(
                "Optimizer is not initialized. Call fit() before training."
            )
        self.optimizer.zero_grad()
        ratings_pred = self(users, items)
        # NOTE(review): BCELoss expects targets in [0, 1]; this assumes the
        # ratings column is already binarised — confirm against DataReader.
        loss = self.criterion(ratings_pred.view(-1), ratings)
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def _sample_negative(self, ratings):
        """
        Return, for every user, the set of items they never interacted with.

        Also stores the set of all item ids as ``self.item_catalogue``.

        Args:
            ratings: Frame with ``userId`` and ``itemId`` columns.

        Returns:
            Frame with columns ``userId`` and ``negative_items`` (a set per user).
        """
        interact_status = (
            ratings.groupby("userId")["itemId"]
            .apply(set)
            .reset_index()
            .rename(columns={"itemId": "interacted_items"})
        )
        self.item_catalogue = set(ratings.itemId)
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda x: self.item_catalogue - x
        )
        return interact_status[["userId", "negative_items"]]

    def forward(self, user_indices, item_indices):
        """
        Return sigmoid scores for the given user/item index tensors.

        The two embeddings are multiplied element-wise, mapped to one logit by
        ``self.affine_output`` and passed through ``self.logistic``.
        """
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_product = torch.mul(user_embedding, item_embedding)
        dot = self.affine_output(element_product)
        rating = self.logistic(dot)
        return rating
|
||||
@@ -0,0 +1,22 @@
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class Item2Vec(nn.Module):
    """
    Item embedding followed by a linear layer with one output per item.

    ``config`` must provide ``'num_items'`` and ``'latent_dim'``.
    """

    def __init__(self, config):
        super().__init__()
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']
        # Lookup table: item index -> latent vector.
        self.embedding = nn.Embedding(self.num_items, self.latent_dim)
        # Projects a latent vector to one value per item.
        self.fc = nn.Linear(self.latent_dim, self.num_items)

    def forward(self, input_data):
        """Embed ``input_data`` and project the result with ``self.fc``."""
        latent = self.embedding(input_data)
        scores = self.fc(latent)
        return scores

    def item_embedding(self):
        """Return the embedding weights detached from the autograd graph."""
        weights = self.embedding.weight
        return weights.detach()
|
||||
@@ -0,0 +1,240 @@
|
||||
from typing import Optional, Union
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from .recommender_model import RecommenderModel
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
class KNNBasic(RecommenderModel):
    """
    A user-based K-Nearest Neighbors collaborative filtering model.

    Predictions are a baseline (global mean + user bias + item bias) corrected
    by the similarity-weighted deviations of the nearest neighbours who rated
    the item. Similarity is computed on co-rated items only and is either
    Pearson correlation (default) or cosine, damped by a significance weight.

    Args:
        k (int): Number of neighbors to consider. Default 50.
        min_k (int): Minimum number of neighbors required for prediction. Default 3.
        sim_options (dict): Similarity options. Default pearson, user-based.
    """

    def __init__(self, k: int = 50, min_k: int = 3, sim_options: Optional[dict] = None):
        super().__init__()
        self.k = k
        self.min_k = min_k
        self.sim_options = sim_options if sim_options is not None else {}

        # Validate similarity options
        if self.sim_options.get("user_based", True) is False:
            raise NotImplementedError("Only the user-based approach is implemented.")

        sim_name = self.sim_options.get("name", "pearson").lower()
        if sim_name not in ["cosine", "pearson"]:
            raise NotImplementedError(
                "Only cosine and pearson similarity are implemented."
            )
        # Remembered so the requested measure is actually the one computed.
        self._sim_name = sim_name

        # Model attributes
        self.trainset: Optional[sp.csr_matrix] = None
        self.global_mean: float = 0
        self.user_biases: Optional[np.ndarray] = None
        self.item_biases: Optional[np.ndarray] = None
        self.num_users: Optional[int] = None
        self.num_items: Optional[int] = None

        # Per-user mean rating, stored by fit()
        self.user_means: Optional[np.ndarray] = None

    def fit(self, data: DataReader) -> None:
        """
        Build the sparse rating matrix and compute the global mean and biases.

        Args:
            data: Reader exposing ``dataset`` (columns ``userId``, ``itemId``,
                ``rating``), ``num_user`` and ``num_item``.
        """
        print("Fitting the improved KNNBasic model...")
        df = data.dataset
        self.num_users = data.num_user
        self.num_items = data.num_item

        print(
            f"Building ratings matrix for {self.num_users} users and {self.num_items} items..."
        )

        # 1. Build the sparse user-item ratings matrix
        ratings = df["rating"].values
        rows = df["userId"].values
        cols = df["itemId"].values
        self.trainset = sp.csr_matrix(
            (ratings, (rows, cols)), shape=(self.num_users, self.num_items)
        )

        # 2. Calculate global mean and biases
        print("Computing biases...")
        self.global_mean = self.trainset.data.mean()

        # User biases: bu = avg(ratings_u) - global_mean
        user_sums = np.array(self.trainset.sum(axis=1)).flatten()
        user_counts = np.diff(self.trainset.indptr)

        # Users without ratings produce 0/0 below; np.where replaces those
        # entries, so the warnings are silenced.
        with np.errstate(divide="ignore", invalid="ignore"):
            user_avg_ratings = np.where(
                user_counts > 0, user_sums / user_counts, self.global_mean
            )
            self.user_biases = np.where(
                user_counts > 0, user_avg_ratings - self.global_mean, 0
            )

        # Item biases: bi = avg(ratings_i) - global_mean
        item_sums = np.array(self.trainset.sum(axis=0)).flatten()
        item_counts = np.diff(self.trainset.tocsc().indptr)

        with np.errstate(divide="ignore", invalid="ignore"):
            item_avg_ratings = np.where(
                item_counts > 0, item_sums / item_counts, self.global_mean
            )
            self.item_biases = np.where(
                item_counts > 0, item_avg_ratings - self.global_mean, 0
            )

        # Store user means
        self.user_means = user_avg_ratings

        print("Model fitting complete.")

    def _compute_user_similarity(self, user1_id: int, user2_id: int) -> float:
        """
        Compute the similarity of two users on their co-rated items.

        Uses the measure selected through ``sim_options["name"]``: Pearson
        correlation (needs at least two common items) or cosine. The result is
        multiplied by ``min(n_common / 50, 1)`` so that similarities backed by
        few common items count less.

        Returns:
            The weighted similarity, or 0.0 when it cannot be computed.
        """
        assert self.trainset is not None
        # Get rating vectors for both users
        user1_ratings = self.trainset[user1_id].toarray().flatten()
        user2_ratings = self.trainset[user2_id].toarray().flatten()

        # Find commonly rated items (a stored value > 0 counts as "rated")
        mask = (user1_ratings > 0) & (user2_ratings > 0)
        n_common = int(np.sum(mask))
        if n_common == 0:
            return 0.0

        u1_common = user1_ratings[mask]
        u2_common = user2_ratings[mask]

        if self._sim_name == "cosine":
            left, right = u1_common, u2_common
        else:
            # Need at least 2 common ratings for correlation
            if n_common < 2:
                return 0.0
            # Mean-center the ratings
            left = u1_common - np.mean(u1_common)
            right = u2_common - np.mean(u2_common)

        numerator = np.sum(left * right)
        denom1 = np.sqrt(np.sum(left**2))
        denom2 = np.sqrt(np.sum(right**2))

        if denom1 == 0 or denom2 == 0:
            return 0.0

        similarity = numerator / (denom1 * denom2)

        # Apply significance weighting based on number of common items
        # More common items = more reliable similarity
        significance_weight = min(n_common / 50.0, 1.0)  # Cap at 50 common items

        return float(similarity * significance_weight)

    def _get_neighbors_for_item(self, user_id: int, item_id: int):
        """
        Get the top-k most similar users who have rated the given item.

        Returns:
            Three arrays (similarities, neighbour ids, neighbour ratings of the
            item), all empty when fewer than ``min_k`` neighbours are available.
        """
        # Find users who rated this item
        assert self.trainset is not None
        item_col = self.trainset[:, item_id]  # type: ignore
        neighbor_candidates, _ = item_col.nonzero()

        # Remove the target user if they're in the candidates
        neighbor_candidates = neighbor_candidates[neighbor_candidates != user_id]

        if len(neighbor_candidates) == 0:
            return np.array([]), np.array([]), np.array([])

        # Compute similarities
        similarities = [
            (self._compute_user_similarity(user_id, neighbor_id), neighbor_id)
            for neighbor_id in neighbor_candidates
        ]

        # Sort by similarity and take top-k
        similarities.sort(key=lambda x: x[0], reverse=True)
        top_k = similarities[: self.k]

        if len(top_k) < self.min_k:
            return np.array([]), np.array([]), np.array([])

        # Extract data
        neighbor_sims = np.array([sim for sim, _ in top_k])
        neighbor_ids = np.array([nid for _, nid in top_k])
        neighbor_ratings = np.array(
            [self.trainset[nid, item_id] for nid in neighbor_ids]
        )

        return neighbor_sims, neighbor_ids, neighbor_ratings

    def predict(self, user_id: Union[int, str], item_id: Union[int, str]) -> float:
        """
        Predict rating for a user-item pair using KNN.

        Ids outside ``[0, num_users)`` / ``[0, num_items)`` get the global
        mean. When no usable neighbour exists the baseline estimate is
        returned; otherwise the neighbour-corrected estimate is clipped to
        ``[1.0, 5.0]``.

        Raises:
            RuntimeError: If ``fit()`` has not been called.
        """
        if self.trainset is None:
            raise RuntimeError("Model must be trained first using fit() method.")

        assert self.num_users is not None
        assert self.num_items is not None
        assert self.user_biases is not None
        assert self.item_biases is not None
        user_id = int(user_id)
        item_id = int(item_id)
        # Handle out-of-bounds users/items. Negative ids are rejected as well:
        # NumPy would silently wrap them around to the end of the arrays.
        if not (0 <= user_id < self.num_users and 0 <= item_id < self.num_items):
            return float(self.global_mean)

        # 1. Calculate baseline estimate
        baseline = float(
            self.global_mean + self.user_biases[user_id] + self.item_biases[item_id]
        )

        # 2. Get neighbors who rated this item
        neighbor_sims, neighbor_ids, neighbor_ratings = self._get_neighbors_for_item(
            user_id, item_id
        )

        if len(neighbor_ids) == 0:
            return baseline

        # 3. Calculate weighted prediction
        neighbor_biases = self.user_biases[neighbor_ids]
        neighbor_baselines = (
            self.global_mean + neighbor_biases + self.item_biases[item_id]
        )

        deviations = neighbor_ratings - neighbor_baselines

        # Only use neighbors with positive similarity
        positive_mask = neighbor_sims > 0
        if not np.any(positive_mask):
            return baseline

        neighbor_sims = neighbor_sims[positive_mask]
        deviations = deviations[positive_mask]

        numerator = np.sum(neighbor_sims * deviations)
        denominator = np.sum(np.abs(neighbor_sims))

        if denominator == 0:
            return baseline

        prediction = baseline + (numerator / denominator)

        # Clip to valid rating range
        return float(np.clip(prediction, 1.0, 5.0))
|
||||
@@ -0,0 +1,136 @@
|
||||
import numpy as np
|
||||
import scipy
|
||||
from typing import Union, Protocol, runtime_checkable
|
||||
|
||||
from implicit.recommender_base import RecommenderBase
|
||||
from .recommender_model import RecommenderModel
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
@runtime_checkable
class FittableImplicitModel(Protocol):
    """
    Structural type for a trainable factor model.

    An object matches when it provides ``user_factors``, ``item_factors``
    and a ``fit`` method.
    """

    # Factor matrices read by the wrapper after training.
    user_factors: np.ndarray
    item_factors: np.ndarray

    def fit(self, item_user_data) -> None:
        """Train the model on ``item_user_data``."""
|
||||
|
||||
|
||||
class MFImplicitModel(RecommenderModel):
    """
    Base wrapper around a factor model held in ``self.model``.

    This class leaves ``self.model`` as ``None``; ``fit()`` raises until a
    model object has been assigned (its error message points to subclasses
    such as ALS or BPR).
    """

    def __init__(
        self,
        latent_dim,
        reg_term,
        learning_rate,
        epochs,
        num_users=None,
        num_items=None,
    ):
        """
        Store the hyper-parameters.

        Args:
            latent_dim: Number of latent factors.
            reg_term: Regularisation strength.
            learning_rate: Learning rate.
            epochs: Number of training iterations.
            num_users: Optional total number of users; used as a lower bound
                for the interaction matrix height in ``fit()``.
            num_items: Optional total number of items; used as a lower bound
                for the interaction matrix width in ``fit()``.
        """
        super().__init__()
        self.latent_dim = latent_dim
        self.reg_term = reg_term
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.model: Union[RecommenderBase, FittableImplicitModel, None] = None
        self.total_users = num_users
        self.total_items = num_items

    def fit(self, data: DataReader) -> None:
        """
        Build the interaction matrix and train ``self.model`` on it.

        Args:
            data: Reader whose ``dataset`` has ``userId`` and ``itemId`` columns.

        Raises:
            RuntimeError: If ``self.model`` has not been set.
        """
        if self.model is None:
            raise RuntimeError(
                "The model has not been initialized. Please use a specific subclass like ALS or BPR."
            )
        # The matrix must cover every id present in the data. Totals given to
        # the constructor are honoured as a lower bound, so ids missing from
        # this particular dataset still get a factor row.
        num_user_for_shape = max(
            int(data.dataset["userId"].max()) + 1, int(self.total_users or 0)
        )
        num_item_for_shape = max(
            int(data.dataset["itemId"].max()) + 1, int(self.total_items or 0)
        )
        self.total_users = num_user_for_shape
        self.total_items = num_item_for_shape

        # NOTE(review): the matrix is transposed to item x user before fitting.
        # Recent releases of the `implicit` package expect user x item —
        # confirm against the installed version.
        item_user_data = self.rearrange_dataset(
            ds=data.dataset,
            num_user=num_user_for_shape,
            num_item=num_item_for_shape,
        ).T.tocsr()

        self.model.fit(item_user_data)

    @staticmethod
    def rearrange_dataset(ds, num_user: int, num_item: int) -> scipy.sparse.csr_matrix:
        """
        Converts the dataset into a sparse matrix format for the implicit model.

        Args:
            ds: Dataset containing userId and itemId columns
            num_user : Number of users in the dataset
            num_item : Number of items in the dataset

        Returns:
            ds_mtr: Sparse user x item matrix with a 1 per interaction row
                (repeated pairs are summed by the CSR constructor)
        """
        data = np.ones(len(ds))  # Array of 1s for each interaction
        rows = ds["userId"].values  # User IDs as row indices
        cols = ds["itemId"].values  # Item IDs as column indices

        ds_mtr = scipy.sparse.csr_matrix(
            (data, (rows, cols)), shape=(num_user, num_item)
        )

        return ds_mtr

    def _fitted_model(self) -> FittableImplicitModel:
        """Return ``self.model`` once it exposes non-None factor matrices."""
        model = self.model
        # The attribute check alone is not enough: a model may define the
        # factor attributes as None until it has been trained.
        if (
            not isinstance(model, FittableImplicitModel)
            or model.user_factors is None
            or model.item_factors is None
        ):
            raise RuntimeError(
                "The model has not been trained yet. Please call fit() first."
            )
        return model

    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int, list, np.ndarray]
    ) -> Union[float, list]:
        """
        Predict ratings for a user and one or more items using efficient vectorization.

        Args:
            user_id : User identifier
            item_id : Item identifier or a list/array of item identifiers

        Returns:
            A single predicted score (float) for a single item, otherwise a
            list of scores

        Raises:
            RuntimeError: If the model has not been trained.
            ValueError: If the user id or any item id is out of bounds.
        """
        model = self._fitted_model()
        user_id = int(user_id)

        # 1. Validate user_id
        if not (0 <= user_id < model.user_factors.shape[0]):
            raise ValueError(f"user_id {user_id} is out of bounds")

        # 2. Unify input to always be a numpy array
        is_single_item = not isinstance(item_id, (list, np.ndarray))
        item_ids_arr = np.array(item_id, ndmin=1).astype(int)

        # 3. Perform a single, vectorized bounds check for all items at once
        max_item_id = model.item_factors.shape[0]
        out_of_bounds = (item_ids_arr < 0) | (item_ids_arr >= max_item_id)
        if np.any(out_of_bounds):
            out_of_bounds_id = item_ids_arr[out_of_bounds][0]
            raise ValueError(f"item_id {out_of_bounds_id} is out of bounds")

        # 4. Get all item vectors in a single operation
        item_vectors = model.item_factors[item_ids_arr]
        user_vector = model.user_factors[user_id]

        # 5. Calculate all scores with one dot product
        scores = user_vector.dot(item_vectors.T)

        # 6. Return a single float if the input was a single item, otherwise a list
        return scores[0].item() if is_single_item else scores.tolist()

    def user_embedding(self) -> np.ndarray:
        """
        Return the trained user factor matrix.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        return self._fitted_model().user_factors

    def item_embedding(self) -> np.ndarray:
        """
        Return the trained item factor matrix.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        return self._fitted_model().item_factors
|
||||
@@ -0,0 +1,179 @@
|
||||
import random
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.optim import Optimizer
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from pygrex.data_reader import DataReader, UserItemRatingDataset
|
||||
from pygrex.utils.torch_utils import use_optimizer
|
||||
from .py_torch_model import PyTorchModel
|
||||
|
||||
|
||||
class MLPModel(PyTorchModel):
|
||||
def __init__(
    self,
    learning_rate: float,
    weight_decay: float,
    latent_dim: int,
    epochs: int,
    num_negative: int,
    batch_size: int,
    cuda: bool,
    optimizer_name: str,
    device_id=None,
):
    """
    Store the hyper-parameters and create the output layer and loss.

    Args:
        learning_rate: Optimizer step size.
        weight_decay: Optimizer weight decay.
        latent_dim: Size of each embedding.
        epochs: Number of training epochs.
        num_negative: Negative items sampled per observed rating.
        batch_size: Mini-batch size.
        cuda: Forwarded to the base class.
        optimizer_name: Forwarded to the base class.
        device_id: Forwarded to the base class.
    """
    base_kwargs = dict(
        learning_rate=learning_rate,
        latent_dim=latent_dim,
        epochs=epochs,
        batch_size=batch_size,
        cuda=cuda,
        optimizer_name=optimizer_name,
        device_id=device_id,
    )
    super().__init__(**base_kwargs)

    # Created by fit().
    self.optimizer: Optimizer | None = None
    self.weight_decay = weight_decay
    self.negative_sample_size = num_negative

    # The user and item embeddings are concatenated, hence 2 * latent_dim inputs.
    concatenated_dim = 2 * self.latent_dim
    self.affine_output = torch.nn.Linear(
        in_features=concatenated_dim, out_features=1
    )
    self.logistic = torch.nn.Sigmoid()
    self.criterion = nn.BCELoss()
|
||||
|
||||
def fit(self, data: DataReader):
    """
    Create the embeddings and the optimizer, then run the training loop.

    Args:
        data: Reader exposing ``dataset``, ``num_user`` and ``num_item``.

    Raises:
        TypeError: If ``use_optimizer`` does not return an ``Optimizer``.
    """
    dataset = data.dataset

    num_users = data.num_user
    num_items = data.num_item

    self.embedding_user = torch.nn.Embedding(
        num_embeddings=num_users, embedding_dim=self.latent_dim
    )

    self.embedding_item = torch.nn.Embedding(
        num_embeddings=num_items, embedding_dim=self.latent_dim
    )

    # The optimizer must be built AFTER the embeddings are attached: an
    # optimizer only updates the parameters it was constructed with, so
    # creating it first would leave both embedding tables untrained.
    optimizer = use_optimizer(
        network=self,
        weight_decay=self.weight_decay,
        learning_rate=self.learning_rate,
        optimizer_name=self.optimizer_name,
    )
    if not isinstance(optimizer, Optimizer):
        raise TypeError(f"Expected an Optimizer, but got {type(optimizer)}")
    self.optimizer = optimizer

    self.negatives = self._sample_negative(dataset)

    with tqdm(total=self.epochs) as progress:
        for _ in range(self.epochs):
            # Rebuilt every epoch so that fresh negatives are drawn.
            train_loader = self.instance_a_train_loader(
                dataset, self.negative_sample_size, self.batch_size
            )
            loss = self.train_an_epoch(train_loader)
            progress.update(1)
            progress.set_postfix({"loss": loss})
|
||||
|
||||
def instance_a_train_loader(self, dataset, num_negatives, batch_size):
    """Build the shuffled training ``DataLoader`` for one epoch.

    Every observed (user, item, rating) row is emitted once, followed by up
    to ``num_negatives`` items drawn without replacement from that user's
    negative pool (``self.negatives``), each labelled ``0.0``.

    Args:
        dataset: Frame with ``userId``, ``itemId`` and ``rating`` columns.
        num_negatives: Negatives to draw per observed interaction. If the
            user's pool is smaller, the whole pool is used instead of
            raising.
        batch_size: Batch size handed to the ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``UserItemRatingDataset`` with ``shuffle=True``.
    """
    train_ratings = pd.merge(
        dataset, self.negatives[["userId", "negative_items"]], on="userId"
    )

    users, items, ratings = [], [], []
    for user, item, rating, negative_pool in zip(
        train_ratings["userId"].tolist(),
        train_ratings["itemId"].tolist(),
        train_ratings["rating"].tolist(),
        train_ratings["negative_items"].tolist(),
    ):
        users.append(user)
        items.append(item)
        ratings.append(rating)

        candidates = list(negative_pool)
        # random.sample raises ValueError when asked for more elements than
        # the population holds (a user who has seen almost every item).
        sampled = random.sample(candidates, min(num_negatives, len(candidates)))
        users.extend([user] * len(sampled))
        items.extend(sampled)
        ratings.extend([0.0] * len(sampled))  # negative samples get 0 rating

    dataset = UserItemRatingDataset(
        user_tensor=torch.LongTensor(users),
        item_tensor=torch.LongTensor(items),
        target_tensor=torch.FloatTensor(ratings),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
|
||||
|
||||
def train_an_epoch(self, train_loader):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Args:
        train_loader: Iterable of batches; each batch is indexable with the
            user ids at position 0, item ids at 1 and targets at 2.

    Returns:
        The average of the per-batch losses, or ``0.0`` when the loader
        yields no batches.

    Raises:
        TypeError: If the user-id entry of a batch is not an int64 tensor.
    """
    self.train()
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        user, item, rating = batch[0], batch[1], batch[2]
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not (torch.is_tensor(user) and user.dtype == torch.long):
            raise TypeError(
                f"Expected user ids as an int64 tensor, but got {type(user)}"
            )
        total_loss += self.train_single_batch(user, item, rating.float())
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0
|
||||
|
||||
def train_single_batch(self, users, items, ratings):
    """Perform one optimisation step on a single batch.

    Args:
        users: Tensor of user indices.
        items: Tensor of item indices, aligned with ``users``.
        ratings: Float tensor of targets, aligned with ``users``.

    Returns:
        The batch loss as a Python float.

    Raises:
        RuntimeError: If ``self.optimizer`` has not been set (``fit`` sets it).
    """
    if self.optimizer is None:
        raise RuntimeError(
            "Optimizer is not initialized. Call fit() before training."
        )

    # The previous guard was ``self.cuda is True``. On a torch module,
    # ``self.cuda`` is the bound ``nn.Module.cuda`` method, so that test was
    # never true and the batch was never moved. Following the device of the
    # module's own parameters keeps inputs and weights together wherever the
    # model lives.
    first_param = next(self.parameters(), None)
    if first_param is not None:
        device = first_param.device
        users = users.to(device)
        items = items.to(device)
        ratings = ratings.to(device)

    self.optimizer.zero_grad()
    ratings_pred = self(users, items)
    # Predictions are flattened with view(-1) to align with ``ratings``.
    loss = self.criterion(ratings_pred.view(-1), ratings)
    loss.backward()
    self.optimizer.step()
    return loss.item()
|
||||
|
||||
def _sample_negative(self, ratings):
|
||||
"""return all negative items & 100 sampled negative items"""
|
||||
interact_status = (
|
||||
ratings.groupby("userId")["itemId"]
|
||||
.apply(set)
|
||||
.reset_index()
|
||||
.rename(columns={"itemId": "interacted_items"})
|
||||
)
|
||||
self.item_catalogue = set(ratings.itemId)
|
||||
interact_status["negative_items"] = interact_status["interacted_items"].apply(
|
||||
lambda x: self.item_catalogue - x
|
||||
)
|
||||
return interact_status[["userId", "negative_items"]]
|
||||
|
||||
def forward(self, user_indices, item_indices):
    """Score user/item pairs.

    Looks up both embeddings, concatenates them along the feature axis,
    applies the affine output layer and squashes the result with the
    sigmoid. A single user paired with many items (or the reverse) is
    tiled so that both sides have the same number of rows.

    Args:
        user_indices: Tensor of user indices.
        item_indices: Tensor of item indices.

    Returns:
        The sigmoid output of the affine layer, one row per pair.
    """

    def _drop_middle_axis(embedded):
        # A 3-D lookup result is squeezed on axis 1 so the embeddings are
        # 2-D [batch_size, embedding_dim] before concatenation.
        return embedded.squeeze(1) if embedded.dim() == 3 else embedded

    user_vecs = _drop_middle_axis(self.embedding_user(user_indices))
    item_vecs = _drop_middle_axis(self.embedding_item(item_indices))

    # torch.cat does not broadcast, so the singleton side is repeated.
    n_users, n_items = user_vecs.size(0), item_vecs.size(0)
    if n_users == 1 and n_items > 1:
        user_vecs = user_vecs.repeat(n_items, 1)
    elif n_items == 1 and n_users > 1:
        item_vecs = item_vecs.repeat(n_users, 1)

    joined = torch.cat((user_vecs, item_vecs), 1)
    return self.logistic(self.affine_output(joined))
|
||||
@@ -0,0 +1,69 @@
|
||||
import itertools
|
||||
from typing import Union
|
||||
import torch
|
||||
|
||||
from pygrex.utils.torch_utils import use_cuda
|
||||
from .recommender_model import RecommenderModel
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
class PyTorchModel(RecommenderModel, torch.nn.Module):
    """Base class for recommender models implemented as torch modules.

    Stores the shared training hyper-parameters and provides ``predict``
    plus accessors for the learned embedding tables.

    Note: subclasses are expected to create ``embedding_user`` and
    ``embedding_item``, implement ``forward`` and override ``fit``.
    """

    def __init__(
        self,
        learning_rate: float,
        latent_dim: int,
        epochs: int,
        batch_size: int,
        cuda: bool,
        optimizer_name: str,
        device_id: Union[int, None] = None,
    ):
        """Validate and store the hyper-parameters.

        Args:
            learning_rate: Optimizer learning rate.
            latent_dim: Embedding dimensionality.
            epochs: Number of training epochs.
            batch_size: Training batch size.
            cuda: Whether inputs should be moved to CUDA in ``predict``.
            optimizer_name: One of ``"sgd"``, ``"adam"``, ``"rmsprop"``.
            device_id: CUDA device id passed to ``use_cuda`` when ``cuda``
                is true.

        Raises:
            ValueError: If ``optimizer_name`` is not a supported name.
        """
        if optimizer_name not in ("sgd", "adam", "rmsprop"):
            # ValueError is still an ``Exception``, so existing handlers
            # that catch the old bare Exception keep working.
            raise ValueError("Wrong optimizer.")

        if cuda is True and device_id is not None:
            use_cuda(True, device_id)

        # Plain (non-Module, non-Parameter) attributes may be assigned
        # before nn.Module.__init__ runs.
        self.latent_dim = latent_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        # Stored as ``_cuda`` because ``cuda`` is an nn.Module method name.
        self._cuda = cuda
        self.optimizer_name = optimizer_name

        self.dataset = None
        self.dataset_metadata = None
        self.embedding_user = None
        self.embedding_item = None
        self.optimizer = None

        super().__init__()

    def fit(self, data: DataReader):
        """Training hook; this base implementation does nothing."""
        pass

    def predict(self, user_id, item_id) -> list:
        """Return the model scores for the given user/item ids.

        Args:
            user_id: A single id (Python or NumPy integer) or a sequence
                of ids.
            item_id: A single id (Python or NumPy integer) or a sequence
                of ids.

        Returns:
            A flat list with the values produced by ``forward``.
        """
        # ``torch.as_tensor`` accepts Python ints, NumPy integer scalars,
        # lists and arrays alike. The old code wrapped only builtin ``int``
        # in a list; a NumPy integer went straight to the legacy
        # ``torch.LongTensor`` constructor, which is not a reliable way to
        # build a one-element index tensor from a scalar.
        user_tensor = torch.as_tensor(user_id, dtype=torch.long)
        item_tensor = torch.as_tensor(item_id, dtype=torch.long)
        if user_tensor.dim() == 0:
            user_tensor = user_tensor.reshape(1)
        if item_tensor.dim() == 0:
            item_tensor = item_tensor.reshape(1)

        with torch.no_grad():
            if self._cuda:
                user_tensor = user_tensor.cuda()
                item_tensor = item_tensor.cuda()
            pred = self.forward(user_tensor, item_tensor).cpu().tolist()
        # ``forward`` yields one row per pair; flatten the nested rows.
        return list(itertools.chain.from_iterable(pred))

    def user_embedding(self):
        """Return the user embedding weights as a NumPy array."""
        return self.state_dict()["embedding_user.weight"].cpu().numpy()

    def item_embedding(self):
        """Return the item embedding weights as a NumPy array."""
        return self.state_dict()["embedding_item.weight"].cpu().numpy()
|
||||
@@ -0,0 +1,35 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Union
|
||||
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
|
||||
|
||||
class RecommenderModel(ABC):
    """Interface every recommendation model must implement.

    Concrete models subclass this and provide ``predict`` and ``fit``.
    """

    @abstractmethod
    def predict(
        self, user_id: Union[str, int], item_id: Union[str, int]
    ) -> Union[float, list]:
        """Predict the rating/score of ``item_id`` for ``user_id``.

        Args:
            user_id: The ID of the user.
            item_id: The ID of the item to score.

        Returns:
            A single score, or a list of scores, depending on the
            implementation.
        """

    @abstractmethod
    def fit(self, data: DataReader):
        """Train the model on ``data``.

        What is read from ``data`` depends on the concrete model.
        """
|
||||
@@ -0,0 +1,169 @@
|
||||
from math import sqrt
|
||||
import numpy as np
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
from pygrex.models.recommender_model import RecommenderModel
|
||||
|
||||
|
||||
class SVD(RecommenderModel):
    """Biased matrix-factorisation model trained with per-rating SGD.

    A rating is predicted as
    ``global_mean + user_bias + item_bias + dot(user_factors, item_factors)``
    and ``predict`` clips the result to the 1-5 range.
    """

    def __init__(
        self,
        n_factors=50,
        n_epochs=25,
        lr=0.007,
        reg=0.1,
        init_mean=0.0,
        init_std=0.1,
        random_state=42,
        early_stopping=True,
    ):
        """Store the hyper-parameters; no training happens here.

        Args:
            n_factors: Number of latent factors.
            n_epochs: Maximum number of SGD epochs.
            lr: Learning rate.
            reg: L2 regularisation strength.
            init_mean: Mean of the normal used to initialise the factors.
            init_std: Stored, but ``fit`` currently initialises with a
                standard deviation of ``1 / sqrt(n_factors)`` instead.
            random_state: Seed for the NumPy random generator.
            early_stopping: Enable early stopping when validation data is
                passed to ``fit``.
        """
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg = reg
        self.init_mean = init_mean
        self.init_std = init_std
        self.random_state = random_state
        self.early_stopping = early_stopping

        # Model parameters (populated by fit)
        self.user_factors = None
        self.item_factors = None
        self.user_biases = None
        self.item_biases = None
        self.global_mean = None

        # Training history
        self.training_rmse = []

    def fit(self, data: DataReader, validation_data=None):
        """Fit factors and biases on ``data.dataset``.

        Args:
            data: Reader whose ``dataset`` has ``userId``, ``itemId`` and
                ``rating`` columns; ids are used directly as row indices
                into the factor matrices.
            validation_data: Optional iterable of ``(user, item, rating)``
                triples used for early stopping.

        Raises:
            ValueError: If ``data._num_user`` or ``data._num_item`` is None.
        """
        df = data.dataset
        if data._num_user is None or data._num_item is None:
            raise ValueError("The number of users and items cannot be None.")
        num_users, num_items = data._num_user, data._num_item

        rng = np.random.RandomState(self.random_state)

        # Factors are drawn with std 1/sqrt(n_factors).
        scale = 1.0 / sqrt(self.n_factors)
        self.user_factors = rng.normal(
            self.init_mean, scale, (num_users, self.n_factors)
        )  # type: ignore
        self.item_factors = rng.normal(
            self.init_mean, scale, (num_items, self.n_factors)
        )  # type: ignore
        self.user_biases = np.zeros(num_users)
        self.item_biases = np.zeros(num_items)
        self.global_mean = df["rating"].mean()

        # All parameters were just re-initialised, so history from an
        # earlier fit() call no longer describes this model.
        self.training_rmse = []

        # List of tuples is faster to iterate than the frame itself.
        ratings_tuple = list(
            df[["userId", "itemId", "rating"]].itertuples(index=False, name=None)
        )

        # Early-stopping bookkeeping
        best_rmse = float("inf")
        patience = 3
        patience_counter = 0

        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}...")

            rng.shuffle(ratings_tuple)

            for user, item, rating in ratings_tuple:
                dot_product = np.dot(self.user_factors[user], self.item_factors[item])
                prediction = (
                    self.global_mean
                    + self.user_biases[user]
                    + self.item_biases[item]
                    + dot_product
                )
                error = rating - prediction

                # Bias updates
                self.user_biases[user] += self.lr * (
                    error - self.reg * self.user_biases[user]
                )
                self.item_biases[item] += self.lr * (
                    error - self.reg * self.item_biases[item]
                )

                # Factor updates; the item step must use the user factors
                # from before this update, hence the copy.
                uf_temp = self.user_factors[user].copy()
                self.user_factors[user] += self.lr * (
                    error * self.item_factors[item] - self.reg * self.user_factors[user]
                )
                self.item_factors[item] += self.lr * (
                    error * uf_temp - self.reg * self.item_factors[item]
                )

            # Training RMSE every 5th epoch and on the last one.
            if epoch % 5 == 0 or epoch == self.n_epochs - 1:
                train_rmse = self.calculate_rmse(ratings_tuple)
                self.training_rmse.append(train_rmse)
                print(f"  Training RMSE: {train_rmse:.4f}")

            if self.early_stopping and validation_data is not None:
                val_rmse = self.calculate_rmse(validation_data)
                print(f"  Validation RMSE: {val_rmse:.4f}")

                if val_rmse < best_rmse:
                    best_rmse = val_rmse
                    patience_counter = 0
                else:
                    patience_counter += 1

                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch + 1}")
                    break

        print("Fit complete.")

    def calculate_rmse(self, ratings_data):
        """Return the RMSE of ``predict`` over ``(user, item, rating)`` triples.

        Returns 0 when ``ratings_data`` is empty.
        """
        total_error = 0
        count = 0

        for user, item, rating in ratings_data:
            prediction = self.predict(user, item)
            total_error += (rating - prediction) ** 2
            count += 1

        return sqrt(total_error / count) if count > 0 else 0

    def predict(self, user_id: int | str, item_id: int | str) -> float:
        """Predict clipped rating(s) for the given user and item id(s).

        Args:
            user_id: A single id, or a list/array of ids.
            item_id: A single id, or a list/array of ids. One user may be
                paired with many items (and vice versa); otherwise both
                must have the same length.

        Returns:
            For scalar ids, a single value clipped to [1, 5]; the global
            mean is returned when a scalar id cannot be converted to
            ``int``. For list/array ids, a NumPy array of clipped values,
            one per pair.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        if (
            self.user_factors is None
            or self.item_factors is None
            or self.user_biases is None
            or self.item_biases is None
            or self.global_mean is None
        ):
            raise RuntimeError("The model has not been trained yet.")

        # Batched path. Previously a list/array made ``int(...)`` raise
        # TypeError, which was swallowed and answered with the global mean,
        # so callers that pass a whole item pool got one scalar back.
        if np.ndim(user_id) > 0 or np.ndim(item_id) > 0:
            users = np.asarray(user_id).astype(int).reshape(-1)
            items = np.asarray(item_id).astype(int).reshape(-1)
            dot_products = np.sum(
                self.user_factors[users] * self.item_factors[items], axis=1
            )
            scores = (
                self.global_mean
                + self.user_biases[users]
                + self.item_biases[items]
                + dot_products
            )
            return np.clip(scores, 1, 5)

        try:
            user_id = int(user_id)
            item_id = int(item_id)
        except (ValueError, TypeError):
            # If conversion fails, return the global mean rating
            return self.global_mean

        dot_product = np.dot(self.user_factors[user_id], self.item_factors[item_id])
        prediction = (
            self.global_mean
            + self.user_biases[user_id]
            + self.item_biases[item_id]
            + dot_product
        )

        # Clip to valid rating range
        return np.clip(prediction, 1, 5)
|
||||
@@ -0,0 +1,4 @@
|
||||
from .recommender import Recommender
|
||||
from .group_recommender import GroupRecommender
|
||||
|
||||
__all__ = ["Recommender", "GroupRecommender"]
|
||||
@@ -0,0 +1,72 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm.autonotebook import tqdm
|
||||
|
||||
|
||||
class GenericRecommender:
    """Shared ranking utilities for recommenders built on a fitted model.

    ``recommend_all`` relies on a ``recommend_user`` method that this class
    does not define; subclasses provide it.
    """

    def __init__(self, dataset_metadata, model, top_n: int = 10):
        """
        :param dataset_metadata: object exposing ``dataset``, a frame with
            ``userId`` and ``itemId`` columns.
        :param model: the model used by subclasses to score items.
        :param top_n: maximum number of ranked items kept per user.
        """
        self.top_n = top_n
        self.dataset = dataset_metadata.dataset
        self.model = model
        # Every distinct item id present in the dataset.
        self.catalogue = set(self.dataset["itemId"])

    def recommend_all(self):
        """
        Get recommendations for every user in the dataset.
        :return: data.frame [userId, itemId, rank] with all users' rankings.
        """
        per_user_frames = []

        with tqdm(
            total=self.dataset["userId"].nunique(), desc="Recommending for users: "
        ) as pbar:
            for user_id, user_ratings in self.dataset.groupby("userId"):
                per_user_frames.append(
                    self.recommend_user(user_id, user_ratings)  # type: ignore
                )
                pbar.update()

        if not per_user_frames:
            return pd.DataFrame({"userId": [], "itemId": [], "rank": []})

        # One concat at the end: concatenating inside the loop re-copies
        # the accumulated frame for every user (quadratic), and seeding it
        # with an empty float frame can affect the resulting dtypes.
        return pd.concat(per_user_frames, ignore_index=True)

    def rank_prediction(self, user_id, target_item_id, predictions):
        """
        Rank items by predicted score and keep the best ``top_n``.
        :param user_id: user Id (scalar or list aligned with the items).
        :param target_item_id: list, item Ids that were scored.
        :param predictions: scores aligned with ``target_item_id``.
        :return: data.frame [userId, itemId, rank], best rank first.
        """
        # A column vector of predictions is flattened to 1-D.
        if isinstance(predictions, np.ndarray) and predictions.ndim > 1:
            predictions = predictions.flatten()

        ranked = pd.DataFrame(
            {"userId": user_id, "itemId": target_item_id, "prediction": predictions}
        )
        # method="first" breaks ties by order of appearance.
        ranked["rank"] = ranked["prediction"].rank(method="first", ascending=False)
        ranked = ranked.sort_values(["userId", "rank"])
        ranked = ranked[ranked["rank"] <= self.top_n]

        return ranked[["userId", "itemId", "rank"]]

    def get_unrated(self, user_ratings):
        """
        Extract the items a user has not rated.
        :param user_ratings: iterable of item Ids the user rated.
        :return: list, catalogue items not in ``user_ratings``.
        """
        return list(self.catalogue - set(user_ratings))

    def get_rated(self, user_id):
        """
        Extract the interactions of a user.
        :param user_id: userId to look up.
        :return: data.frame, the dataset rows whose userId equals ``user_id``.
        """
        return self.dataset[self.dataset["userId"] == user_id]
|
||||
@@ -0,0 +1,391 @@
|
||||
from typing import Dict, List, Union, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
from pygrex.models.recommender_model import RecommenderModel
|
||||
from pygrex.utils.aggregation_strategy import ScoreAggregator, AggregationStrategy
|
||||
from pygrex.utils.scale import Scale
|
||||
|
||||
|
||||
class GroupRecommender:
    """
    A class to represent a group recommender system that follows the workflow:
    1. Setup and Candidate Selection
    2. Individual Preference Collection
    3. Score Aggregation
    4. Final Recommendation List
    """

    def __init__(self, data: DataReader):
        """Initialize the group recommender with data.

        Args:
            data: The dataset containing user-item interactions.
        """
        self.data = data
        self._group_predictions = None
        self._members = None
        self._item_pool = None
        self._model = None
        self._aggregation_strategy = None
        self._score_aggregator = None
        self._aggregated_scores = None
        self._top_recommendation = None

    def setup_recommendation(
        self,
        model: RecommenderModel,
        members: List[Union[str, int]],
        data: DataReader,
        aggregation_strategy: AggregationStrategy,  # type: ignore
        most_respected_person: Optional[Union[str, int]] = None,
    ) -> None:
        """
        Setup and Candidate Selection: Initialize the group recommendation process.

        Selects the candidate items, collects every member's predictions and
        aggregates them, so the getters can be used afterwards.

        Args:
            model: The recommendation model to use
            members: List of user IDs representing the group members
            data: DataReader object containing the dataset; only used to
                enumerate the item IDs
            aggregation_strategy: Strategy for aggregating individual predictions
            most_respected_person: User ID of most respected person (required for MRP strategy)
        """
        self._members = members
        self._model = model
        self._aggregation_strategy = aggregation_strategy
        # Drop the top item cached by a previous setup; otherwise
        # get_top_recommendation() keeps answering for the old group.
        self._top_recommendation = None

        self._score_aggregator = ScoreAggregator(
            most_respected_person=most_respected_person
        )

        # NOTE(review): item IDs come from the ``data`` argument while
        # interactions and ID mapping use ``self.data`` -- confirm both are
        # meant to be the same reader.
        item_ids = data.dataset["itemId"].unique()

        # Items that no group member has interacted with
        self._item_pool = self.get_non_interacted_items_for_recommendation(
            self.data,
            item_ids,  # type: ignore
            members,  # type: ignore
        )

        # Keep only IDs the model can index; this prevents out-of-bounds
        # errors when the model was trained with a different number of
        # items than the dataset currently holds.
        max_item_id = self._get_max_valid_item_id(model)
        item_pool_int = self._item_pool.astype(int)
        valid_mask = (item_pool_int >= 0) & (item_pool_int < max_item_id)
        self._item_pool = item_pool_int[valid_mask]

        # Individual Preference Collection
        self._group_predictions = self._generate_group_predictions()

        # Score Aggregation
        self._aggregated_scores = self._aggregate_group_scores()

    def _generate_group_predictions(self) -> Dict[Union[str, int], Dict[int, float]]:
        """
        Individual Preference Collection: Generate predictions for all group members.

        Returns:
            A dictionary with user IDs as keys and their predictions as values

        Raises:
            ValueError: If members, model or item pool have not been set.
        """
        if not self._members or self._model is None or self._item_pool is None:
            raise ValueError(
                "You must call setup_recommendation before generating predictions"
            )

        return {
            member: self.generate_recommendation(
                self._model,
                member,
                self._item_pool,  # type: ignore
                self.data,  # type: ignore
            )
            for member in self._members
        }

    def _aggregate_group_scores(self) -> Dict[int, float]:
        """
        Score Aggregation: Aggregate individual predictions into collective scores.

        Returns:
            Dictionary mapping item IDs to aggregated scores, ordered from
            highest to lowest score

        Raises:
            ValueError: If predictions, aggregator or strategy are missing.
        """
        if (
            self._group_predictions is None
            or self._score_aggregator is None
            or self._aggregation_strategy is None
        ):
            raise ValueError(
                "You must call setup_recommendation before aggregating scores"
            )

        # Only Borda Count needs per-user rankings
        rankings = None
        if self._aggregation_strategy == AggregationStrategy.BORDA_COUNT:
            rankings = self._create_rankings_from_predictions()

        aggregated_scores = self._score_aggregator.aggregate_scores(
            evaluations=self._group_predictions,  # type: ignore
            strategy=self._aggregation_strategy,
            rankings=rankings,  # type: ignore
        )

        # dict keeps insertion order, so this is sorted best-first
        return dict(  # type: ignore
            sorted(aggregated_scores.items(), key=lambda x: x[1], reverse=True)
        )

    def _create_rankings_from_predictions(self) -> Dict[Union[str, int], List[int]]:
        """
        Create rankings from predictions for Borda Count aggregation.

        Returns:
            Dictionary mapping user IDs to item IDs ordered by descending score

        Raises:
            ValueError: If no group predictions are available.
        """
        if self._group_predictions is None:
            raise ValueError("Group predictions not available")

        rankings = {}
        for user_id, predictions in self._group_predictions.items():
            sorted_items = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
            rankings[user_id] = [item_id for item_id, _ in sorted_items]
        return rankings

    def _get_max_valid_item_id(self, model: RecommenderModel) -> int:
        """
        Get the maximum valid item ID for the given model.

        Args:
            model: The recommendation model

        Returns:
            Maximum valid item ID (exclusive, so valid IDs are [0, max_item_id))
        """
        # Wrapped models exposing ``model.item_factors``: use its row count
        if hasattr(model, "model") and model.model is not None:
            if hasattr(model.model, "item_factors"):
                return model.model.item_factors.shape[0]
        # Models carrying a ``total_items`` attribute
        if hasattr(model, "total_items") and model.total_items is not None:
            return model.total_items
        # Fallback to the dataset's item count
        return self.data.num_item

    def get_non_interacted_items_for_recommendation(
        self,
        data: DataReader,
        item_ids: List[Union[str, int]],
        members: List[Union[str, int]],
    ) -> np.ndarray:
        """
        Returns the item IDs that none of the specified group members have interacted with.

        Args:
            data: The original dataset containing user-item interactions.
            item_ids: All available item IDs to consider (assumed unique).
            members: A list of user IDs representing the group.

        Returns:
            np.ndarray: Item IDs not interacted with by any member of the group.
        """
        # Translate member IDs via the reader; members it cannot map
        # (``None``) are dropped.
        mapped_ids = [data.get_new_user_id(int(m)) for m in members]
        mapped_ids = [m for m in mapped_ids if m is not None]

        interacted_item_ids = data.dataset.loc[
            data.dataset.userId.isin(mapped_ids), "itemId"
        ].unique()

        return np.setdiff1d(item_ids, interacted_item_ids, assume_unique=True)

    def generate_recommendation(
        self,
        model: RecommenderModel,
        member: Union[str, int],
        item_pool: List[Union[str, int]],
        data: DataReader,
    ) -> Dict[int, float]:
        """
        Generate recommendations for a user based on the provided model.

        Args:
            model: A recommendation model that implements the RecommenderModel interface
            member: The ID of the user
            item_pool: List of item IDs to predict ratings/scores for
            data: The dataset containing user-item interactions

        Returns:
            A dictionary mapping original item IDs to scores scaled to 1-5,
            ordered by descending score. Empty when the member is unknown
            to ``data`` or no valid item remains.

        Raises:
            TypeError: If ``model.predict`` returns neither a list nor an array.
        """
        member = int(member)
        new_member_id = data.get_new_user_id(member)
        if new_member_id is None:
            return {}  # Return empty predictions for this user

        # Second layer of protection in case the pool was not filtered
        # by the caller.
        max_valid_item_id = self._get_max_valid_item_id(model)
        if isinstance(item_pool, np.ndarray):
            item_pool = item_pool.astype(int)
            item_pool = item_pool[(item_pool >= 0) & (item_pool < max_valid_item_id)]
        elif isinstance(item_pool, list):
            item_pool = [int(item) for item in item_pool if 0 <= int(item) < max_valid_item_id]

        if len(item_pool) == 0:
            print(f"No valid items found for user {new_member_id}. Returning empty predictions.")
            return {}  # Return empty predictions if no valid items

        raw_predictions = model.predict(new_member_id, item_pool)  # type: ignore
        if not isinstance(raw_predictions, (list, np.ndarray)):
            raise TypeError(
                f"Model's predict function returned an unexpected type: {type(raw_predictions)}"
            )

        # Normalize predictions to the 1-5 range
        scaled_linear = Scale.linear(
            np.array(raw_predictions),
            target_min=1,
            target_max=5,
        )

        # Key the scores by original item ID; items the reader cannot map
        # back are skipped.
        predictions = {}
        for item, scaled_pred in zip(item_pool, scaled_linear):
            item_original_id = data.get_original_item_id(int(item))
            if item_original_id is not None:
                predictions[int(item_original_id)] = scaled_pred  # type: ignore

        return dict(
            sorted(predictions.items(), key=lambda item: item[1], reverse=True)
        )

    def get_group_recommendations(
        self, top_k: Optional[int] = None
    ) -> Union[int, List[int]]:
        """
        Final Recommendation List: Get recommendations for the group based on aggregated scores.

        Args:
            top_k: The number of recommendations to return.
                  If None, returns all recommendations sorted by score.
                  If 1, returns only the top recommendation as a single item ID.
                  Otherwise, returns the top k recommendations as a list of item IDs.

        Returns:
            If top_k is 1, a single item ID (None when there are no scores).
            Otherwise, a list of item IDs.

        Raises:
            ValueError: If setup_recommendation has not been called, or
                ``top_k`` is negative.
        """
        if self._aggregated_scores is None:
            raise ValueError(
                "You must call setup_recommendation before getting recommendations"
            )

        # Keys are already ordered best-first by _aggregate_group_scores
        ranked_ids = list(self._aggregated_scores)

        if top_k is None:
            return ranked_ids
        if top_k < 0:
            # A negative bound would slice from the end and silently drop
            # the lowest-ranked items instead of limiting the list.
            raise ValueError("top_k must be a non-negative integer or None")
        if top_k == 1:
            return ranked_ids[0] if ranked_ids else None  # type: ignore
        return ranked_ids[:top_k]

    def get_top_recommendation(self) -> int:
        """
        Get the top recommendation for the group.

        The result is cached until the next setup_recommendation call.

        Returns:
            The item ID with the highest aggregated score across all group members.
        """
        if self._top_recommendation is None:
            self._top_recommendation = self.get_group_recommendations(top_k=1)
        return self._top_recommendation  # type: ignore

    def get_recommendation_scores(self) -> Dict[int, float]:
        """
        Get the aggregated scores for all items across the group.

        Returns:
            A copy of the dictionary mapping item IDs to aggregated scores.

        Raises:
            ValueError: If setup_recommendation has not been called.
        """
        if self._aggregated_scores is None:
            raise ValueError(
                "You must call setup_recommendation before getting recommendation scores"
            )
        return self._aggregated_scores.copy()

    def get_aggregation_strategy(self) -> Optional[AggregationStrategy]:
        """
        Get the current aggregation strategy.

        Returns:
            The aggregation strategy being used, or None if not set.
        """
        return self._aggregation_strategy

    def get_group_members(self) -> Optional[List[Union[str, int]]]:
        """
        Get the current group members.

        Returns:
            A copy of the member ID list, or None if not set (or empty).
        """
        return self._members.copy() if self._members else None

    def get_individual_predictions(
        self,
    ) -> Optional[Dict[Union[str, int], Dict[int, float]]]:
        """
        Get the individual predictions for all group members.

        Returns:
            A shallow copy of the user ID -> predictions mapping, or None if
            not available (or empty).
        """
        return self._group_predictions.copy() if self._group_predictions else None
|
||||
@@ -0,0 +1,57 @@
|
||||
import pandas as pd
|
||||
from typing import Optional
|
||||
|
||||
from .generic_recommender import GenericRecommender
|
||||
|
||||
|
||||
class Recommender(GenericRecommender):
    """Single-user recommender that scores items with ``self.model``."""

    def __init__(self, dataset_metadata, model, top_n: int = 10):
        super().__init__(dataset_metadata, model, top_n)

    def get_predictions(
        self,
        user_id: int,
        target_item_id: list,
    ):
        """
        Ask the model for the scores of the given items.
        :param user_id: int, a user Id
        :param target_item_id: list, item Ids
        :return: whatever ``self.model.predict`` returns for the pair.
        """
        return self.model.predict(user_id, target_item_id)

    def recommend(self, user_id: int, target_item_id: list):
        """
        Score the given items for a user and rank them.
        :param user_id: int, a user Id
        :param target_item_id: list, item Ids
        :return: data.frame [userId, itemId, rank], ranking of the given items for the user.
        """
        scores = self.get_predictions(user_id, target_item_id)
        return self.rank_prediction(user_id, target_item_id, scores)

    def recommend_user(
        self, user_id: Optional[int] = None, user_ratings: Optional[pd.DataFrame] = None
    ):
        """
        Recommend, for one user, among the items they have not rated yet.
        :param user_id: int, a user Id
        :param user_ratings: data.frame with the user's interactions; looked
            up from the dataset when omitted
        :return: dataframe [userId, itemId, rank], recommendations ranking for the specified userId.
        """
        if user_ratings is None:
            if user_id is None:
                raise ValueError("Either 'user_id' or 'user_ratings' must be provided.")
            user_ratings = self.get_rated(user_id=user_id)
            if user_ratings is None:
                # Nothing to base a recommendation on: empty result.
                return pd.DataFrame(columns=["userId", "itemId", "rank"])

        if user_id is None:
            raise ValueError(
                "Could not determine user_id from the provided user_ratings."
            )

        candidates = self.get_unrated(user_ratings["itemId"])
        return self.recommend(user_id=user_id, target_item_id=candidates)
|
||||
@@ -0,0 +1,17 @@
|
||||
from .aggregation_strategy import AggregationStrategy
|
||||
from .association_rules import AssociationRules
|
||||
from .scale import Scale
|
||||
from .sliding_window import SlidingWindow
|
||||
from .emp_loss import EMFLoss
|
||||
from .explanation_diversity import calculate_gild_for_explanations
|
||||
from .sliding_window_ranker import SlidingWindowRanker
|
||||
|
||||
__all__ = [
|
||||
"AggregationStrategy",
|
||||
"AssociationRules",
|
||||
"Scale",
|
||||
"EMFLoss",
|
||||
"calculate_gild_for_explanations",
|
||||
"SlidingWindowRanker",
|
||||
"SlidingWindow",
|
||||
]
|
||||
@@ -0,0 +1,210 @@
|
||||
import numpy as np
|
||||
from typing import Dict, List, Union, Optional, TypeAlias
|
||||
from enum import Enum
|
||||
|
||||
# Type aliases for better readability
|
||||
UserID: TypeAlias = Union[str, int]
|
||||
ItemID: TypeAlias = Union[str, int]
|
||||
EvaluationScore: TypeAlias = float
|
||||
AggregatedScore: TypeAlias = float
|
||||
|
||||
# Main data structure types
|
||||
UserEvaluations: TypeAlias = Dict[UserID, Dict[ItemID, EvaluationScore]]
|
||||
UserRankings: TypeAlias = Dict[UserID, List[ItemID]]
|
||||
AggregatedScores: TypeAlias = Dict[ItemID, AggregatedScore]
|
||||
|
||||
|
||||
class AggregationStrategy(Enum):
|
||||
"""Enumeration of available aggregation strategies."""
|
||||
|
||||
# Individual Predictions
|
||||
AVG_PREDICTIONS = "avg_predictions"
|
||||
LEAST_MISERY = "least_misery"
|
||||
MOST_PLEASURE = "most_pleasure"
|
||||
MOST_RESPECTED_PERSON = "most_respected_person"
|
||||
|
||||
# Individual Preferences
|
||||
ADDITIVE_UTILITARIAN = "additive_utilitarian"
|
||||
MULTIPLICATIVE = "multiplicative"
|
||||
BORDA_COUNT = "borda_count"
|
||||
|
||||
|
||||
class ScoreAggregator:
|
||||
"""
|
||||
A class for aggregating individual predictions or preferences into collective scores.
|
||||
|
||||
Supports two main approaches:
|
||||
1. Individual Predictions: AVG, LM, MP, MRP
|
||||
2. Individual Preferences: AVG, ADD, MUL, BRC
|
||||
|
||||
Felfernig, A., Boratto, L., Stettinger, M., Tkali, M.: Group Recommender Systems:
|
||||
An Introduction. Springer Publishing Company, Incorporated, 1st edn. (2018)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, most_respected_person: Optional[UserID] = None):
|
||||
"""
|
||||
Initialize the ScoreAggregator.
|
||||
|
||||
Args:
|
||||
most_respected_person: User ID of the most respected person (required for MRP strategy)
|
||||
"""
|
||||
self.most_respected_person = most_respected_person
|
||||
|
||||
def aggregate_scores(
|
||||
self,
|
||||
evaluations: UserEvaluations,
|
||||
strategy: AggregationStrategy,
|
||||
rankings: Optional[UserRankings] = None,
|
||||
) -> AggregatedScores:
|
||||
"""
|
||||
Aggregate individual evaluations into collective scores.
|
||||
|
||||
Args:
|
||||
evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
|
||||
strategy: Aggregation strategy to use
|
||||
rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)
|
||||
|
||||
Returns:
|
||||
Dictionary mapping item_id -> aggregated_score
|
||||
"""
|
||||
if not evaluations:
|
||||
return {}
|
||||
|
||||
# Get all items across all users
|
||||
all_items: set[ItemID] = set()
|
||||
for user_evals in evaluations.values():
|
||||
all_items.update(user_evals.keys())
|
||||
|
||||
result: AggregatedScores = {}
|
||||
|
||||
for item in all_items:
|
||||
if strategy == AggregationStrategy.AVG_PREDICTIONS:
|
||||
result[item] = self._avg_predictions(evaluations, item)
|
||||
elif strategy == AggregationStrategy.LEAST_MISERY:
|
||||
result[item] = self._least_misery(evaluations, item)
|
||||
elif strategy == AggregationStrategy.MOST_PLEASURE:
|
||||
result[item] = self._most_pleasure(evaluations, item)
|
||||
elif strategy == AggregationStrategy.MOST_RESPECTED_PERSON:
|
||||
result[item] = self._most_respected_person(evaluations, item)
|
||||
elif strategy == AggregationStrategy.ADDITIVE_UTILITARIAN:
|
||||
result[item] = self._additive_utilitarian(evaluations, item)
|
||||
elif strategy == AggregationStrategy.MULTIPLICATIVE:
|
||||
result[item] = self._multiplicative(evaluations, item)
|
||||
elif strategy == AggregationStrategy.BORDA_COUNT:
|
||||
if rankings is None:
|
||||
raise ValueError("Rankings required for Borda Count strategy")
|
||||
result[item] = self._borda_count(rankings, item)
|
||||
else:
|
||||
raise ValueError(f"Unknown aggregation strategy: {strategy}")
|
||||
|
||||
return result
|
||||
|
||||
def get_top_recommendation(
|
||||
self,
|
||||
evaluations: UserEvaluations,
|
||||
strategy: AggregationStrategy,
|
||||
rankings: Optional[UserRankings] = None,
|
||||
) -> ItemID:
|
||||
"""
|
||||
Get the top recommended item based on aggregated scores.
|
||||
|
||||
Args:
|
||||
evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
|
||||
strategy: Aggregation strategy to use
|
||||
rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)
|
||||
|
||||
Returns:
|
||||
Item ID with highest aggregated score
|
||||
"""
|
||||
aggregated_scores = self.aggregate_scores(evaluations, strategy, rankings)
|
||||
return max(aggregated_scores.items(), key=lambda x: x[1])[0]
|
||||
|
||||
def _avg_predictions(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Average of item-specific evaluations."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return np.mean(item_evals) if item_evals else 0.0 # type: ignore
|
||||
|
||||
def _least_misery(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Minimum item-specific evaluation."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return min(item_evals) if item_evals else 0.0
|
||||
|
||||
def _most_pleasure(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Maximum item-specific evaluation."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return max(item_evals) if item_evals else 0.0
|
||||
|
||||
def _most_respected_person(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Item-evaluations of most respected user."""
|
||||
if self.most_respected_person is None:
|
||||
raise ValueError("Most respected person not specified")
|
||||
if self.most_respected_person not in evaluations:
|
||||
raise ValueError(
|
||||
f"Most respected person '{self.most_respected_person}' not found in evaluations"
|
||||
)
|
||||
return evaluations[self.most_respected_person].get(item, 0.0)
|
||||
|
||||
def _avg_preferences(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Average of item-specific evaluations (same as avg_predictions)."""
|
||||
return self._avg_predictions(evaluations, item)
|
||||
|
||||
def _additive_utilitarian(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Sum of item-specific evaluations."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return sum(item_evals)
|
||||
|
||||
def _multiplicative(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Multiplication of item-specific evaluations."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
if not item_evals:
|
||||
return 0.0
|
||||
result = 1.0
|
||||
for eval_score in item_evals:
|
||||
result *= eval_score
|
||||
return result
|
||||
|
||||
def _borda_count(self, rankings: UserRankings, item: ItemID) -> AggregatedScore:
|
||||
"""Sum of item-specific scores derived from item ranking."""
|
||||
total_score = 0.0
|
||||
for user_ranking in rankings.values():
|
||||
if item in user_ranking:
|
||||
# Score is based on position in ranking (higher position = higher score)
|
||||
position = user_ranking.index(item)
|
||||
score = len(user_ranking) - position - 1 # Reverse position for score
|
||||
total_score += score
|
||||
return total_score
|
||||
@@ -0,0 +1,255 @@
|
||||
from mlxtend.preprocessing import TransactionEncoder
|
||||
from mlxtend.frequent_patterns import fpgrowth, association_rules
|
||||
import pandas as pd
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
from typing import List, Optional, Union
|
||||
|
||||
|
||||
class AssociationRules:
    """
    A class to represent association rules mining for recommendation systems.

    This class implements association rules mining using the FP-Growth algorithm
    to discover frequent itemsets and generate association rules from user-item
    interaction data. It can be used to find patterns in user behavior and
    generate item recommendations based on item associations.
    """

    def __init__(
        self,
        data: DataReader,
        min_support: float = 0.2,
        min_confidence: float = 0.2,
        rating_threshold: float = 4.0,
    ) -> None:
        """Initialize the association rules miner with data and parameters.

        Args:
            data: The DataReader object containing user-item interactions with ratings.
            min_support: Minimum support threshold for frequent itemsets.
                Must be greater than 0 and at most 1. Default is 0.2.
            min_confidence: Minimum confidence threshold for association rules.
                Must be greater than 0 and at most 1. Default is 0.2.
            rating_threshold: Minimum rating threshold to consider an interaction
                as positive. Must be non-negative. Default is 4.0.

        Raises:
            ValueError: If support, confidence, or rating_threshold values are invalid.
        """
        self._validate_parameters(min_support, min_confidence, rating_threshold)

        self.data = data
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.rating_threshold = rating_threshold
        # Both caches stay None until compute() has run successfully.
        self._frequent_itemsets: Optional[pd.DataFrame] = None
        self._association_rules: Optional[pd.DataFrame] = None

    def _validate_parameters(
        self, min_support: float, min_confidence: float, rating_threshold: float
    ) -> None:
        """Validate initialization parameters.

        Args:
            min_support: Minimum support threshold to validate (0 < value <= 1).
            min_confidence: Minimum confidence threshold to validate (0 < value <= 1).
            rating_threshold: Rating threshold to validate (value >= 0).

        Raises:
            ValueError: If any parameter is invalid.
        """
        if not (0 < min_support <= 1):
            raise ValueError("min_support must be between 0 and 1")
        if not (0 < min_confidence <= 1):
            raise ValueError("min_confidence must be between 0 and 1")
        if rating_threshold < 0:
            raise ValueError("rating_threshold must be non-negative")

    def get_df_filtered_by_rating_threshold(self) -> pd.DataFrame:
        """Return the interactions whose rating reaches ``rating_threshold``.

        Returns:
            A new DataFrame holding the rows of ``self.data.dataset`` with
            ``rating >= self.rating_threshold``.

        Raises:
            ValueError: If no interaction reaches the threshold.
        """
        dataset = self.data.dataset
        # Boolean-mask indexing already yields a new frame, so the source
        # dataset does not need to be copied first.
        df_filtered = dataset[dataset["rating"] >= self.rating_threshold]

        if df_filtered.empty:
            raise ValueError(
                f"No interactions found with rating >= {self.rating_threshold}"
            )
        return df_filtered

    def _prepare_transactions(self) -> List[List[str]]:
        """Prepare transaction data from the dataset.

        Filters the dataset based on rating threshold and groups items
        by user to create transaction lists.

        Returns:
            A list of transactions, where each transaction is a list of item IDs
            (converted to strings) that a user has positively interacted with.

        Raises:
            ValueError: If no interaction reaches the rating threshold.
        """
        df_filtered = self.get_df_filtered_by_rating_threshold()

        # Group items by user to create transactions
        grouped_items = df_filtered.groupby("userId")["itemId"].apply(list).tolist()

        # Convert item IDs to strings for consistency
        return [[str(item) for item in transaction] for transaction in grouped_items]

    def _mine_frequent_itemsets(
        self, transactions: List[List[Union[str, int]]]
    ) -> pd.DataFrame:
        """Mine frequent itemsets using FP-Growth algorithm.

        Args:
            transactions: List of transactions to mine frequent itemsets from.

        Returns:
            DataFrame containing frequent itemsets with their support values.

        Raises:
            ValueError: If no frequent itemsets are found.
        """
        # Encode transactions into binary matrix
        transaction_encoder = TransactionEncoder()
        transaction_matrix = transaction_encoder.fit_transform(transactions)

        df_encoded = pd.DataFrame(
            transaction_matrix,  # type: ignore
            columns=transaction_encoder.columns_,
        )

        # Apply FP-Growth to find frequent itemsets
        frequent_itemsets = fpgrowth(
            df_encoded, min_support=self.min_support, use_colnames=True
        )

        if frequent_itemsets.empty:
            raise ValueError(
                f"No frequent itemsets found with min_support={self.min_support}"
            )
        return frequent_itemsets

    def _generate_association_rules(
        self, frequent_itemsets: pd.DataFrame
    ) -> pd.DataFrame:
        """Generate association rules from frequent itemsets.

        Args:
            frequent_itemsets: DataFrame containing frequent itemsets.

        Returns:
            DataFrame containing association rules with their metrics.

        Raises:
            ValueError: If no association rules are found.
        """
        rules = association_rules(
            frequent_itemsets, metric="confidence", min_threshold=self.min_confidence
        )

        if rules.empty:
            raise ValueError(
                f"No association rules found with min_confidence={self.min_confidence}"
            )
        return rules

    def compute(self) -> pd.DataFrame:
        """Compute association rules from the dataset.

        This method performs the complete association rules mining process:
        1. Prepares transactions from the dataset
        2. Mines frequent itemsets using FP-Growth
        3. Generates association rules from frequent itemsets

        The frequent itemsets and the rules are cached on the instance.

        Returns:
            DataFrame containing association rules with metrics including
            antecedents, consequents, support, confidence, lift, etc.

        Raises:
            ValueError: If the dataset is empty, no transactions meet the
                criteria, or no rules can be generated with the given parameters.
        """
        if self.data.dataset.empty:
            raise ValueError("Dataset is empty")

        # Prepare transactions
        transactions = self._prepare_transactions()
        if not transactions:
            raise ValueError("No transactions found after filtering")

        # Mine frequent itemsets
        self._frequent_itemsets = self._mine_frequent_itemsets(transactions)  # type: ignore

        # Generate association rules
        self._association_rules = self._generate_association_rules(
            self._frequent_itemsets
        )
        return self._association_rules

    def get_frequent_itemsets(self) -> Optional[pd.DataFrame]:
        """Get the computed frequent itemsets.

        Returns:
            DataFrame containing frequent itemsets if compute() has been called,
            None otherwise.
        """
        return self._frequent_itemsets

    def get_recommendations_for_items(
        self, items: List[Union[str, int]], top_k: int = 10
    ) -> pd.DataFrame:
        """Get item recommendations based on association rules.

        Args:
            items: List of item IDs to get recommendations for. IDs are
                compared by their string form.
            top_k: Maximum number of recommendations to return. Default is 10.

        Returns:
            DataFrame with columns antecedents, consequents, confidence, lift
            and support, holding the ``top_k`` matching rules with the highest
            confidence. The frame is empty, with the same columns, when no
            rule matches.

        Raises:
            RuntimeError: If compute() hasn't been called yet.
            ValueError: If items list is empty.
        """
        if self._association_rules is None:
            raise RuntimeError("Must call compute() before getting recommendations")
        if not items:
            raise ValueError("Items list cannot be empty")

        result_columns = ["antecedents", "consequents", "confidence", "lift", "support"]
        requested = {str(item) for item in items}
        rules = self._association_rules

        # NOTE(review): a rule matches only when every requested item appears
        # in its antecedents (requested is a subset of antecedents). The usual
        # basket-style lookup is the reverse direction; confirm this is intended.
        is_match = rules["antecedents"].apply(
            lambda antecedents: requested.issubset(str(item) for item in antecedents)
        )
        matching_rules = rules[is_match]

        if matching_rules.empty:
            # Keep the schema identical to the non-empty result.
            return pd.DataFrame(columns=result_columns)

        # Sort by confidence and return top_k recommendations
        return matching_rules.nlargest(top_k, "confidence")[result_columns]

    def __str__(self) -> str:
        """Return string representation of the AssociationRules object."""
        return (
            f"AssociationRules(min_support={self.min_support}, "
            f"min_confidence={self.min_confidence}, "
            f"rating_threshold={self.rating_threshold})"
        )

    def __repr__(self) -> str:
        """Return detailed string representation of the AssociationRules object."""
        return self.__str__()
|
||||
@@ -0,0 +1,17 @@
|
||||
import torch
|
||||
|
||||
|
||||
class EMFLoss(torch.nn.Module):
    """Squared rating error plus embedding regularisation and an explanation term.

    Every term is computed per sample and the mean over all samples is returned.
    """

    def __init__(self):
        super().__init__()

    def forward(self, ratings_pred, ratings, u, v, reg_term, expl, expl_reg_term):
        """Return the mean of the per-sample loss terms.

        Args:
            ratings_pred: Predicted ratings; flattened to one dimension before
                being compared with ``ratings``.
            ratings: Target ratings.
            u: First embedding tensor; norms are taken over its last dimension.
                (Presumably user factors - confirm against the caller.)
            v: Second embedding tensor, same treatment as ``u``.
                (Presumably item factors - confirm against the caller.)
            reg_term: Weight applied to the L2 norm of ``u`` and of ``v``.
            expl: Factor multiplied with the L1 distance between ``u`` and ``v``.
                (Presumably a per-sample explainability score - confirm.)
            expl_reg_term: Weight applied to the explanation term.

        Returns:
            Scalar tensor: mean of squared error + both L2 terms + explanation term.
        """
        squared_error = (ratings - ratings_pred.view(-1)) ** 2
        u_penalty = reg_term * torch.norm(u, p=2, dim=-1)
        v_penalty = reg_term * torch.norm(v, p=2, dim=-1)
        # L1 distance between the two embeddings, scaled by ``expl``.
        explanation_penalty = expl_reg_term * torch.norm(u - v, p=1, dim=-1) * expl

        per_sample = squared_error + u_penalty + v_penalty + explanation_penalty
        return per_sample.mean()
|
||||
@@ -0,0 +1,80 @@
|
||||
from itertools import combinations
|
||||
import numpy as np
|
||||
|
||||
|
||||
def _get_explanation_feature_set(explanation, explainer_type, details=None):
|
||||
"""Helper to extract a consistent feature set from different explanation types."""
|
||||
if explainer_type == "Sliding Window":
|
||||
return set(explanation.get("items", []))
|
||||
elif explainer_type == "EXPGRS":
|
||||
if details is not None:
|
||||
return set(details.get("antecedent", frozenset()))
|
||||
else:
|
||||
return set()
|
||||
elif explainer_type == "LORE4Groups":
|
||||
rules_data = explanation.get("group_factual_rule", {})
|
||||
if isinstance(rules_data, dict):
|
||||
return set(
|
||||
rule for tier_rules in rules_data.values() for rule in tier_rules
|
||||
)
|
||||
elif isinstance(rules_data, list):
|
||||
return set(rules_data)
|
||||
return set()
|
||||
|
||||
|
||||
def calculate_gild_for_explanations(explanations_dict, explainer_type, use_median=True):
    """Calculate Gaussian Inter-List Diversity (GILD) for a set of explanations.

    Each explanation is reduced to a feature set, pairwise Jaccard distances
    between the non-empty sets are computed, and every distance ``d`` is
    mapped to ``sqrt(2 - 2 * exp(-d**2 / (2 * sigma**2)))``. The result is the
    mean of those values.

    Args:
        explanations_dict: Mapping whose values are the explanations. For
            ``"EXPGRS"`` each value is a sequence of rules and only its first
            element is used; empty sequences are skipped.
        explainer_type: ``"EXPGRS"``, ``"Sliding Window"`` or ``"LORE4Groups"``.
            Any other value produces no feature sets.
        use_median: Use the median pairwise distance as the reference distance
            for sigma; otherwise the minimum distance is used.

    Returns:
        The GILD value, or ``0.0`` when fewer than two explanations, or fewer
        than two non-empty feature sets, are available.
    """
    if not explanations_dict or len(explanations_dict) < 2:
        return 0.0

    if explainer_type == "EXPGRS":
        candidate_sets = [
            _get_explanation_feature_set(None, explainer_type, details=rules[0])
            for rules in explanations_dict.values()
            if rules
        ]
    elif explainer_type in ("Sliding Window", "LORE4Groups"):
        candidate_sets = [
            _get_explanation_feature_set(payload, explainer_type)
            for payload in explanations_dict.values()
        ]
    else:
        candidate_sets = []

    # Empty feature sets carry no information and are left out.
    feature_sets = [features for features in candidate_sets if features]
    if len(feature_sets) < 2:
        return 0.0

    # Calculate pairwise Jaccard distances
    distances = []
    for left, right in combinations(feature_sets, 2):
        union_size = len(left | right)
        if union_size > 0:
            distances.append(1.0 - len(left & right) / union_size)
        else:
            distances.append(1.0)

    if not distances:
        return 0.0

    # Sigma: reference distance divided by sqrt(2 * ln(pairs - 1)).
    # NOTE(review): described upstream as the paper's formula - confirm
    # against the GILD reference.
    pair_count = len(distances)
    reference = np.median(distances) if use_median else min(distances)
    if pair_count > 1:
        denominator = np.sqrt(2 * np.log(pair_count - 1))
    else:
        denominator = 1.0
    sigma = reference / denominator if denominator > 0 else reference
    if sigma == 0:
        # Identical feature sets everywhere: avoid dividing by zero below.
        sigma = 1e-9

    kernel_total = 0.0
    for distance in distances:
        kernel_total += np.sqrt(2 - 2 * np.exp(-(distance**2) / (2 * sigma**2)))

    return kernel_total / pair_count
|
||||
@@ -0,0 +1,138 @@
|
||||
from typing import List, Union, Optional
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
|
||||
class Scale:
    """
    A class for scaling numerical values using different methods.

    Methods:
        quantile: Scale values using quantile-based ranking.
        linear: Scale values linearly to a target range with outlier handling.
    """

    @staticmethod
    def quantile(
        raw_predictions: Union[List[float], np.ndarray],
        target_min: float = 1,
        target_max: float = 5,
    ) -> np.ndarray:
        """
        Scale raw predictions to the target range using quantile-based ranking.

        Values are replaced by their average rank, and ranks are spread evenly
        over [target_min, target_max]. A single value maps to the middle of
        the target range.

        Args:
            raw_predictions: The raw prediction values.
            target_min: Minimum of the target range (default: 1).
            target_max: Maximum of the target range (default: 5).

        Returns:
            numpy.ndarray: Scaled predictions.

        Raises:
            ValueError: If raw_predictions is empty.
        """
        if len(raw_predictions) == 0:
            raise ValueError("Raw predictions array is empty.")

        # The input is only read, so no copy is needed.
        values = np.asarray(raw_predictions)

        if len(values) == 1:
            # A single value has no rank spread; use the midpoint.
            scaled_predictions = np.array([(target_min + target_max) / 2])
        else:
            ranks = stats.rankdata(values, method="average")
            scaled_predictions = target_min + (ranks - 1) * (
                target_max - target_min
            ) / (len(values) - 1)

        # Ensure scaled predictions are within [target_min, target_max]
        return np.clip(scaled_predictions, target_min, target_max)

    @staticmethod
    def linear(
        raw_predictions: Union[List[float], np.ndarray],
        target_min: float = 1,
        target_max: float = 5,
        ref_min: Optional[float] = None,
        ref_max: Optional[float] = None,
        handle_outliers: bool = True,
    ) -> np.ndarray:
        """
        Scale raw predictions to the target range [target_min, target_max].

        Args:
            raw_predictions: The raw prediction values.
            target_min: Minimum of the target range (default: 1).
            target_max: Maximum of the target range (default: 5).
            ref_min: Reference minimum for raw predictions. If None, will be calculated
                from the data or from outlier bounds if handle_outliers=True.
            ref_max: Reference maximum for raw predictions. If None, will be calculated
                from the data or from outlier bounds if handle_outliers=True.
            handle_outliers: Whether to handle outliers using IQR method (default: True).
                Outlier handling only narrows the bounds inferred from the data;
                values beyond the reference range end up on the target bounds
                through the final clip.

        Returns:
            numpy.ndarray: Scaled predictions as floats. When the reference
            bounds coincide, or a single value is given without both reference
            bounds, every entry is the middle of the target range.

        Raises:
            ValueError: If raw_predictions is empty.
        """
        if len(raw_predictions) == 0:
            raise ValueError("Raw predictions array is empty.")

        values = np.asarray(raw_predictions, dtype=float)
        midpoint = (target_min + target_max) / 2
        target_span = target_max - target_min

        # Handle single element case
        if len(values) == 1:
            if ref_min is None or ref_max is None:
                # Can't determine range from single value, return middle of target range
                return np.array([midpoint])
            if ref_max == ref_min:
                scaled_value = midpoint
            else:
                # Scale based on provided reference range
                scaled_value = target_min + (values[0] - ref_min) * target_span / (
                    ref_max - ref_min
                )
            return np.array([np.clip(scaled_value, target_min, target_max)])

        # Handle outliers if requested: pull values outside 1.5 * IQR onto the
        # fences before reading the data bounds.
        bounds_source = values
        if handle_outliers:
            q1, q3 = np.percentile(values, [25, 75])
            iqr = q3 - q1
            bounds_source = np.clip(values, q1 - 1.5 * iqr, q3 + 1.5 * iqr)

        # Use provided reference bounds if given, otherwise use data bounds
        actual_ref_min = ref_min if ref_min is not None else np.min(bounds_source)
        actual_ref_max = ref_max if ref_max is not None else np.max(bounds_source)

        if actual_ref_max == actual_ref_min:
            # Reference bounds are equal, return the middle of the target range.
            # Built as a float array so the midpoint is never truncated to the
            # input's integer dtype.
            return np.full(values.shape, midpoint, dtype=float)

        scaled_predictions = target_min + (values - actual_ref_min) * target_span / (
            actual_ref_max - actual_ref_min
        )

        # Ensure scaled predictions are within [target_min, target_max]
        return np.clip(scaled_predictions, target_min, target_max)
|
||||
@@ -0,0 +1,90 @@
|
||||
from typing import List, Optional, TypeVar, Generic, Iterator
|
||||
|
||||
T = TypeVar("T")


class SlidingWindow(Generic[T]):
    """Fixed-size window that advances one position at a time over a sequence.

    The instance is its own iterator: the current position lives in
    ``self.index`` and is shared by ``get_next_window``, ``has_next`` and
    iteration. Starting a new iteration rewinds to the first window.
    """

    def __init__(self, sequence: List[T], window_size: int):
        """Set up the window over ``sequence``.

        Args:
            sequence: The items to slide over
            window_size: Number of items per window (must be positive)

        Raises:
            ValueError: If window_size is less than 1
            TypeError: If sequence has no ``__iter__`` attribute
        """
        if window_size < 1:
            raise ValueError("Window size must be at least 1")
        if not hasattr(sequence, "__iter__"):
            raise TypeError("Sequence must be iterable")

        self.sequence = sequence
        self.window_size = window_size
        self.index = 0
        # Number of start positions that yield a complete window; this is
        # non-positive when the window is longer than the sequence.
        if sequence:
            self.max_index = len(sequence) - window_size + 1
        else:
            self.max_index = 0

    def get_next_window(self) -> Optional[List[T]]:
        """Return the window at the current position and move one step forward.

        Returns:
            The next window as a slice of the sequence, or None once every
            complete window has been returned.
        """
        start = self.index
        if start >= self.max_index:
            return None
        self.index = start + 1
        return self.sequence[start : start + self.window_size]

    def reset(self) -> None:
        """Move the position back to the first window."""
        self.index = 0

    def has_next(self) -> bool:
        """Tell whether another complete window is available.

        Returns:
            True if ``get_next_window`` would return a window, False otherwise.
        """
        return self.max_index > self.index

    def __iter__(self) -> Iterator[List[T]]:
        """Rewind to the first window and return this object as the iterator.

        Returns:
            This instance, positioned at the first window.
        """
        self.reset()
        return self

    def __next__(self) -> List[T]:
        """Return the next window during iteration.

        Returns:
            The next window as a list.

        Raises:
            StopIteration: When every window has been returned.
        """
        upcoming = self.get_next_window()
        if upcoming is None:
            raise StopIteration
        return upcoming

    def __len__(self) -> int:
        """Return the total number of complete windows.

        Returns:
            The window count, never negative.
        """
        return self.max_index if self.max_index > 0 else 0
|
||||
@@ -0,0 +1,631 @@
|
||||
import operator
|
||||
from typing import Any, Dict, List, Union, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.signal import (
|
||||
find_peaks,
|
||||
peak_widths,
|
||||
)
|
||||
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
class SlidingWindowRanker:
|
||||
"""
|
||||
Stratigi, M., Bikakis, N., Stefanidis, K.: Counterfactual explanations for group
|
||||
recommendations. In: Proceedings of the 27th International Workshop on Design,
|
||||
Optimization, Languages and Analytical Processing of Big Data (DOLAP 2025)
|
||||
"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
|
||||
"""
|
||||
Initialize the SlidingWindowRanker.
|
||||
|
||||
Args:
|
||||
config: Configuration parameters for the evaluator
|
||||
"""
|
||||
self.config = config
|
||||
self.group_predictions: Optional[
|
||||
Dict[Union[str, int], Dict[Union[str, int], float]]
|
||||
] = None
|
||||
self.top_recommendation: Optional[Union[str, int]] = None
|
||||
|
||||
def set_group_recommender_values(
|
||||
self,
|
||||
group_predictions: Dict[Union[str, int], Dict[Union[str, int], float]],
|
||||
top_recommendation: Union[str, int],
|
||||
) -> None:
|
||||
"""
|
||||
Set group recommender values.
|
||||
|
||||
Args:
|
||||
group_predictions: Dictionary mapping user IDs to their item predictions
|
||||
top_recommendation: List of top recommended items for the group
|
||||
"""
|
||||
self.group_predictions = group_predictions
|
||||
self.top_recommendation = top_recommendation
|
||||
|
||||
def evaluate(self, data: DataReader) -> Dict[str, Any]:
    """
    Placeholder for the evaluation step; no metric is computed yet.

    Args:
        data: DataReader object containing dataset and transformation methods
            (currently unused)

    Returns:
        Dictionary with evaluation metrics; always empty for now
    """
    # Implementation would go here
    metrics: Dict[str, Any] = {}
    return metrics
|
||||
|
||||
def calculate_item_popularity_score(
    self, items: List[Union[str, int]], data: DataReader
) -> Dict[Union[str, int], float]:
    """
    Calculate the normalized popularity of each item based on the number of interactions received.

    Popularity is the number of rows in ``data.dataset`` whose ``itemId``
    equals the item's internal ID (``data.get_new_item_id``). Counts are
    min-max normalized over the requested items, with the range widened by
    1% at each end so that no item lands exactly on 0 or 1. When every item
    has the same count, all scores are 0.0.

    Args:
        items: List of item IDs
        data: Data object containing the dataset and transformation methods

    Returns:
        Dictionary with item IDs as keys and normalized popularity as values;
        empty when ``items`` is empty
    """
    # Count the interactions of every item in a single pass over the column
    # instead of filtering the whole dataset once per requested item.
    interaction_counts = data.dataset["itemId"].value_counts().to_dict()
    popularity_counts = {
        item_id: interaction_counts.get(data.get_new_item_id(item_id), 0)
        for item_id in items
    }
    if not popularity_counts:
        return {}

    # Find min and max values for normalization
    min_count = min(popularity_counts.values())
    max_count = max(popularity_counts.values())

    # Widen the range by 2% in total: 1% below the minimum, 1% above the maximum
    range_value = max_count - min_count
    padded_range = range_value + range_value / 50
    padded_min = min_count - range_value / 100

    if padded_range == 0:
        padded_range = 1  # Avoid division by zero

    # Normalize popularity values
    return {
        item_id: (count - padded_min) / padded_range
        for item_id, count in popularity_counts.items()
    }
|
||||
|
||||
def calculate_relevance_mask(
    self,
    target_item_id: Union[str, int],
) -> Dict[Union[str, int], float]:
    """
    Build a per-user lookup of prediction scores for one target item.

    Args:
        target_item_id: The ID of the item for which prediction scores are needed

    Returns:
        Dictionary mapping each user ID in ``self.group_predictions`` to that
        user's predicted score for the target item. Users without a prediction
        for the target item get the value 0.

    Raises:
        ValueError: If ``self.group_predictions`` is None.

    Examples:
        >>> user_preds = {'user1': {'item1': 4.5, 'item2': 3.2}, 'user2': {'item2': 2.8}}
        >>> evaluator.set_group_recommender_values(user_preds, top_recommendation)
        >>> evaluator.calculate_relevance_mask('item1')
        {'user1': 4.5, 'user2': 0}
    """
    per_user_predictions = self.group_predictions
    if per_user_predictions is None:
        raise ValueError(
            "User predictions not set. Call set_group_recommender_values first."
        )

    # A missing prediction for the target item falls back to 0.
    return {
        member: member_scores.get(target_item_id, 0)
        for member, member_scores in per_user_predictions.items()
    }
|
||||
|
||||
def calculate_relevance_score(
    self,
    item_id: Union[str, int],
    data: DataReader,
    prediction_scores: Dict[Union[str, int], float],
    members: List[Union[str, int]],
    rating_scale: tuple = (0, 5),  # Default rating scale
) -> float:
    """
    Calculate the normalized average prediction score for an item based on group members' predictions.

    Only members who have at least one row for the item in ``data.dataset``
    and who have an entry in ``prediction_scores`` contribute to the average.

    Args:
        item_id: ID of the item to calculate relevance for
        data: DataReader object containing dataset and ID mapping methods
        prediction_scores: Dictionary mapping user IDs (as given in
            ``members``) to their prediction score
        members: List of user IDs in the group
        rating_scale: Tuple indicating (min_rating, max_rating) for normalization

    Returns:
        The average prediction score, min-max normalized against
        ``rating_scale`` widened by 1% on each end. Returns 0.0 if no
        contributing member is found or if the rating scale has zero width.
    """
    internal_item_id = data.get_new_item_id(item_id)

    # The item filter does not depend on the member, so apply it once here
    # rather than re-scanning the full dataset for every member.
    item_user_ids = data.dataset.loc[
        data.dataset["itemId"] == internal_item_id, "userId"
    ]

    total_score = 0.0
    valid_users_count = 0

    for user_id in members:
        # Only integer IDs are mapped to the internal format; any other
        # type is compared against the dataset as-is.
        internal_user_id = (
            data.get_new_user_id(int(user_id))
            if isinstance(user_id, (int, np.integer))
            else user_id
        )

        # Skip members who never interacted with the item
        if not (item_user_ids == internal_user_id).any():
            continue

        # Predictions are keyed by the external (caller-facing) user ID
        if user_id in prediction_scores:
            total_score += prediction_scores[user_id]
            valid_users_count += 1

    if valid_users_count == 0:
        return 0.0

    average_score = total_score / valid_users_count

    # Normalize against the rating scale widened by 2% in total (1% per end)
    min_value, max_value = rating_scale
    range_value = max_value - min_value
    padded_range = range_value + range_value / 50
    padded_min = min_value - range_value / 100

    if padded_range == 0:
        return 0.0

    return float((average_score - padded_min) / padded_range)
|
||||
|
||||
def calculate_item_intensity_score(
    self, item_id: Union[str, int], members: List[Union[str, int]], data: DataReader
) -> float:
    """
    Calculate what proportion of group members have interacted with the specified item.

    Args:
        item_id: ID of the item to calculate interaction rate for
        members: List of user IDs in the group
        data: DataReader object containing dataset and ID mapping methods

    Returns:
        Proportion of group members who have interacted with the item (range [0,1])
        0 means no group members have interacted with the item
        1 means all group members have interacted with the item
        Returns 0.0 when ``data`` is None or ``members`` is empty.
    """
    if data is None:
        print("Error: DataReader object is None. Cannot convert item_id.")
        # Always return a float: callers combine this score arithmetically.
        return 0.0

    if not members:
        return 0.0  # Avoid division by zero if no members

    # Convert item and user IDs to the dataset's internal format
    internal_item_id = data.get_new_item_id(item_id)
    internal_members = [data.get_new_user_id(user_id) for user_id in members]

    # Count members, not rows: a user with several rows for the item must
    # count once, otherwise the rate could exceed 1.
    dataset = data.dataset
    users_with_item = set(dataset.loc[dataset["itemId"] == internal_item_id, "userId"])
    interacting_members = sum(
        1 for internal_user_id in internal_members if internal_user_id in users_with_item
    )

    return interacting_members / len(members)
|
||||
|
||||
def calculate_rating_score(
    self,
    item_id: Union[str, int],
    members: List[Union[str, int]],
    data: DataReader,
    rating_scale: tuple = (0, 5),
) -> float:
    """
    Compute the group's normalized mean rating for a single item.

    Args:
        item_id: ID of the item to calculate average rating for
        members: List of user IDs in the group
        data: DataReader object containing dataset and ID mapping methods
        rating_scale: Tuple indicating (min_rating, max_rating) for normalization

    Returns:
        The mean rating, min-max normalized against ``rating_scale`` widened
        by 1% on each end. Returns 0.0 when ``data`` is None or the scale has
        zero width, and 0 when ``members`` is empty.

    Notes:
        - The rating sum is divided by the full group size, so members who
          have not rated the item pull the mean down.
    """
    if data is None:
        print("Error: DataReader object is None. Cannot convert item_id.")
        return 0.0

    # Translate external IDs into the dataset's internal IDs
    mapped_item = data.get_new_item_id(item_id)
    mapped_users = [data.get_new_user_id(member) for member in members]

    # Rows where a group member rated this item
    is_target_item = data.dataset.itemId == mapped_item
    is_group_member = data.dataset.userId.isin(mapped_users)
    group_ratings = data.dataset[is_target_item & is_group_member]

    group_size = len(members)
    if group_size == 0:
        return 0  # Avoid division by zero if no members

    mean_over_group = group_ratings["rating"].sum() / group_size

    # Widen the scale by 2% in total (1% on each end) before normalizing
    lowest, highest = rating_scale
    span = highest - lowest
    span_with_padding = span + span / 50
    floor_with_padding = lowest - span / 100

    if span_with_padding == 0:
        return 0.0

    return float((mean_over_group - floor_with_padding) / span_with_padding)
|
||||
|
||||
def calculate_trending_score(
    self,
    members: List[Union[str, int]],
    item_id: Union[str, int],
    data: Optional[DataReader] = None,
    peak_norm_min_height: float = 0.1,
    peak_norm_min_prominence: float = 0.05,
    peak_min_distance: int = 3,
    peak_width_rel_height: float = 0.5,
) -> tuple[float, Dict[Union[str, int], float], pd.DataFrame]:
    """
    Calculate a trending score for a group and item, using normalized data for hype period detection.

    The item's monthly rating counts are min-max normalized and passed to
    ``find_peaks``; each peak's width (from ``peak_widths``) defines a "hype
    period" of months. A member's score is the fraction of their rating
    events for the item that fall inside any hype period.

    Args:
        members: List of user IDs in the group
        item_id: ID of the item to calculate trending score for
        data: DataReader object containing dataset and ID mapping methods
        peak_norm_min_height: Minimum height of peaks in normalized data to consider as significant
        peak_norm_min_prominence: Minimum prominence of peaks in normalized data
        peak_min_distance: Minimum distance between peaks, in samples of the
            monthly series (months that have at least one rating)
        peak_width_rel_height: Relative height for peak width calculation

    Returns:
        tuple: (average_trending_score, individual_scores, hype_periods_for_item)
            average_trending_score: Mean of the individual scores over all
                members; members without ratings for the item count as 0.0
            individual_scores: Dictionary mapping user IDs (as given in
                ``members``) to their individual trending scores
            hype_periods_for_item: DataFrame containing detected hype periods
                for the item, labelled with the caller's ``item_id``

        On invalid input (no members, missing/invalid data, missing columns,
        timestamp conversion failure) returns ``(0.0, {}, empty DataFrame)``.
        When the item has no ratings or no hype period is detected, returns
        0.0, a zero score per member, and an empty DataFrame.
    """
    if not members:
        print("Error: No group members provided for trending score calculation.")
        return 0.0, {}, pd.DataFrame()

    _df = pd.DataFrame()
    if data is not None and isinstance(data, DataReader):
        # Work on a copy so the derived columns added below never leak into
        # the caller's dataset.
        _df = data.dataset.copy()
    elif data is not None:
        print(
            f"Warning: data was provided but is not a DataReader object (type: {type(data)})."
        )

    if _df.empty:
        print(
            "Error: The DataFrame (_df) is empty. Cannot calculate score or plot."
        )
        return 0.0, {}, pd.DataFrame()

    required_columns = [
        "userId",
        "itemId",
        "rating",
        "timestamp",
    ]
    missing_columns = [col for col in required_columns if col not in _df.columns]
    if missing_columns:
        print(
            f"Error: Missing required columns in DataFrame: {', '.join(missing_columns)}"
        )
        return 0.0, {}, pd.DataFrame()

    try:
        # Derive the datetime / monthly period columns only when the dataset
        # does not already carry usable ones.
        if "timestamp_dt" not in _df.columns or _df["timestamp_dt"].isnull().all():
            _df["timestamp_dt"] = pd.to_datetime(_df["timestamp"], unit="s")
        if "year_month" not in _df.columns or _df["year_month"].isnull().all():
            _df["year_month"] = _df["timestamp_dt"].dt.to_period("M")
    except Exception as e:
        print(f"Error during timestamp conversion or year-month extraction: {e}")
        return 0.0, {}, pd.DataFrame()

    # Convert item and user IDs to the dataset's internal format
    internal_item_id = data.get_new_item_id(item_id)
    internal_members = [data.get_new_user_id(user_id) for user_id in members]

    # Filter data for the specific item ID only
    item_df = _df[_df["itemId"] == internal_item_id]
    if item_df.empty:
        return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()

    # Number of ratings the item received in each month
    movie_ratings_per_month = (
        item_df.groupby(["itemId", "year_month"], observed=False)
        .size()
        .reset_index(name="rating_count")
    )
    if movie_ratings_per_month.empty:
        return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()

    group_sorted = movie_ratings_per_month.sort_values("year_month").reset_index(
        drop=True
    )
    original_ratings = group_sorted["rating_count"].to_numpy()

    # Min-max normalize the monthly counts; a flat series becomes all zeros
    # (avoids division by zero), which yields no peaks.
    min_rating = np.min(original_ratings)
    max_rating = np.max(original_ratings)
    if max_rating > min_rating:
        normalized_ratings = (original_ratings - min_rating) / (
            max_rating - min_rating
        )
    else:
        normalized_ratings = np.zeros_like(original_ratings, dtype=float)

    # Peak detection on normalized data
    peaks_indices, _ = find_peaks(
        normalized_ratings,
        height=peak_norm_min_height,
        distance=peak_min_distance,
        prominence=peak_norm_min_prominence,
    )

    hype_periods_list = []
    if len(peaks_indices) > 0:
        _, _, left_ips, right_ips = peak_widths(
            normalized_ratings, peaks_indices, rel_height=peak_width_rel_height
        )

        for i, peak_idx in enumerate(peaks_indices):
            # Interpolated width bounds are fractional; round to month
            # indices and clamp to the series.
            start_idx = max(0, int(round(left_ips[i])))
            end_idx = min(len(group_sorted) - 1, int(round(right_ips[i])))

            if start_idx <= end_idx:
                hype_periods_list.append(
                    {
                        "itemId": item_id,
                        "hype_start_month": group_sorted.iloc[start_idx]["year_month"],
                        "hype_end_month": group_sorted.iloc[end_idx]["year_month"],
                        "peak_month": group_sorted.iloc[peak_idx]["year_month"],
                        "peak_rating_count_original": original_ratings[peak_idx],
                        "peak_rating_count_normalized": normalized_ratings[
                            peak_idx
                        ],
                    }
                )

    if not hype_periods_list:
        return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()

    hype_periods_for_item = pd.DataFrame(hype_periods_list)

    # The rating rows carry the *internal* item ID, while the returned hype
    # periods are labelled with the caller's external ID. Join on a copy
    # keyed by the internal ID so the merge matches even when the IDs differ.
    hype_periods_by_internal_id = hype_periods_for_item.assign(
        itemId=internal_item_id
    )

    # Calculate trending scores for each user in the group
    individual_scores = {}
    for member_id, internal_user_id in zip(members, internal_members):
        user_ratings = item_df[item_df["userId"] == internal_user_id]
        if user_ratings.empty:
            individual_scores[member_id] = 0.0
            continue

        # One row per (rating event, hype period) pair
        user_ratings_merged = pd.merge(
            user_ratings, hype_periods_by_internal_id, on="itemId", how="left"
        )
        user_ratings_merged["is_match"] = (
            (
                user_ratings_merged["year_month"]
                >= user_ratings_merged["hype_start_month"]
            )
            & (
                user_ratings_merged["year_month"]
                <= user_ratings_merged["hype_end_month"]
            )
            & user_ratings_merged["hype_start_month"].notna()
        )

        # A rating event is trending if it falls inside any hype period
        is_event_trending = user_ratings_merged.groupby(
            ["userId", "itemId", "timestamp_dt"]
        )["is_match"].any()
        total_unique_rating_events = len(is_event_trending)

        if total_unique_rating_events == 0:
            individual_scores[member_id] = 0.0
        else:
            individual_scores[member_id] = float(
                is_event_trending.sum() / total_unique_rating_events
            )

    # Average over all group members, including those scored 0.0 because
    # they have no ratings for the item.
    all_scores = [individual_scores[user_id] for user_id in members]
    average_trending_score = sum(all_scores) / len(members)

    return average_trending_score, individual_scores, hype_periods_for_item
|
||||
|
||||
def generate_ranked_items(
    self,
    all_rated_items: List[Union[str, int]],
    data: DataReader,
    group_members: List[Union[str, int]],
    component_weights: Optional[Dict[str, float]] = None,
) -> tuple[List[Union[str, int]], Dict]:
    """
    Ranks items based on multiple scoring factors for a group of users.

    Calculates a composite score for each item as the weighted sum of:
    - Item popularity
    - Group preference intensity
    - Group rating
    - Relevance to the group
    - Trends in the group

    Args:
        all_rated_items: List of candidate item IDs to rank
        data: The DataReader object containing user-item interactions
        group_members: List of user identifiers in the group
        component_weights: Optional dictionary with weights for each component
            (popularity, intensity, rating, relevance, trend). Any component
            missing from the dictionary uses the default weight of 1.0.

    Returns:
        tuple: (ranked_item_ids, item_metric_details)
            ranked_item_ids: Item IDs sorted in descending order by composite
                score; ties keep their order from ``all_rated_items``
            item_metric_details: Dictionary mapping each item ID to its
                individual component scores and its composite score

    Raises:
        ValueError: If ``self.group_predictions`` or ``self.top_recommendation``
            is None.
    """
    if self.group_predictions is None:
        raise ValueError(
            "User predictions not set. Call set_group_recommender_values first."
        )
    if self.top_recommendation is None:
        raise ValueError(
            "Top recommendation not set. Call set_group_recommender_values first."
        )

    # Start from the defaults so a partial weight dictionary overrides only
    # the components it names instead of failing on a missing key.
    weights = {
        "popularity": 1.0,
        "intensity": 1.0,
        "rating": 1.0,
        "relevance": 1.0,
        "trend": 1.0,
    }
    if component_weights is not None:
        weights.update(component_weights)

    item_scores = {}
    item_metric_details = {}

    # Both of these are independent of the item being scored
    popularity_scores = self.calculate_item_popularity_score(all_rated_items, data)
    relevance_mask = self.calculate_relevance_mask(self.top_recommendation)

    for item_id in all_rated_items:
        popularity_score = popularity_scores[item_id]
        intensity_score = self.calculate_item_intensity_score(
            item_id, group_members, data
        )
        rating_score = self.calculate_rating_score(item_id, group_members, data)
        relevance_score = self.calculate_relevance_score(
            item_id, data, relevance_mask, group_members
        )
        # Peak-detection thresholds used for ranking differ from the
        # method's defaults.
        trending_score, _, _ = self.calculate_trending_score(
            group_members,
            item_id,
            data,
            peak_norm_min_height=0.3,
            peak_norm_min_prominence=0.2,
            peak_min_distance=9,
            peak_width_rel_height=0.6,
        )

        composite_score = (
            weights["popularity"] * popularity_score
            + weights["intensity"] * intensity_score
            + weights["rating"] * rating_score
            + weights["relevance"] * relevance_score
            + weights["trend"] * trending_score
        )

        item_metric_details[item_id] = {
            "Popularity": popularity_score,
            "Intensity": intensity_score,
            "Rating": rating_score,
            "Relevance": relevance_score,
            "Trend": trending_score,
            "Composite Score": composite_score,
        }
        item_scores[item_id] = composite_score

    # Sort item IDs by composite score, highest first (stable for ties)
    ranked_item_ids = sorted(item_scores, key=item_scores.get, reverse=True)

    return ranked_item_ids, item_metric_details
|
||||
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Some handy functions for PyTorch model training ...
|
||||
"""
|
||||
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
|
||||
|
||||
# Checkpoints
|
||||
def save_checkpoint(model, model_dir):
    """Serialize the model's ``state_dict`` to ``model_dir`` via ``torch.save``."""
    weights = model.state_dict()
    torch.save(weights, model_dir)
|
||||
|
||||
|
||||
def resume_checkpoint(model, model_dir, device_id):
    """
    Load a saved ``state_dict`` from ``model_dir`` into ``model`` in place.

    Args:
        model: Module whose parameters are overwritten with the checkpoint
        model_dir: Path of the checkpoint file written by ``save_checkpoint``
        device_id: Index of the CUDA device to map the tensors onto

    Note:
        When CUDA is not available the checkpoint is mapped onto the CPU
        instead, so checkpoints can be restored on CPU-only machines.
    """
    device = f"cuda:{device_id}" if torch.cuda.is_available() else "cpu"
    # NOTE: torch.load deserializes with pickle; only load checkpoint files
    # from trusted sources.
    state_dict = torch.load(model_dir, map_location=device)
    model.load_state_dict(state_dict)
|
||||
|
||||
|
||||
# Hyper params
|
||||
def use_cuda(enabled, device_id=0):
    """
    Select the CUDA device to use when CUDA execution is enabled.

    Args:
        enabled: When falsy, the function does nothing
        device_id: Index passed to ``torch.cuda.set_device``. Defaults to 0.

    Raises:
        AssertionError: If ``enabled`` is truthy but CUDA is not available.
    """
    if not enabled:
        return

    # Raise explicitly rather than using an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would silently skip this check. The
    # exception type and message are kept for existing callers.
    if not torch.cuda.is_available():
        raise AssertionError("CUDA is not available")

    torch.cuda.set_device(device_id)
|
||||
|
||||
|
||||
def use_optimizer(
    optimizer_name: str,
    network: torch.nn.Module,
    learning_rate: float,
    momentum: float = 0,
    weight_decay: float = 0,
    alpha: float = 0.99,
) -> Optimizer:
    """
    Build a torch optimizer over ``network.parameters()``.

    Args:
        optimizer_name: One of "sgd", "adam" or "rmsprop"
        network: Module whose parameters will be optimized
        learning_rate: Learning rate passed to the optimizer
        momentum: Momentum factor; used by "sgd" and "rmsprop"
        weight_decay: Weight decay (L2 penalty); used by all three optimizers
        alpha: Smoothing constant; used by "rmsprop" only

    Returns:
        The configured optimizer instance.

    Raises:
        ValueError: If ``optimizer_name`` is not supported.
    """
    if optimizer_name == "sgd":
        optimizer = torch.optim.SGD(
            network.parameters(),
            lr=learning_rate,
            momentum=momentum,
            weight_decay=weight_decay,
        )

    elif optimizer_name == "adam":
        optimizer = torch.optim.Adam(
            network.parameters(), lr=learning_rate, weight_decay=weight_decay
        )

    elif optimizer_name == "rmsprop":
        # weight_decay is forwarded here as well, so a caller-supplied value
        # is no longer silently dropped for RMSprop.
        optimizer = torch.optim.RMSprop(
            network.parameters(),
            lr=learning_rate,
            alpha=alpha,
            momentum=momentum,
            weight_decay=weight_decay,
        )
    else:
        raise ValueError(f"Optimizer '{optimizer_name}' is not supported")

    return optimizer
|
||||
Reference in New Issue
Block a user