public code v1

This commit is contained in:
2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
+17
View File
@@ -0,0 +1,17 @@
from .aggregation_strategy import AggregationStrategy
from .association_rules import AssociationRules
from .scale import Scale
from .sliding_window import SlidingWindow
from .emp_loss import EMFLoss
from .explanation_diversity import calculate_gild_for_explanations
from .sliding_window_ranker import SlidingWindowRanker
__all__ = [
"AggregationStrategy",
"AssociationRules",
"Scale",
"EMFLoss",
"calculate_gild_for_explanations",
"SlidingWindowRanker",
"SlidingWindow",
]
+210
View File
@@ -0,0 +1,210 @@
import numpy as np
from typing import Dict, List, Union, Optional, TypeAlias
from enum import Enum
# Type aliases for better readability
UserID: TypeAlias = Union[str, int]
ItemID: TypeAlias = Union[str, int]
EvaluationScore: TypeAlias = float
AggregatedScore: TypeAlias = float
# Main data structure types
UserEvaluations: TypeAlias = Dict[UserID, Dict[ItemID, EvaluationScore]]
UserRankings: TypeAlias = Dict[UserID, List[ItemID]]
AggregatedScores: TypeAlias = Dict[ItemID, AggregatedScore]
class AggregationStrategy(Enum):
"""Enumeration of available aggregation strategies."""
# Individual Predictions
AVG_PREDICTIONS = "avg_predictions"
LEAST_MISERY = "least_misery"
MOST_PLEASURE = "most_pleasure"
MOST_RESPECTED_PERSON = "most_respected_person"
# Individual Preferences
ADDITIVE_UTILITARIAN = "additive_utilitarian"
MULTIPLICATIVE = "multiplicative"
BORDA_COUNT = "borda_count"
class ScoreAggregator:
"""
A class for aggregating individual predictions or preferences into collective scores.
Supports two main approaches:
1. Individual Predictions: AVG, LM, MP, MRP
2. Individual Preferences: AVG, ADD, MUL, BRC
Felfernig, A., Boratto, L., Stettinger, M., Tkali, M.: Group Recommender Systems:
An Introduction. Springer Publishing Company, Incorporated, 1st edn. (2018)
"""
def __init__(self, most_respected_person: Optional[UserID] = None):
"""
Initialize the ScoreAggregator.
Args:
most_respected_person: User ID of the most respected person (required for MRP strategy)
"""
self.most_respected_person = most_respected_person
def aggregate_scores(
self,
evaluations: UserEvaluations,
strategy: AggregationStrategy,
rankings: Optional[UserRankings] = None,
) -> AggregatedScores:
"""
Aggregate individual evaluations into collective scores.
Args:
evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
strategy: Aggregation strategy to use
rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)
Returns:
Dictionary mapping item_id -> aggregated_score
"""
if not evaluations:
return {}
# Get all items across all users
all_items: set[ItemID] = set()
for user_evals in evaluations.values():
all_items.update(user_evals.keys())
result: AggregatedScores = {}
for item in all_items:
if strategy == AggregationStrategy.AVG_PREDICTIONS:
result[item] = self._avg_predictions(evaluations, item)
elif strategy == AggregationStrategy.LEAST_MISERY:
result[item] = self._least_misery(evaluations, item)
elif strategy == AggregationStrategy.MOST_PLEASURE:
result[item] = self._most_pleasure(evaluations, item)
elif strategy == AggregationStrategy.MOST_RESPECTED_PERSON:
result[item] = self._most_respected_person(evaluations, item)
elif strategy == AggregationStrategy.ADDITIVE_UTILITARIAN:
result[item] = self._additive_utilitarian(evaluations, item)
elif strategy == AggregationStrategy.MULTIPLICATIVE:
result[item] = self._multiplicative(evaluations, item)
elif strategy == AggregationStrategy.BORDA_COUNT:
if rankings is None:
raise ValueError("Rankings required for Borda Count strategy")
result[item] = self._borda_count(rankings, item)
else:
raise ValueError(f"Unknown aggregation strategy: {strategy}")
return result
def get_top_recommendation(
self,
evaluations: UserEvaluations,
strategy: AggregationStrategy,
rankings: Optional[UserRankings] = None,
) -> ItemID:
"""
Get the top recommended item based on aggregated scores.
Args:
evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
strategy: Aggregation strategy to use
rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)
Returns:
Item ID with highest aggregated score
"""
aggregated_scores = self.aggregate_scores(evaluations, strategy, rankings)
return max(aggregated_scores.items(), key=lambda x: x[1])[0]
def _avg_predictions(
self, evaluations: UserEvaluations, item: ItemID
) -> AggregatedScore:
"""Average of item-specific evaluations."""
item_evals = [
user_evals.get(item, 0)
for user_evals in evaluations.values()
if item in user_evals
]
return np.mean(item_evals) if item_evals else 0.0 # type: ignore
def _least_misery(
self, evaluations: UserEvaluations, item: ItemID
) -> AggregatedScore:
"""Minimum item-specific evaluation."""
item_evals = [
user_evals.get(item, 0)
for user_evals in evaluations.values()
if item in user_evals
]
return min(item_evals) if item_evals else 0.0
def _most_pleasure(
self, evaluations: UserEvaluations, item: ItemID
) -> AggregatedScore:
"""Maximum item-specific evaluation."""
item_evals = [
user_evals.get(item, 0)
for user_evals in evaluations.values()
if item in user_evals
]
return max(item_evals) if item_evals else 0.0
def _most_respected_person(
self, evaluations: UserEvaluations, item: ItemID
) -> AggregatedScore:
"""Item-evaluations of most respected user."""
if self.most_respected_person is None:
raise ValueError("Most respected person not specified")
if self.most_respected_person not in evaluations:
raise ValueError(
f"Most respected person '{self.most_respected_person}' not found in evaluations"
)
return evaluations[self.most_respected_person].get(item, 0.0)
def _avg_preferences(
self, evaluations: UserEvaluations, item: ItemID
) -> AggregatedScore:
"""Average of item-specific evaluations (same as avg_predictions)."""
return self._avg_predictions(evaluations, item)
def _additive_utilitarian(
self, evaluations: UserEvaluations, item: ItemID
) -> AggregatedScore:
"""Sum of item-specific evaluations."""
item_evals = [
user_evals.get(item, 0)
for user_evals in evaluations.values()
if item in user_evals
]
return sum(item_evals)
def _multiplicative(
self, evaluations: UserEvaluations, item: ItemID
) -> AggregatedScore:
"""Multiplication of item-specific evaluations."""
item_evals = [
user_evals.get(item, 0)
for user_evals in evaluations.values()
if item in user_evals
]
if not item_evals:
return 0.0
result = 1.0
for eval_score in item_evals:
result *= eval_score
return result
def _borda_count(self, rankings: UserRankings, item: ItemID) -> AggregatedScore:
"""Sum of item-specific scores derived from item ranking."""
total_score = 0.0
for user_ranking in rankings.values():
if item in user_ranking:
# Score is based on position in ranking (higher position = higher score)
position = user_ranking.index(item)
score = len(user_ranking) - position - 1 # Reverse position for score
total_score += score
return total_score
+255
View File
@@ -0,0 +1,255 @@
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, association_rules
import pandas as pd
from pygrex.data_reader.data_reader import DataReader
from typing import List, Optional, Union
class AssociationRules:
"""
A class to represent association rules mining for recommendation systems.
This class implements association rules mining using the FP-Growth algorithm
to discover frequent itemsets and generate association rules from user-item
interaction data. It can be used to find patterns in user behavior and
generate item recommendations based on item associations.
"""
def __init__(
self,
data: DataReader,
min_support: float = 0.2,
min_confidence: float = 0.2,
rating_threshold: float = 4.0,
) -> None:
"""Initialize the association rules miner with data and parameters.
Args:
data: The DataReader object containing user-item interactions with ratings.
min_support: Minimum support threshold for frequent itemsets.
Must be between 0 and 1. Default is 0.2.
min_confidence: Minimum confidence threshold for association rules.
Must be between 0 and 1. Default is 0.2.
rating_threshold: Minimum rating threshold to consider an interaction
as positive. Default is 4.0.
Raises:
ValueError: If support, confidence, or rating_threshold values are invalid.
"""
self._validate_parameters(min_support, min_confidence, rating_threshold)
self.data = data
self.min_support = min_support
self.min_confidence = min_confidence
self.rating_threshold = rating_threshold
self._frequent_itemsets: Optional[pd.DataFrame] = None
self._association_rules: Optional[pd.DataFrame] = None
def _validate_parameters(
self, min_support: float, min_confidence: float, rating_threshold: float
) -> None:
"""Validate initialization parameters.
Args:
min_support: Minimum support threshold to validate.
min_confidence: Minimum confidence threshold to validate.
rating_threshold: Rating threshold to validate.
Raises:
ValueError: If any parameter is invalid.
"""
if not (0 < min_support <= 1):
raise ValueError("min_support must be between 0 and 1")
if not (0 < min_confidence <= 1):
raise ValueError("min_confidence must be between 0 and 1")
if rating_threshold < 0:
raise ValueError("rating_threshold must be non-negative")
def get_df_filtered_by_rating_threshold(self) -> pd.DataFrame:
df = self.data.dataset.copy()
# Filter interactions based on rating threshold
df_filtered = df[df["rating"] >= self.rating_threshold]
if df_filtered.empty:
raise ValueError(
f"No interactions found with rating >= {self.rating_threshold}"
)
return df_filtered
def _prepare_transactions(self) -> List[List[str]]:
"""Prepare transaction data from the dataset.
Filters the dataset based on rating threshold and groups items
by user to create transaction lists.
Returns:
A list of transactions, where each transaction is a list of item IDs
that a user has positively interacted with.
"""
df_filtered = self.get_df_filtered_by_rating_threshold()
# Group items by user to create transactions
transactions = df_filtered.groupby("userId")["itemId"].apply(list).tolist()
# Convert item IDs to strings for consistency
transactions = [
[str(item) for item in transaction] for transaction in transactions
]
return transactions
def _mine_frequent_itemsets(
self, transactions: List[List[Union[str, int]]]
) -> pd.DataFrame:
"""Mine frequent itemsets using FP-Growth algorithm.
Args:
transactions: List of transactions to mine frequent itemsets from.
Returns:
DataFrame containing frequent itemsets with their support values.
Raises:
ValueError: If no frequent itemsets are found.
"""
# Encode transactions into binary matrix
transaction_encoder = TransactionEncoder()
transaction_matrix = transaction_encoder.fit_transform(transactions)
df_encoded = pd.DataFrame(
transaction_matrix, # type: ignore
columns=transaction_encoder.columns_,
)
# Apply FP-Growth to find frequent itemsets
frequent_itemsets = fpgrowth(
df_encoded, min_support=self.min_support, use_colnames=True
)
if frequent_itemsets.empty:
raise ValueError(
f"No frequent itemsets found with min_support={self.min_support}"
)
return frequent_itemsets
def _generate_association_rules(
self, frequent_itemsets: pd.DataFrame
) -> pd.DataFrame:
"""Generate association rules from frequent itemsets.
Args:
frequent_itemsets: DataFrame containing frequent itemsets.
Returns:
DataFrame containing association rules with their metrics.
Raises:
ValueError: If no association rules are found.
"""
rules = association_rules(
frequent_itemsets, metric="confidence", min_threshold=self.min_confidence
)
if rules.empty:
raise ValueError(
f"No association rules found with min_confidence={self.min_confidence}"
)
return rules
def compute(self) -> pd.DataFrame:
"""Compute association rules from the dataset.
This method performs the complete association rules mining process:
1. Prepares transactions from the dataset
2. Mines frequent itemsets using FP-Growth
3. Generates association rules from frequent itemsets
Returns:
DataFrame containing association rules with metrics including
antecedents, consequents, support, confidence, lift, etc.
Raises:
ValueError: If the dataset is empty, no transactions meet the
criteria, or no rules can be generated with the given parameters.
"""
if self.data.dataset.empty:
raise ValueError("Dataset is empty")
# Prepare transactions
transactions = self._prepare_transactions()
if not transactions:
raise ValueError("No transactions found after filtering")
# Mine frequent itemsets
self._frequent_itemsets = self._mine_frequent_itemsets(transactions) # type: ignore
# Generate association rules
self._association_rules = self._generate_association_rules(
self._frequent_itemsets
)
return self._association_rules
def get_frequent_itemsets(self) -> Optional[pd.DataFrame]:
"""Get the computed frequent itemsets.
Returns:
DataFrame containing frequent itemsets if compute() has been called,
None otherwise.
"""
return self._frequent_itemsets
def get_recommendations_for_items(
self, items: List[Union[str, int]], top_k: int = 10
) -> pd.DataFrame:
"""Get item recommendations based on association rules.
Args:
items: List of item IDs to get recommendations for.
top_k: Maximum number of recommendations to return. Default is 10.
Returns:
DataFrame containing recommended items sorted by confidence.
Raises:
RuntimeError: If compute() hasn't been called yet.
ValueError: If items list is empty.
"""
if self._association_rules is None:
raise RuntimeError("Must call compute() before getting recommendations")
if not items:
raise ValueError("Items list cannot be empty")
items_set = set(str(item) for item in items)
# Filter rules where antecedents match the given items
matching_rules = self._association_rules[
self._association_rules["antecedents"].apply(
lambda x: items_set.issubset(set(str(item) for item in x))
)
]
if matching_rules.empty:
return pd.DataFrame()
# Sort by confidence and return top_k recommendations
recommendations = matching_rules.nlargest(top_k, "confidence")
return recommendations[
["antecedents", "consequents", "confidence", "lift", "support"]
]
def __str__(self) -> str:
"""Return string representation of the AssociationRules object."""
return (
f"AssociationRules(min_support={self.min_support}, "
f"min_confidence={self.min_confidence}, "
f"rating_threshold={self.rating_threshold})"
)
def __repr__(self) -> str:
"""Return detailed string representation of the AssociationRules object."""
return self.__str__()
+17
View File
@@ -0,0 +1,17 @@
import torch
class EMFLoss(torch.nn.Module):
def __init__(self):
super(EMFLoss, self).__init__()
def forward(self, ratings_pred, ratings, u, v, reg_term, expl, expl_reg_term):
mse = (ratings - ratings_pred.view(-1)) ** 2
u_l2 = reg_term * torch.norm(u, 2, -1)
v_l2 = reg_term * torch.norm(v, 2, -1)
expl_constraint = expl_reg_term * torch.norm(u - v, 1, -1) * expl
loss = mse + u_l2 + v_l2 + expl_constraint
return loss.mean()
+80
View File
@@ -0,0 +1,80 @@
from itertools import combinations
import numpy as np
def _get_explanation_feature_set(explanation, explainer_type, details=None):
"""Helper to extract a consistent feature set from different explanation types."""
if explainer_type == "Sliding Window":
return set(explanation.get("items", []))
elif explainer_type == "EXPGRS":
if details is not None:
return set(details.get("antecedent", frozenset()))
else:
return set()
elif explainer_type == "LORE4Groups":
rules_data = explanation.get("group_factual_rule", {})
if isinstance(rules_data, dict):
return set(
rule for tier_rules in rules_data.values() for rule in tier_rules
)
elif isinstance(rules_data, list):
return set(rules_data)
return set()
def calculate_gild_for_explanations(explanations_dict, explainer_type, use_median=True):
"""Calculate Gaussian Inter-List Diversity (GILD) for a set of explanations."""
if not explanations_dict or len(explanations_dict) < 2:
return 0.0
feature_sets = []
if explainer_type == "EXPGRS":
for item_id, rules_list in explanations_dict.items():
if rules_list:
feature_sets.append(
_get_explanation_feature_set(
None, explainer_type, details=rules_list[0]
)
)
elif explainer_type == "Sliding Window":
for call, exp_data in explanations_dict.items():
feature_sets.append(_get_explanation_feature_set(exp_data, explainer_type))
elif explainer_type == "LORE4Groups":
for item_id, exp_data in explanations_dict.items():
feature_sets.append(_get_explanation_feature_set(exp_data, explainer_type))
feature_sets = [fs for fs in feature_sets if fs]
if len(feature_sets) < 2:
return 0.0
# Calculate pairwise Jaccard distances
distances = []
for set1, set2 in combinations(feature_sets, 2):
intersection_len = len(set1.intersection(set2))
union_len = len(set1.union(set2))
jaccard_dist = 1.0 - (intersection_len / union_len) if union_len > 0 else 1.0
distances.append(jaccard_dist)
if not distances:
return 0.0
# Calculate sigma using paper's formula
k_choose_2 = len(distances)
if use_median:
reference_dist = np.median(distances)
else:
reference_dist = min(distances)
denominator = np.sqrt(2 * np.log(k_choose_2 - 1)) if k_choose_2 > 1 else 1.0
sigma = reference_dist / denominator if denominator > 0 else reference_dist
if sigma == 0:
sigma = 1e-9
kernel_distances_sum = 0.0
for d in distances:
kernel_distance = np.sqrt(2 - 2 * np.exp(-(d**2) / (2 * sigma**2)))
kernel_distances_sum += kernel_distance
gild = kernel_distances_sum / k_choose_2 if distances else 0
return gild
+138
View File
@@ -0,0 +1,138 @@
from typing import List, Union, Optional
import numpy as np
from scipy import stats
class Scale:
"""
A class for scaling numerical values using different methods.
Methods:
quantile: Scale values using quantile-based ranking.
linear: Scale values linearly to a target range with outlier handling.
"""
@staticmethod
def quantile(
raw_predictions: Union[List[float], np.ndarray],
target_min: float = 1,
target_max: float = 5,
) -> np.ndarray:
"""
Scale raw predictions to the target range using quantile-based ranking.
Args:
raw_predictions: The raw prediction values.
target_min: Minimum of the target range (default: 1).
target_max: Maximum of the target range (default: 5).
Returns:
numpy.ndarray: Scaled predictions.
Raises:
ValueError: If raw_predictions is empty.
"""
if len(raw_predictions) == 0:
raise ValueError("Raw predictions array is empty.")
# Convert to numpy array if it's not already
raw_predictions = np.array(raw_predictions)
ranks = stats.rankdata(raw_predictions, method="average")
if len(raw_predictions) == 1:
# Handle single element case
scaled_predictions = np.array([(target_min + target_max) / 2])
else:
scaled_predictions = target_min + (ranks - 1) * (
target_max - target_min
) / (len(raw_predictions) - 1)
# Ensure scaled predictions are within [target_min, target_max]
scaled_predictions = np.clip(scaled_predictions, target_min, target_max)
return scaled_predictions
@staticmethod
def linear(
raw_predictions: Union[List[float], np.ndarray],
target_min: float = 1,
target_max: float = 5,
ref_min: Optional[float] = None,
ref_max: Optional[float] = None,
handle_outliers: bool = True,
) -> np.ndarray:
"""
Scale raw predictions to the target range [target_min, target_max].
Args:
raw_predictions: The raw prediction values.
target_min: Minimum of the target range (default: 1).
target_max: Maximum of the target range (default: 5).
ref_min: Reference minimum for raw predictions. If None, will be calculated
from the data or from outlier bounds if handle_outliers=True.
ref_max: Reference maximum for raw predictions. If None, will be calculated
from the data or from outlier bounds if handle_outliers=True.
handle_outliers: Whether to handle outliers using IQR method (default: True).
Returns:
numpy.ndarray: Scaled predictions.
Raises:
ValueError: If raw_predictions is empty.
"""
if len(raw_predictions) == 0:
raise ValueError("Raw predictions array is empty.")
# Convert to numpy array if it's not already
raw_predictions = np.array(raw_predictions)
# Handle single element case
if len(raw_predictions) == 1:
if ref_min is not None and ref_max is not None:
# Scale based on provided reference range
value = raw_predictions[0]
scaled_value = (
target_min
+ (value - ref_min)
* (target_max - target_min)
/ (ref_max - ref_min)
if ref_max != ref_min
else (target_min + target_max) / 2
)
scaled_value = np.clip(scaled_value, target_min, target_max)
return np.array([scaled_value])
else:
# Can't determine range from single value, return middle of target range
return np.array([(target_min + target_max) / 2])
clipped_predictions = raw_predictions.copy()
# Handle outliers if requested
if handle_outliers:
q1, q3 = np.percentile(raw_predictions, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
clipped_predictions = np.clip(raw_predictions, lower_bound, upper_bound)
# Determine min and max values
min_raw = np.min(clipped_predictions)
max_raw = np.max(clipped_predictions)
# Use provided reference bounds if given, otherwise use data bounds
actual_ref_min = ref_min if ref_min is not None else min_raw
actual_ref_max = ref_max if ref_max is not None else max_raw
# Scale to [target_min, target_max]
if actual_ref_max == actual_ref_min:
# Reference bounds are equal, return the middle of the target range
return np.full_like(raw_predictions, (target_min + target_max) / 2)
else:
scaled_predictions = target_min + (raw_predictions - actual_ref_min) * (
target_max - target_min
) / (actual_ref_max - actual_ref_min)
# Ensure scaled predictions are within [target_min, target_max]
scaled_predictions = np.clip(scaled_predictions, target_min, target_max)
return scaled_predictions
+90
View File
@@ -0,0 +1,90 @@
from typing import List, Optional, TypeVar, Generic, Iterator
T = TypeVar("T")
class SlidingWindow(Generic[T]):
"""Class for creating and managing sliding windows over a sequence.
This class provides functionality to iterate through windows of a fixed size
over a sequence of items.
"""
def __init__(self, sequence: List[T], window_size: int):
"""Initialize the sliding window.
Args:
sequence: The sequence of items to slide over
window_size: The size of each window (must be positive)
Raises:
ValueError: If window_size is less than 1
TypeError: If sequence is not iterable
"""
if window_size < 1:
raise ValueError("Window size must be at least 1")
if not hasattr(sequence, "__iter__"):
raise TypeError("Sequence must be iterable")
self.sequence = sequence
self.window_size = window_size
self.index = 0
self.max_index = len(sequence) - window_size + 1 if sequence else 0
def get_next_window(self) -> Optional[List[T]]:
"""Return the next window and advance the current position.
Returns:
A list containing the next window of items, or None if all windows
have been processed.
"""
if self.index >= self.max_index:
return None
window = self.sequence[self.index : self.index + self.window_size]
self.index += 1
return window
def reset(self) -> None:
"""Reset the window position to the beginning of the sequence."""
self.index = 0
def has_next(self) -> bool:
"""Check if there are more windows available.
Returns:
True if there are more windows, False otherwise.
"""
return self.index < self.max_index
def __iter__(self) -> Iterator[List[T]]:
"""Make the class iterable.
Returns:
An iterator over all windows in the sequence.
"""
self.reset()
return self
def __next__(self) -> List[T]:
"""Get the next window for iteration.
Returns:
The next window as a list.
Raises:
StopIteration: When all windows have been processed.
"""
window = self.get_next_window()
if window is None:
raise StopIteration
return window
def __len__(self) -> int:
"""Return the total number of windows.
Returns:
The number of complete windows in the sequence.
"""
return max(0, self.max_index)
+631
View File
@@ -0,0 +1,631 @@
import operator
from typing import Any, Dict, List, Union, Optional
import numpy as np
import pandas as pd
from scipy.signal import (
find_peaks,
peak_widths,
)
from pygrex.data_reader import DataReader
class SlidingWindowRanker:
"""
Stratigi, M., Bikakis, N., Stefanidis, K.: Counterfactual explanations for group
recommendations. In: Proceedings of the 27th International Workshop on Design,
Optimization, Languages and Analytical Processing of Big Data (DOLAP 2025)
"""
def __init__(self, config: Dict[str, Any]):
"""
Initialize the SlidingWindowRanker.
Args:
config: Configuration parameters for the evaluator
"""
self.config = config
self.group_predictions: Optional[
Dict[Union[str, int], Dict[Union[str, int], float]]
] = None
self.top_recommendation: Optional[Union[str, int]] = None
def set_group_recommender_values(
self,
group_predictions: Dict[Union[str, int], Dict[Union[str, int], float]],
top_recommendation: Union[str, int],
) -> None:
"""
Set group recommender values.
Args:
group_predictions: Dictionary mapping user IDs to their item predictions
top_recommendation: List of top recommended items for the group
"""
self.group_predictions = group_predictions
self.top_recommendation = top_recommendation
def evaluate(self, data: DataReader) -> Dict[str, Any]:
"""
Evaluate the data using the Stratigis evaluator.
Args:
data: DataReader object containing dataset and transformation methods
Returns:
Dictionary with evaluation metrics
"""
# Implementation would go here
return {}
def calculate_item_popularity_score(
self, items: List[Union[str, int]], data: DataReader
) -> Dict[Union[str, int], float]:
"""
Calculate the normalized popularity of each item based on the number of interactions received.
Args:
items: List of item IDs
data: Data object containing the dataset and transformation methods
Returns:
Dictionary with item IDs as keys and normalized popularity (0-1) as values
"""
# Calculate popularity (number of interactions) for each item
popularity_counts = {}
for item_id in items:
internal_item_id = data.get_new_item_id(item_id)
count = len(data.dataset[data.dataset["itemId"] == internal_item_id])
popularity_counts[item_id] = count
# Find min and max values for normalization
min_count = min(popularity_counts.values()) if popularity_counts else 0
max_count = max(popularity_counts.values()) if popularity_counts else 0
# Add 1% padding to the range
range_value = max_count - min_count
padded_range = range_value + (
range_value / 50
) # Add 2% to range (1% on each end)
padded_min = min_count - (
range_value / 100
) # Subtract 1% of range from minimum
if padded_range == 0:
padded_range = 1 # Avoid division by zero
# Normalize popularity values to [0,1]
popularity_mask = {}
for item_id, count in popularity_counts.items():
popularity_mask[item_id] = (count - padded_min) / padded_range
return popularity_mask
def calculate_relevance_mask(
self,
target_item_id: Union[str, int],
) -> Dict[Union[str, int], float]:
"""
Create a mapping between users and their prediction scores for a specific target item.
Args:
target_item_id :The ID of the item for which prediction scores are needed
Returns:
Dictionary mapping user IDs to their predicted scores for the target item
Note: Users without a prediction for the target item will have a value of 0
Examples
>>> user_preds = {'user1': {'item1': 4.5, 'item2': 3.2}, 'user2': {'item2': 2.8}}
>>> evaluator.set_group_recommender_values(user_preds,top_recommendation)
>>> evaluator.calculate_relevance_mask('item1')
{'user1': 4.5, 'user2': 0}
"""
if self.group_predictions is None:
raise ValueError(
"User predictions not set. Call set_group_recommender_values first."
)
individual_predictions = {}
for user_id, predictions in self.group_predictions.items():
# Get the prediction for the target item if it exists, otherwise default to 0
individual_predictions[user_id] = predictions.get(target_item_id, 0)
return individual_predictions
def calculate_relevance_score(
self,
item_id: Union[str, int],
data: DataReader,
prediction_scores: Dict[Union[str, int], float],
members: List[Union[str, int]],
rating_scale: tuple = (0, 5), # Default rating scale
) -> float:
"""
Calculate the normalized average prediction score for an item based on group members' predictions.
Agrs
item_id: ID of the item to calculate relevance for
data : DataReader object containing dataset and ID mapping methods
prediction_scores : Dictionary mapping user IDs to their prediction scores for items
members : List of user IDs in the group
rating_scale: Tuple indicating (min_rating, max_rating) for normalization
Returns
Normalized average prediction score in range [0,1]
Returns 0 if no users in the group have interacted with the item
Notes
1. Calculates the average prediction score for the item from group members
2. Normalizes the score to [0,1] range with 1% padding
"""
total_score = 0
valid_users_count = 0
internal_item_id = data.get_new_item_id(item_id)
for user_id in members:
# Convert user ID to internal format
internal_user_id = (
data.get_new_user_id(int(user_id))
if isinstance(user_id, (int, np.integer))
else user_id
)
# Check if user has interacted with the item
user_item_data = data.dataset[
(data.dataset["userId"] == internal_user_id)
& (data.dataset["itemId"] == internal_item_id)
]
if user_item_data.empty:
continue
# Get the prediction score for this user
if user_id in prediction_scores:
total_score += prediction_scores[user_id]
valid_users_count += 1
# Return 0 if no valid users found
if valid_users_count == 0:
return 0
# Calculate average score
average_score = total_score / valid_users_count
# Normalize to [0,1] with 1% padding
min_value, max_value = rating_scale
range_value = max_value - min_value
padded_range = range_value + (
range_value / 50
) # Add 2% to range (1% on each end)
padded_min = min_value - (
range_value / 100
) # Subtract 1% of range from minimum
if padded_range == 0:
return 0.0
normalized_score = (average_score - padded_min) / padded_range
return float(normalized_score)
def calculate_item_intensity_score(
self, item_id: Union[str, int], members: List[Union[str, int]], data: DataReader
) -> float:
"""
Calculate what proportion of group members have interacted with the specified item.
Args
item_id : ID of the item to calculate interaction rate for
members : List of user IDs in the group
data : DataReader object containing dataset and ID mapping methods
Returns
Proportion of group members who have interacted with the item (range [0,1])
0 means no group members have interacted with the item
1 means all group members have interacted with the item
"""
# Convert item ID to internal format
if data is None:
print("Error: DataReader object is None. Cannot convert item_id.")
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
internal_item_id = data.get_new_item_id(item_id)
# Convert all user IDs to internal format
internal_members = [data.get_new_user_id(user_id) for user_id in members]
# Count how many users have interacted with the item
interaction_count = len(
data.dataset[
(data.dataset.itemId == internal_item_id)
& data.dataset.userId.isin(internal_members)
]
)
# Calculate proportion of group members who interacted with item
if not members:
return 0 # Avoid division by zero if no members
interaction_rate = interaction_count / len(members)
return interaction_rate
def calculate_rating_score(
self,
item_id: Union[str, int],
members: List[Union[str, int]],
data: DataReader,
rating_scale: tuple = (0, 5),
) -> float:
"""
Calculate the normalized average rating given to an item by group members.
Args
item_id : ID of the item to calculate average rating for
data : DataReader object containing dataset and ID mapping methods
members : List of user IDs in the group
rating_scale: Tuple indicating (min_rating, max_rating) for normalization
Returns
Normalized average rating in range [0,1]
Notes
- Considers all group members in the denominator even if some haven't rated the item
- Normalizes the resulting average to [0,1] with 1% padding
"""
# Convert item ID to internal format
if data is None:
print("Error: DataReader object is None. Cannot convert item_id.")
return 0.0
internal_item_id = data.get_new_item_id(item_id)
# Convert all user IDs to internal format
internal_members = [data.get_new_user_id(user_id) for user_id in members]
# Get ratings from users who have rated this item
rating_data = data.dataset[
(data.dataset.itemId == internal_item_id)
& data.dataset.userId.isin(internal_members)
]
# Calculate average rating (sum of ratings divided by total group size)
if len(members) == 0:
return 0 # Avoid division by zero if no members
total_rating = rating_data["rating"].sum()
average_rating = total_rating / len(members)
# Normalize to [0,1] with 1% padding
min_value, max_value = rating_scale
range_value = max_value - min_value
padded_range = range_value + (
range_value / 50
) # Add 2% to range (1% on each end)
padded_min = min_value - (
range_value / 100
) # Subtract 1% of range from minimum
if padded_range == 0:
return 0.0
normalized_rating = (average_rating - padded_min) / padded_range
return float(normalized_rating)
def calculate_trending_score(
self,
members: List[Union[str, int]],
item_id: Union[str, int],
data: Optional[DataReader] = None,
peak_norm_min_height: float = 0.1,
peak_norm_min_prominence: float = 0.05,
peak_min_distance: int = 3,
peak_width_rel_height: float = 0.5,
) -> tuple[float, Dict[Union[str, int], float], pd.DataFrame]:
"""
Calculates a trending score for a user, using normalized data for hype period detection.
Args
members : List of user IDs in the group
item_id : ID of the item to calculate trending score for
data : DataReader object containing dataset and ID mapping methods
peak_norm_min_height : Minimum height of peaks in normalized data to consider as significant
peak_norm_min_prominence : Minimum prominence of peaks in normalized data
peak_min_distance : Minimum distance between peaks in months
peak_width_rel_height : Relative height for peak width calculation
Returns
tuple: (average_trending_score, individual_scores, hype_periods_for_item)
average_trending_score: Average trending score across all group members (0-1)
individual_scores: Dictionary mapping user IDs to their individual trending scores
hype_periods_for_item: DataFrame containing detected hype periods for the item
"""
if not members:
print("Error: No group members provided for trending score calculation.")
return 0.0, {}, pd.DataFrame()
_df = pd.DataFrame()
if data is not None and isinstance(data, DataReader):
_df = data.dataset.copy()
else:
if data is not None:
print(
f"Warning: data was provided but is not a DataReader object (type: {type(data)})."
)
if _df.empty:
print(
"Error: The DataFrame (_df) is empty. Cannot calculate score or plot."
)
return 0.0, {}, pd.DataFrame()
required_columns = [
"userId",
"itemId",
"rating",
"timestamp",
]
missing_columns = [col for col in required_columns if col not in _df.columns]
if missing_columns:
print(
f"Error: Missing required columns in DataFrame: {', '.join(missing_columns)}"
)
return 0.0, {}, pd.DataFrame()
try:
if "timestamp_dt" not in _df.columns or _df["timestamp_dt"].isnull().all():
_df["timestamp_dt"] = pd.to_datetime(_df["timestamp"], unit="s")
if "year_month" not in _df.columns or _df["year_month"].isnull().all():
_df["year_month"] = _df["timestamp_dt"].dt.to_period("M")
except Exception as e:
print(f"Error during timestamp conversion or year-month extraction: {e}")
return 0.0, {}, pd.DataFrame()
if data is None: # Should not happen if _df is not empty, but as a safeguard
return 0.0, {}, pd.DataFrame()
# Convert item ID to internal format
internal_item_id = data.get_new_item_id(item_id)
# Convert all user IDs to internal format
internal_members = [data.get_new_user_id(user_id) for user_id in members]
# Filter data for the specific item ID only
item_df = _df[_df["itemId"] == internal_item_id]
if item_df.empty:
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
# movie_ratings_per_month contains original rating counts
movie_ratings_per_month = (
item_df.groupby(["itemId", "year_month"], observed=False)
.size()
.reset_index(name="rating_count")
)
if movie_ratings_per_month.empty:
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
hype_periods_for_item = None
# Process the specific item for hype period detection
group_sorted = movie_ratings_per_month.sort_values("year_month").reset_index(
drop=True
)
original_ratings = group_sorted["rating_count"].to_numpy()
# Normalization Step
min_rating = np.min(original_ratings)
max_rating = np.max(original_ratings)
normalized_ratings = None
if (
max_rating > min_rating
): # Avoid division by zero if all ratings are the same
normalized_ratings = (original_ratings - min_rating) / (
max_rating - min_rating
)
elif len(original_ratings) > 0:
normalized_ratings = np.zeros_like(original_ratings, dtype=float)
else: # No ratings for this item in group_sorted (should not happen if groupby is correct)
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
# Peak Detection on Normalized Data
peaks_indices, properties = find_peaks(
normalized_ratings,
height=peak_norm_min_height,
distance=peak_min_distance,
prominence=peak_norm_min_prominence,
)
hype_periods_list = []
if len(peaks_indices) > 0:
widths, _, left_ips, right_ips = peak_widths(
normalized_ratings, peaks_indices, rel_height=peak_width_rel_height
)
for i, peak_idx in enumerate(peaks_indices):
start_idx = max(0, int(round(left_ips[i])))
end_idx = min(len(group_sorted) - 1, int(round(right_ips[i])))
if start_idx <= end_idx:
start_month = group_sorted.iloc[start_idx]["year_month"]
end_month = group_sorted.iloc[end_idx]["year_month"]
hype_periods_list.append(
{
"itemId": item_id,
"hype_start_month": start_month,
"hype_end_month": end_month,
"peak_month": group_sorted.iloc[peak_idx]["year_month"],
"peak_rating_count_original": original_ratings[peak_idx],
"peak_rating_count_normalized": normalized_ratings[
peak_idx
],
}
)
if hype_periods_list:
hype_periods_for_item = pd.DataFrame(hype_periods_list)
else:
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
# Calculate trending scores for each user in the group
individual_scores = {}
valid_scores = []
for idx, user_id in enumerate(internal_members):
user_ratings = item_df[item_df["userId"] == user_id].copy()
if user_ratings.empty:
individual_scores[members[idx]] = 0.0
continue
# Merge user ratings with hype periods
user_ratings_merged = pd.merge(
user_ratings, hype_periods_for_item, on="itemId", how="left"
)
user_ratings_merged["is_match"] = (
(
user_ratings_merged["year_month"]
>= user_ratings_merged["hype_start_month"]
)
& (
user_ratings_merged["year_month"]
<= user_ratings_merged["hype_end_month"]
)
& user_ratings_merged["hype_start_month"].notna()
)
if (
not user_ratings_merged.empty
and "is_match" in user_ratings_merged.columns
):
is_event_trending = user_ratings_merged.groupby(
["userId", "itemId", "timestamp_dt"]
)["is_match"].any()
num_trending_ratings = is_event_trending.sum()
total_unique_rating_events = len(is_event_trending)
else:
num_trending_ratings = 0
total_unique_rating_events = len(
user_ratings.drop_duplicates(
subset=["userId", "itemId", "timestamp_dt"]
)
)
if total_unique_rating_events == 0:
individual_scores[members[idx]] = 0.0
else:
trending_score = num_trending_ratings / total_unique_rating_events
individual_scores[members[idx]] = trending_score
valid_scores.append(trending_score)
# Calculate average trending score across all group members
# Include users with 0.0 scores (no ratings for the item) in the average
all_scores = [individual_scores[user_id] for user_id in members]
average_trending_score = sum(all_scores) / len(members) if members else 0.0
return average_trending_score, individual_scores, hype_periods_for_item
def generate_ranked_items(
self,
all_rated_items: List[Union[str, int]],
data: DataReader,
group_members: List[Union[str, int]],
component_weights: Optional[Dict[str, float]] = None,
) -> tuple[List[Union[str, int]], Dict]:
"""
Ranks items based on multiple scoring factors for a group of users.
Calculates a composite score for each item based on:
- Item popularity
- Group preference intensity
- Predicted ratings
- Relevance to the group
- Trends in the group
Args:
candidate_items: List of items that at least one group member has interacted with
data: The DataReader object containing user-item interactions
group_members: List of user identifiers in the group
component_weights: Optional dictionary with weights for each component
(popularity, intensity, rating, relevance, trend)
Returns:
List of item IDs sorted in descending order by their composite scores
"""
if self.group_predictions is None:
raise ValueError(
"User predictions not set. Call set_group_recommender_values first."
)
if self.top_recommendation is None:
raise ValueError(
"Top recommendation not set. Call set_group_recommender_values first."
)
# Default weights if not provided
if component_weights is None:
component_weights = {
"popularity": 1.0,
"intensity": 1.0,
"rating": 1.0,
"relevance": 1.0,
"trend": 1.0,
}
item_scores = {}
item_metric_details = {}
popularity_scores = self.calculate_item_popularity_score(all_rated_items, data)
relevance_mask = self.calculate_relevance_mask(self.top_recommendation)
for item_id in all_rated_items:
# Calculate individual score components
popularity_score = popularity_scores[item_id]
intensity_score = self.calculate_item_intensity_score(
item_id, group_members, data
)
rating_score = self.calculate_rating_score(item_id, group_members, data)
relevance_score = self.calculate_relevance_score(
item_id, data, relevance_mask, group_members
)
trending_score, _, _ = self.calculate_trending_score(
group_members,
item_id,
data,
0.3,
0.2,
9,
0.6,
)
composite_score = (
component_weights["popularity"] * popularity_score
+ component_weights["intensity"] * intensity_score
+ component_weights["rating"] * rating_score
+ component_weights["relevance"] * relevance_score
+ component_weights["trend"] * trending_score
)
item_metric_details[item_id] = {
"Popularity": popularity_score,
"Intensity": intensity_score,
"Rating": rating_score,
"Relevance": relevance_score,
"Trend": trending_score,
"Composite Score": composite_score,
}
item_scores[item_id] = composite_score
# Sort items by score in descending order
ranked_items = sorted(
item_scores.items(), key=operator.itemgetter(1), reverse=True
)
# Return the sorted item IDs and the detailed metrics
return [item_id for item_id, _ in ranked_items], item_metric_details
+55
View File
@@ -0,0 +1,55 @@
"""
Some handy functions for pytroch model training ...
"""
import torch
from torch.optim import Optimizer
# Checkpoints
def save_checkpoint(model, model_dir):
torch.save(model.state_dict(), model_dir)
def resume_checkpoint(model, model_dir, device_id):
device = f"cuda:{device_id}"
state_dict = torch.load(model_dir, map_location=device)
model.load_state_dict(state_dict)
# Hyper params
def use_cuda(enabled, device_id=0):
if enabled:
assert torch.cuda.is_available(), "CUDA is not available"
torch.cuda.set_device(device_id)
def use_optimizer(
optimizer_name: str,
network: torch.nn.Module,
learning_rate: float,
momentum: float = 0,
weight_decay: float = 0,
alpha: float = 0.99,
) -> Optimizer:
if optimizer_name == "sgd":
optimizer = torch.optim.SGD(
network.parameters(),
lr=learning_rate,
momentum=momentum,
weight_decay=weight_decay,
)
elif optimizer_name == "adam":
optimizer = torch.optim.Adam(
network.parameters(), lr=learning_rate, weight_decay=weight_decay
)
elif optimizer_name == "rmsprop":
optimizer = torch.optim.RMSprop(
network.parameters(), lr=learning_rate, alpha=alpha, momentum=momentum
)
else:
raise ValueError(f"Optimizer '{optimizer_name}' is not supported")
return optimizer