public code v1
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
from .aggregation_strategy import AggregationStrategy
|
||||
from .association_rules import AssociationRules
|
||||
from .scale import Scale
|
||||
from .sliding_window import SlidingWindow
|
||||
from .emp_loss import EMFLoss
|
||||
from .explanation_diversity import calculate_gild_for_explanations
|
||||
from .sliding_window_ranker import SlidingWindowRanker
|
||||
|
||||
__all__ = [
|
||||
"AggregationStrategy",
|
||||
"AssociationRules",
|
||||
"Scale",
|
||||
"EMFLoss",
|
||||
"calculate_gild_for_explanations",
|
||||
"SlidingWindowRanker",
|
||||
"SlidingWindow",
|
||||
]
|
||||
@@ -0,0 +1,210 @@
|
||||
import numpy as np
|
||||
from typing import Dict, List, Union, Optional, TypeAlias
|
||||
from enum import Enum
|
||||
|
||||
# Type aliases for better readability
|
||||
UserID: TypeAlias = Union[str, int]
|
||||
ItemID: TypeAlias = Union[str, int]
|
||||
EvaluationScore: TypeAlias = float
|
||||
AggregatedScore: TypeAlias = float
|
||||
|
||||
# Main data structure types
|
||||
UserEvaluations: TypeAlias = Dict[UserID, Dict[ItemID, EvaluationScore]]
|
||||
UserRankings: TypeAlias = Dict[UserID, List[ItemID]]
|
||||
AggregatedScores: TypeAlias = Dict[ItemID, AggregatedScore]
|
||||
|
||||
|
||||
class AggregationStrategy(Enum):
|
||||
"""Enumeration of available aggregation strategies."""
|
||||
|
||||
# Individual Predictions
|
||||
AVG_PREDICTIONS = "avg_predictions"
|
||||
LEAST_MISERY = "least_misery"
|
||||
MOST_PLEASURE = "most_pleasure"
|
||||
MOST_RESPECTED_PERSON = "most_respected_person"
|
||||
|
||||
# Individual Preferences
|
||||
ADDITIVE_UTILITARIAN = "additive_utilitarian"
|
||||
MULTIPLICATIVE = "multiplicative"
|
||||
BORDA_COUNT = "borda_count"
|
||||
|
||||
|
||||
class ScoreAggregator:
|
||||
"""
|
||||
A class for aggregating individual predictions or preferences into collective scores.
|
||||
|
||||
Supports two main approaches:
|
||||
1. Individual Predictions: AVG, LM, MP, MRP
|
||||
2. Individual Preferences: AVG, ADD, MUL, BRC
|
||||
|
||||
Felfernig, A., Boratto, L., Stettinger, M., Tkali, M.: Group Recommender Systems:
|
||||
An Introduction. Springer Publishing Company, Incorporated, 1st edn. (2018)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, most_respected_person: Optional[UserID] = None):
|
||||
"""
|
||||
Initialize the ScoreAggregator.
|
||||
|
||||
Args:
|
||||
most_respected_person: User ID of the most respected person (required for MRP strategy)
|
||||
"""
|
||||
self.most_respected_person = most_respected_person
|
||||
|
||||
def aggregate_scores(
|
||||
self,
|
||||
evaluations: UserEvaluations,
|
||||
strategy: AggregationStrategy,
|
||||
rankings: Optional[UserRankings] = None,
|
||||
) -> AggregatedScores:
|
||||
"""
|
||||
Aggregate individual evaluations into collective scores.
|
||||
|
||||
Args:
|
||||
evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
|
||||
strategy: Aggregation strategy to use
|
||||
rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)
|
||||
|
||||
Returns:
|
||||
Dictionary mapping item_id -> aggregated_score
|
||||
"""
|
||||
if not evaluations:
|
||||
return {}
|
||||
|
||||
# Get all items across all users
|
||||
all_items: set[ItemID] = set()
|
||||
for user_evals in evaluations.values():
|
||||
all_items.update(user_evals.keys())
|
||||
|
||||
result: AggregatedScores = {}
|
||||
|
||||
for item in all_items:
|
||||
if strategy == AggregationStrategy.AVG_PREDICTIONS:
|
||||
result[item] = self._avg_predictions(evaluations, item)
|
||||
elif strategy == AggregationStrategy.LEAST_MISERY:
|
||||
result[item] = self._least_misery(evaluations, item)
|
||||
elif strategy == AggregationStrategy.MOST_PLEASURE:
|
||||
result[item] = self._most_pleasure(evaluations, item)
|
||||
elif strategy == AggregationStrategy.MOST_RESPECTED_PERSON:
|
||||
result[item] = self._most_respected_person(evaluations, item)
|
||||
elif strategy == AggregationStrategy.ADDITIVE_UTILITARIAN:
|
||||
result[item] = self._additive_utilitarian(evaluations, item)
|
||||
elif strategy == AggregationStrategy.MULTIPLICATIVE:
|
||||
result[item] = self._multiplicative(evaluations, item)
|
||||
elif strategy == AggregationStrategy.BORDA_COUNT:
|
||||
if rankings is None:
|
||||
raise ValueError("Rankings required for Borda Count strategy")
|
||||
result[item] = self._borda_count(rankings, item)
|
||||
else:
|
||||
raise ValueError(f"Unknown aggregation strategy: {strategy}")
|
||||
|
||||
return result
|
||||
|
||||
def get_top_recommendation(
|
||||
self,
|
||||
evaluations: UserEvaluations,
|
||||
strategy: AggregationStrategy,
|
||||
rankings: Optional[UserRankings] = None,
|
||||
) -> ItemID:
|
||||
"""
|
||||
Get the top recommended item based on aggregated scores.
|
||||
|
||||
Args:
|
||||
evaluations: Dictionary mapping user_id -> {item_id: evaluation_score}
|
||||
strategy: Aggregation strategy to use
|
||||
rankings: Dictionary mapping user_id -> [ordered_list_of_items] (required for Borda Count)
|
||||
|
||||
Returns:
|
||||
Item ID with highest aggregated score
|
||||
"""
|
||||
aggregated_scores = self.aggregate_scores(evaluations, strategy, rankings)
|
||||
return max(aggregated_scores.items(), key=lambda x: x[1])[0]
|
||||
|
||||
def _avg_predictions(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Average of item-specific evaluations."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return np.mean(item_evals) if item_evals else 0.0 # type: ignore
|
||||
|
||||
def _least_misery(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Minimum item-specific evaluation."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return min(item_evals) if item_evals else 0.0
|
||||
|
||||
def _most_pleasure(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Maximum item-specific evaluation."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return max(item_evals) if item_evals else 0.0
|
||||
|
||||
def _most_respected_person(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Item-evaluations of most respected user."""
|
||||
if self.most_respected_person is None:
|
||||
raise ValueError("Most respected person not specified")
|
||||
if self.most_respected_person not in evaluations:
|
||||
raise ValueError(
|
||||
f"Most respected person '{self.most_respected_person}' not found in evaluations"
|
||||
)
|
||||
return evaluations[self.most_respected_person].get(item, 0.0)
|
||||
|
||||
def _avg_preferences(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Average of item-specific evaluations (same as avg_predictions)."""
|
||||
return self._avg_predictions(evaluations, item)
|
||||
|
||||
def _additive_utilitarian(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Sum of item-specific evaluations."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
return sum(item_evals)
|
||||
|
||||
def _multiplicative(
|
||||
self, evaluations: UserEvaluations, item: ItemID
|
||||
) -> AggregatedScore:
|
||||
"""Multiplication of item-specific evaluations."""
|
||||
item_evals = [
|
||||
user_evals.get(item, 0)
|
||||
for user_evals in evaluations.values()
|
||||
if item in user_evals
|
||||
]
|
||||
if not item_evals:
|
||||
return 0.0
|
||||
result = 1.0
|
||||
for eval_score in item_evals:
|
||||
result *= eval_score
|
||||
return result
|
||||
|
||||
def _borda_count(self, rankings: UserRankings, item: ItemID) -> AggregatedScore:
|
||||
"""Sum of item-specific scores derived from item ranking."""
|
||||
total_score = 0.0
|
||||
for user_ranking in rankings.values():
|
||||
if item in user_ranking:
|
||||
# Score is based on position in ranking (higher position = higher score)
|
||||
position = user_ranking.index(item)
|
||||
score = len(user_ranking) - position - 1 # Reverse position for score
|
||||
total_score += score
|
||||
return total_score
|
||||
@@ -0,0 +1,255 @@
|
||||
from mlxtend.preprocessing import TransactionEncoder
|
||||
from mlxtend.frequent_patterns import fpgrowth, association_rules
|
||||
import pandas as pd
|
||||
from pygrex.data_reader.data_reader import DataReader
|
||||
from typing import List, Optional, Union
|
||||
|
||||
|
||||
class AssociationRules:
|
||||
"""
|
||||
A class to represent association rules mining for recommendation systems.
|
||||
|
||||
This class implements association rules mining using the FP-Growth algorithm
|
||||
to discover frequent itemsets and generate association rules from user-item
|
||||
interaction data. It can be used to find patterns in user behavior and
|
||||
generate item recommendations based on item associations.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
data: DataReader,
|
||||
min_support: float = 0.2,
|
||||
min_confidence: float = 0.2,
|
||||
rating_threshold: float = 4.0,
|
||||
) -> None:
|
||||
"""Initialize the association rules miner with data and parameters.
|
||||
|
||||
Args:
|
||||
data: The DataReader object containing user-item interactions with ratings.
|
||||
min_support: Minimum support threshold for frequent itemsets.
|
||||
Must be between 0 and 1. Default is 0.2.
|
||||
min_confidence: Minimum confidence threshold for association rules.
|
||||
Must be between 0 and 1. Default is 0.2.
|
||||
rating_threshold: Minimum rating threshold to consider an interaction
|
||||
as positive. Default is 4.0.
|
||||
|
||||
Raises:
|
||||
ValueError: If support, confidence, or rating_threshold values are invalid.
|
||||
"""
|
||||
self._validate_parameters(min_support, min_confidence, rating_threshold)
|
||||
|
||||
self.data = data
|
||||
self.min_support = min_support
|
||||
self.min_confidence = min_confidence
|
||||
self.rating_threshold = rating_threshold
|
||||
self._frequent_itemsets: Optional[pd.DataFrame] = None
|
||||
self._association_rules: Optional[pd.DataFrame] = None
|
||||
|
||||
def _validate_parameters(
|
||||
self, min_support: float, min_confidence: float, rating_threshold: float
|
||||
) -> None:
|
||||
"""Validate initialization parameters.
|
||||
|
||||
Args:
|
||||
min_support: Minimum support threshold to validate.
|
||||
min_confidence: Minimum confidence threshold to validate.
|
||||
rating_threshold: Rating threshold to validate.
|
||||
|
||||
Raises:
|
||||
ValueError: If any parameter is invalid.
|
||||
"""
|
||||
if not (0 < min_support <= 1):
|
||||
raise ValueError("min_support must be between 0 and 1")
|
||||
if not (0 < min_confidence <= 1):
|
||||
raise ValueError("min_confidence must be between 0 and 1")
|
||||
if rating_threshold < 0:
|
||||
raise ValueError("rating_threshold must be non-negative")
|
||||
|
||||
def get_df_filtered_by_rating_threshold(self) -> pd.DataFrame:
|
||||
df = self.data.dataset.copy()
|
||||
# Filter interactions based on rating threshold
|
||||
df_filtered = df[df["rating"] >= self.rating_threshold]
|
||||
|
||||
if df_filtered.empty:
|
||||
raise ValueError(
|
||||
f"No interactions found with rating >= {self.rating_threshold}"
|
||||
)
|
||||
return df_filtered
|
||||
|
||||
def _prepare_transactions(self) -> List[List[str]]:
|
||||
"""Prepare transaction data from the dataset.
|
||||
|
||||
Filters the dataset based on rating threshold and groups items
|
||||
by user to create transaction lists.
|
||||
|
||||
Returns:
|
||||
A list of transactions, where each transaction is a list of item IDs
|
||||
that a user has positively interacted with.
|
||||
"""
|
||||
df_filtered = self.get_df_filtered_by_rating_threshold()
|
||||
# Group items by user to create transactions
|
||||
transactions = df_filtered.groupby("userId")["itemId"].apply(list).tolist()
|
||||
|
||||
# Convert item IDs to strings for consistency
|
||||
transactions = [
|
||||
[str(item) for item in transaction] for transaction in transactions
|
||||
]
|
||||
|
||||
return transactions
|
||||
|
||||
def _mine_frequent_itemsets(
|
||||
self, transactions: List[List[Union[str, int]]]
|
||||
) -> pd.DataFrame:
|
||||
"""Mine frequent itemsets using FP-Growth algorithm.
|
||||
|
||||
Args:
|
||||
transactions: List of transactions to mine frequent itemsets from.
|
||||
|
||||
Returns:
|
||||
DataFrame containing frequent itemsets with their support values.
|
||||
|
||||
Raises:
|
||||
ValueError: If no frequent itemsets are found.
|
||||
"""
|
||||
# Encode transactions into binary matrix
|
||||
transaction_encoder = TransactionEncoder()
|
||||
transaction_matrix = transaction_encoder.fit_transform(transactions)
|
||||
|
||||
df_encoded = pd.DataFrame(
|
||||
transaction_matrix, # type: ignore
|
||||
columns=transaction_encoder.columns_,
|
||||
)
|
||||
|
||||
# Apply FP-Growth to find frequent itemsets
|
||||
frequent_itemsets = fpgrowth(
|
||||
df_encoded, min_support=self.min_support, use_colnames=True
|
||||
)
|
||||
|
||||
if frequent_itemsets.empty:
|
||||
raise ValueError(
|
||||
f"No frequent itemsets found with min_support={self.min_support}"
|
||||
)
|
||||
|
||||
return frequent_itemsets
|
||||
|
||||
def _generate_association_rules(
|
||||
self, frequent_itemsets: pd.DataFrame
|
||||
) -> pd.DataFrame:
|
||||
"""Generate association rules from frequent itemsets.
|
||||
|
||||
Args:
|
||||
frequent_itemsets: DataFrame containing frequent itemsets.
|
||||
|
||||
Returns:
|
||||
DataFrame containing association rules with their metrics.
|
||||
|
||||
Raises:
|
||||
ValueError: If no association rules are found.
|
||||
"""
|
||||
rules = association_rules(
|
||||
frequent_itemsets, metric="confidence", min_threshold=self.min_confidence
|
||||
)
|
||||
|
||||
if rules.empty:
|
||||
raise ValueError(
|
||||
f"No association rules found with min_confidence={self.min_confidence}"
|
||||
)
|
||||
|
||||
return rules
|
||||
|
||||
def compute(self) -> pd.DataFrame:
|
||||
"""Compute association rules from the dataset.
|
||||
|
||||
This method performs the complete association rules mining process:
|
||||
1. Prepares transactions from the dataset
|
||||
2. Mines frequent itemsets using FP-Growth
|
||||
3. Generates association rules from frequent itemsets
|
||||
|
||||
Returns:
|
||||
DataFrame containing association rules with metrics including
|
||||
antecedents, consequents, support, confidence, lift, etc.
|
||||
|
||||
Raises:
|
||||
ValueError: If the dataset is empty, no transactions meet the
|
||||
criteria, or no rules can be generated with the given parameters.
|
||||
"""
|
||||
if self.data.dataset.empty:
|
||||
raise ValueError("Dataset is empty")
|
||||
|
||||
# Prepare transactions
|
||||
transactions = self._prepare_transactions()
|
||||
|
||||
if not transactions:
|
||||
raise ValueError("No transactions found after filtering")
|
||||
|
||||
# Mine frequent itemsets
|
||||
self._frequent_itemsets = self._mine_frequent_itemsets(transactions) # type: ignore
|
||||
|
||||
# Generate association rules
|
||||
self._association_rules = self._generate_association_rules(
|
||||
self._frequent_itemsets
|
||||
)
|
||||
|
||||
return self._association_rules
|
||||
|
||||
def get_frequent_itemsets(self) -> Optional[pd.DataFrame]:
|
||||
"""Get the computed frequent itemsets.
|
||||
|
||||
Returns:
|
||||
DataFrame containing frequent itemsets if compute() has been called,
|
||||
None otherwise.
|
||||
"""
|
||||
return self._frequent_itemsets
|
||||
|
||||
def get_recommendations_for_items(
|
||||
self, items: List[Union[str, int]], top_k: int = 10
|
||||
) -> pd.DataFrame:
|
||||
"""Get item recommendations based on association rules.
|
||||
|
||||
Args:
|
||||
items: List of item IDs to get recommendations for.
|
||||
top_k: Maximum number of recommendations to return. Default is 10.
|
||||
|
||||
Returns:
|
||||
DataFrame containing recommended items sorted by confidence.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If compute() hasn't been called yet.
|
||||
ValueError: If items list is empty.
|
||||
"""
|
||||
if self._association_rules is None:
|
||||
raise RuntimeError("Must call compute() before getting recommendations")
|
||||
|
||||
if not items:
|
||||
raise ValueError("Items list cannot be empty")
|
||||
|
||||
items_set = set(str(item) for item in items)
|
||||
|
||||
# Filter rules where antecedents match the given items
|
||||
matching_rules = self._association_rules[
|
||||
self._association_rules["antecedents"].apply(
|
||||
lambda x: items_set.issubset(set(str(item) for item in x))
|
||||
)
|
||||
]
|
||||
|
||||
if matching_rules.empty:
|
||||
return pd.DataFrame()
|
||||
|
||||
# Sort by confidence and return top_k recommendations
|
||||
recommendations = matching_rules.nlargest(top_k, "confidence")
|
||||
|
||||
return recommendations[
|
||||
["antecedents", "consequents", "confidence", "lift", "support"]
|
||||
]
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""Return string representation of the AssociationRules object."""
|
||||
return (
|
||||
f"AssociationRules(min_support={self.min_support}, "
|
||||
f"min_confidence={self.min_confidence}, "
|
||||
f"rating_threshold={self.rating_threshold})"
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""Return detailed string representation of the AssociationRules object."""
|
||||
return self.__str__()
|
||||
@@ -0,0 +1,17 @@
|
||||
import torch
|
||||
|
||||
|
||||
class EMFLoss(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super(EMFLoss, self).__init__()
|
||||
|
||||
def forward(self, ratings_pred, ratings, u, v, reg_term, expl, expl_reg_term):
|
||||
|
||||
mse = (ratings - ratings_pred.view(-1)) ** 2
|
||||
u_l2 = reg_term * torch.norm(u, 2, -1)
|
||||
v_l2 = reg_term * torch.norm(v, 2, -1)
|
||||
expl_constraint = expl_reg_term * torch.norm(u - v, 1, -1) * expl
|
||||
|
||||
loss = mse + u_l2 + v_l2 + expl_constraint
|
||||
|
||||
return loss.mean()
|
||||
@@ -0,0 +1,80 @@
|
||||
from itertools import combinations
|
||||
import numpy as np
|
||||
|
||||
|
||||
def _get_explanation_feature_set(explanation, explainer_type, details=None):
|
||||
"""Helper to extract a consistent feature set from different explanation types."""
|
||||
if explainer_type == "Sliding Window":
|
||||
return set(explanation.get("items", []))
|
||||
elif explainer_type == "EXPGRS":
|
||||
if details is not None:
|
||||
return set(details.get("antecedent", frozenset()))
|
||||
else:
|
||||
return set()
|
||||
elif explainer_type == "LORE4Groups":
|
||||
rules_data = explanation.get("group_factual_rule", {})
|
||||
if isinstance(rules_data, dict):
|
||||
return set(
|
||||
rule for tier_rules in rules_data.values() for rule in tier_rules
|
||||
)
|
||||
elif isinstance(rules_data, list):
|
||||
return set(rules_data)
|
||||
return set()
|
||||
|
||||
|
||||
def calculate_gild_for_explanations(explanations_dict, explainer_type, use_median=True):
|
||||
"""Calculate Gaussian Inter-List Diversity (GILD) for a set of explanations."""
|
||||
|
||||
if not explanations_dict or len(explanations_dict) < 2:
|
||||
return 0.0
|
||||
|
||||
feature_sets = []
|
||||
if explainer_type == "EXPGRS":
|
||||
for item_id, rules_list in explanations_dict.items():
|
||||
if rules_list:
|
||||
feature_sets.append(
|
||||
_get_explanation_feature_set(
|
||||
None, explainer_type, details=rules_list[0]
|
||||
)
|
||||
)
|
||||
elif explainer_type == "Sliding Window":
|
||||
for call, exp_data in explanations_dict.items():
|
||||
feature_sets.append(_get_explanation_feature_set(exp_data, explainer_type))
|
||||
elif explainer_type == "LORE4Groups":
|
||||
for item_id, exp_data in explanations_dict.items():
|
||||
feature_sets.append(_get_explanation_feature_set(exp_data, explainer_type))
|
||||
|
||||
feature_sets = [fs for fs in feature_sets if fs]
|
||||
if len(feature_sets) < 2:
|
||||
return 0.0
|
||||
|
||||
# Calculate pairwise Jaccard distances
|
||||
distances = []
|
||||
for set1, set2 in combinations(feature_sets, 2):
|
||||
intersection_len = len(set1.intersection(set2))
|
||||
union_len = len(set1.union(set2))
|
||||
jaccard_dist = 1.0 - (intersection_len / union_len) if union_len > 0 else 1.0
|
||||
distances.append(jaccard_dist)
|
||||
|
||||
if not distances:
|
||||
return 0.0
|
||||
|
||||
# Calculate sigma using paper's formula
|
||||
k_choose_2 = len(distances)
|
||||
if use_median:
|
||||
reference_dist = np.median(distances)
|
||||
else:
|
||||
reference_dist = min(distances)
|
||||
|
||||
denominator = np.sqrt(2 * np.log(k_choose_2 - 1)) if k_choose_2 > 1 else 1.0
|
||||
sigma = reference_dist / denominator if denominator > 0 else reference_dist
|
||||
if sigma == 0:
|
||||
sigma = 1e-9
|
||||
kernel_distances_sum = 0.0
|
||||
for d in distances:
|
||||
kernel_distance = np.sqrt(2 - 2 * np.exp(-(d**2) / (2 * sigma**2)))
|
||||
kernel_distances_sum += kernel_distance
|
||||
|
||||
gild = kernel_distances_sum / k_choose_2 if distances else 0
|
||||
|
||||
return gild
|
||||
@@ -0,0 +1,138 @@
|
||||
from typing import List, Union, Optional
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
|
||||
class Scale:
|
||||
"""
|
||||
A class for scaling numerical values using different methods.
|
||||
|
||||
Methods:
|
||||
quantile: Scale values using quantile-based ranking.
|
||||
linear: Scale values linearly to a target range with outlier handling.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def quantile(
|
||||
raw_predictions: Union[List[float], np.ndarray],
|
||||
target_min: float = 1,
|
||||
target_max: float = 5,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Scale raw predictions to the target range using quantile-based ranking.
|
||||
|
||||
Args:
|
||||
raw_predictions: The raw prediction values.
|
||||
target_min: Minimum of the target range (default: 1).
|
||||
target_max: Maximum of the target range (default: 5).
|
||||
|
||||
Returns:
|
||||
numpy.ndarray: Scaled predictions.
|
||||
|
||||
Raises:
|
||||
ValueError: If raw_predictions is empty.
|
||||
"""
|
||||
if len(raw_predictions) == 0:
|
||||
raise ValueError("Raw predictions array is empty.")
|
||||
|
||||
# Convert to numpy array if it's not already
|
||||
raw_predictions = np.array(raw_predictions)
|
||||
|
||||
ranks = stats.rankdata(raw_predictions, method="average")
|
||||
if len(raw_predictions) == 1:
|
||||
# Handle single element case
|
||||
scaled_predictions = np.array([(target_min + target_max) / 2])
|
||||
else:
|
||||
scaled_predictions = target_min + (ranks - 1) * (
|
||||
target_max - target_min
|
||||
) / (len(raw_predictions) - 1)
|
||||
|
||||
# Ensure scaled predictions are within [target_min, target_max]
|
||||
scaled_predictions = np.clip(scaled_predictions, target_min, target_max)
|
||||
|
||||
return scaled_predictions
|
||||
|
||||
@staticmethod
|
||||
def linear(
|
||||
raw_predictions: Union[List[float], np.ndarray],
|
||||
target_min: float = 1,
|
||||
target_max: float = 5,
|
||||
ref_min: Optional[float] = None,
|
||||
ref_max: Optional[float] = None,
|
||||
handle_outliers: bool = True,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Scale raw predictions to the target range [target_min, target_max].
|
||||
|
||||
Args:
|
||||
raw_predictions: The raw prediction values.
|
||||
target_min: Minimum of the target range (default: 1).
|
||||
target_max: Maximum of the target range (default: 5).
|
||||
ref_min: Reference minimum for raw predictions. If None, will be calculated
|
||||
from the data or from outlier bounds if handle_outliers=True.
|
||||
ref_max: Reference maximum for raw predictions. If None, will be calculated
|
||||
from the data or from outlier bounds if handle_outliers=True.
|
||||
handle_outliers: Whether to handle outliers using IQR method (default: True).
|
||||
|
||||
Returns:
|
||||
numpy.ndarray: Scaled predictions.
|
||||
|
||||
Raises:
|
||||
ValueError: If raw_predictions is empty.
|
||||
"""
|
||||
if len(raw_predictions) == 0:
|
||||
raise ValueError("Raw predictions array is empty.")
|
||||
|
||||
# Convert to numpy array if it's not already
|
||||
raw_predictions = np.array(raw_predictions)
|
||||
|
||||
# Handle single element case
|
||||
if len(raw_predictions) == 1:
|
||||
if ref_min is not None and ref_max is not None:
|
||||
# Scale based on provided reference range
|
||||
value = raw_predictions[0]
|
||||
scaled_value = (
|
||||
target_min
|
||||
+ (value - ref_min)
|
||||
* (target_max - target_min)
|
||||
/ (ref_max - ref_min)
|
||||
if ref_max != ref_min
|
||||
else (target_min + target_max) / 2
|
||||
)
|
||||
scaled_value = np.clip(scaled_value, target_min, target_max)
|
||||
return np.array([scaled_value])
|
||||
else:
|
||||
# Can't determine range from single value, return middle of target range
|
||||
return np.array([(target_min + target_max) / 2])
|
||||
|
||||
clipped_predictions = raw_predictions.copy()
|
||||
|
||||
# Handle outliers if requested
|
||||
if handle_outliers:
|
||||
q1, q3 = np.percentile(raw_predictions, [25, 75])
|
||||
iqr = q3 - q1
|
||||
lower_bound = q1 - 1.5 * iqr
|
||||
upper_bound = q3 + 1.5 * iqr
|
||||
clipped_predictions = np.clip(raw_predictions, lower_bound, upper_bound)
|
||||
|
||||
# Determine min and max values
|
||||
min_raw = np.min(clipped_predictions)
|
||||
max_raw = np.max(clipped_predictions)
|
||||
|
||||
# Use provided reference bounds if given, otherwise use data bounds
|
||||
actual_ref_min = ref_min if ref_min is not None else min_raw
|
||||
actual_ref_max = ref_max if ref_max is not None else max_raw
|
||||
|
||||
# Scale to [target_min, target_max]
|
||||
if actual_ref_max == actual_ref_min:
|
||||
# Reference bounds are equal, return the middle of the target range
|
||||
return np.full_like(raw_predictions, (target_min + target_max) / 2)
|
||||
else:
|
||||
scaled_predictions = target_min + (raw_predictions - actual_ref_min) * (
|
||||
target_max - target_min
|
||||
) / (actual_ref_max - actual_ref_min)
|
||||
|
||||
# Ensure scaled predictions are within [target_min, target_max]
|
||||
scaled_predictions = np.clip(scaled_predictions, target_min, target_max)
|
||||
|
||||
return scaled_predictions
|
||||
@@ -0,0 +1,90 @@
|
||||
from typing import List, Optional, TypeVar, Generic, Iterator
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class SlidingWindow(Generic[T]):
|
||||
"""Class for creating and managing sliding windows over a sequence.
|
||||
|
||||
This class provides functionality to iterate through windows of a fixed size
|
||||
over a sequence of items.
|
||||
"""
|
||||
|
||||
def __init__(self, sequence: List[T], window_size: int):
|
||||
"""Initialize the sliding window.
|
||||
|
||||
Args:
|
||||
sequence: The sequence of items to slide over
|
||||
window_size: The size of each window (must be positive)
|
||||
|
||||
Raises:
|
||||
ValueError: If window_size is less than 1
|
||||
TypeError: If sequence is not iterable
|
||||
"""
|
||||
if window_size < 1:
|
||||
raise ValueError("Window size must be at least 1")
|
||||
|
||||
if not hasattr(sequence, "__iter__"):
|
||||
raise TypeError("Sequence must be iterable")
|
||||
|
||||
self.sequence = sequence
|
||||
self.window_size = window_size
|
||||
self.index = 0
|
||||
self.max_index = len(sequence) - window_size + 1 if sequence else 0
|
||||
|
||||
def get_next_window(self) -> Optional[List[T]]:
|
||||
"""Return the next window and advance the current position.
|
||||
|
||||
Returns:
|
||||
A list containing the next window of items, or None if all windows
|
||||
have been processed.
|
||||
"""
|
||||
if self.index >= self.max_index:
|
||||
return None
|
||||
|
||||
window = self.sequence[self.index : self.index + self.window_size]
|
||||
self.index += 1
|
||||
return window
|
||||
|
||||
def reset(self) -> None:
|
||||
"""Reset the window position to the beginning of the sequence."""
|
||||
self.index = 0
|
||||
|
||||
def has_next(self) -> bool:
|
||||
"""Check if there are more windows available.
|
||||
|
||||
Returns:
|
||||
True if there are more windows, False otherwise.
|
||||
"""
|
||||
return self.index < self.max_index
|
||||
|
||||
def __iter__(self) -> Iterator[List[T]]:
|
||||
"""Make the class iterable.
|
||||
|
||||
Returns:
|
||||
An iterator over all windows in the sequence.
|
||||
"""
|
||||
self.reset()
|
||||
return self
|
||||
|
||||
def __next__(self) -> List[T]:
|
||||
"""Get the next window for iteration.
|
||||
|
||||
Returns:
|
||||
The next window as a list.
|
||||
|
||||
Raises:
|
||||
StopIteration: When all windows have been processed.
|
||||
"""
|
||||
window = self.get_next_window()
|
||||
if window is None:
|
||||
raise StopIteration
|
||||
return window
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Return the total number of windows.
|
||||
|
||||
Returns:
|
||||
The number of complete windows in the sequence.
|
||||
"""
|
||||
return max(0, self.max_index)
|
||||
@@ -0,0 +1,631 @@
|
||||
import operator
|
||||
from typing import Any, Dict, List, Union, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.signal import (
|
||||
find_peaks,
|
||||
peak_widths,
|
||||
)
|
||||
|
||||
from pygrex.data_reader import DataReader
|
||||
|
||||
|
||||
class SlidingWindowRanker:
|
||||
"""
|
||||
Stratigi, M., Bikakis, N., Stefanidis, K.: Counterfactual explanations for group
|
||||
recommendations. In: Proceedings of the 27th International Workshop on Design,
|
||||
Optimization, Languages and Analytical Processing of Big Data (DOLAP 2025)
|
||||
"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
|
||||
"""
|
||||
Initialize the SlidingWindowRanker.
|
||||
|
||||
Args:
|
||||
config: Configuration parameters for the evaluator
|
||||
"""
|
||||
self.config = config
|
||||
self.group_predictions: Optional[
|
||||
Dict[Union[str, int], Dict[Union[str, int], float]]
|
||||
] = None
|
||||
self.top_recommendation: Optional[Union[str, int]] = None
|
||||
|
||||
def set_group_recommender_values(
|
||||
self,
|
||||
group_predictions: Dict[Union[str, int], Dict[Union[str, int], float]],
|
||||
top_recommendation: Union[str, int],
|
||||
) -> None:
|
||||
"""
|
||||
Set group recommender values.
|
||||
|
||||
Args:
|
||||
group_predictions: Dictionary mapping user IDs to their item predictions
|
||||
top_recommendation: List of top recommended items for the group
|
||||
"""
|
||||
self.group_predictions = group_predictions
|
||||
self.top_recommendation = top_recommendation
|
||||
|
||||
def evaluate(self, data: DataReader) -> Dict[str, Any]:
|
||||
"""
|
||||
Evaluate the data using the Stratigis evaluator.
|
||||
|
||||
Args:
|
||||
data: DataReader object containing dataset and transformation methods
|
||||
|
||||
Returns:
|
||||
Dictionary with evaluation metrics
|
||||
"""
|
||||
# Implementation would go here
|
||||
return {}
|
||||
|
||||
def calculate_item_popularity_score(
|
||||
self, items: List[Union[str, int]], data: DataReader
|
||||
) -> Dict[Union[str, int], float]:
|
||||
"""
|
||||
Calculate the normalized popularity of each item based on the number of interactions received.
|
||||
|
||||
Args:
|
||||
items: List of item IDs
|
||||
data: Data object containing the dataset and transformation methods
|
||||
|
||||
Returns:
|
||||
Dictionary with item IDs as keys and normalized popularity (0-1) as values
|
||||
"""
|
||||
# Calculate popularity (number of interactions) for each item
|
||||
popularity_counts = {}
|
||||
for item_id in items:
|
||||
internal_item_id = data.get_new_item_id(item_id)
|
||||
count = len(data.dataset[data.dataset["itemId"] == internal_item_id])
|
||||
popularity_counts[item_id] = count
|
||||
|
||||
# Find min and max values for normalization
|
||||
min_count = min(popularity_counts.values()) if popularity_counts else 0
|
||||
max_count = max(popularity_counts.values()) if popularity_counts else 0
|
||||
|
||||
# Add 1% padding to the range
|
||||
range_value = max_count - min_count
|
||||
padded_range = range_value + (
|
||||
range_value / 50
|
||||
) # Add 2% to range (1% on each end)
|
||||
padded_min = min_count - (
|
||||
range_value / 100
|
||||
) # Subtract 1% of range from minimum
|
||||
|
||||
if padded_range == 0:
|
||||
padded_range = 1 # Avoid division by zero
|
||||
|
||||
# Normalize popularity values to [0,1]
|
||||
popularity_mask = {}
|
||||
for item_id, count in popularity_counts.items():
|
||||
popularity_mask[item_id] = (count - padded_min) / padded_range
|
||||
|
||||
return popularity_mask
|
||||
|
||||
def calculate_relevance_mask(
|
||||
self,
|
||||
target_item_id: Union[str, int],
|
||||
) -> Dict[Union[str, int], float]:
|
||||
"""
|
||||
Create a mapping between users and their prediction scores for a specific target item.
|
||||
|
||||
Args:
|
||||
target_item_id :The ID of the item for which prediction scores are needed
|
||||
|
||||
Returns:
|
||||
Dictionary mapping user IDs to their predicted scores for the target item
|
||||
Note: Users without a prediction for the target item will have a value of 0
|
||||
|
||||
Examples
|
||||
>>> user_preds = {'user1': {'item1': 4.5, 'item2': 3.2}, 'user2': {'item2': 2.8}}
|
||||
>>> evaluator.set_group_recommender_values(user_preds,top_recommendation)
|
||||
>>> evaluator.calculate_relevance_mask('item1')
|
||||
{'user1': 4.5, 'user2': 0}
|
||||
"""
|
||||
|
||||
if self.group_predictions is None:
|
||||
raise ValueError(
|
||||
"User predictions not set. Call set_group_recommender_values first."
|
||||
)
|
||||
|
||||
individual_predictions = {}
|
||||
|
||||
for user_id, predictions in self.group_predictions.items():
|
||||
# Get the prediction for the target item if it exists, otherwise default to 0
|
||||
individual_predictions[user_id] = predictions.get(target_item_id, 0)
|
||||
|
||||
return individual_predictions
|
||||
|
||||
def calculate_relevance_score(
|
||||
self,
|
||||
item_id: Union[str, int],
|
||||
data: DataReader,
|
||||
prediction_scores: Dict[Union[str, int], float],
|
||||
members: List[Union[str, int]],
|
||||
rating_scale: tuple = (0, 5), # Default rating scale
|
||||
) -> float:
|
||||
"""
|
||||
Calculate the normalized average prediction score for an item based on group members' predictions.
|
||||
|
||||
Agrs
|
||||
item_id: ID of the item to calculate relevance for
|
||||
data : DataReader object containing dataset and ID mapping methods
|
||||
prediction_scores : Dictionary mapping user IDs to their prediction scores for items
|
||||
members : List of user IDs in the group
|
||||
rating_scale: Tuple indicating (min_rating, max_rating) for normalization
|
||||
|
||||
Returns
|
||||
Normalized average prediction score in range [0,1]
|
||||
Returns 0 if no users in the group have interacted with the item
|
||||
|
||||
Notes
|
||||
1. Calculates the average prediction score for the item from group members
|
||||
2. Normalizes the score to [0,1] range with 1% padding
|
||||
"""
|
||||
total_score = 0
|
||||
valid_users_count = 0
|
||||
internal_item_id = data.get_new_item_id(item_id)
|
||||
|
||||
for user_id in members:
|
||||
# Convert user ID to internal format
|
||||
internal_user_id = (
|
||||
data.get_new_user_id(int(user_id))
|
||||
if isinstance(user_id, (int, np.integer))
|
||||
else user_id
|
||||
)
|
||||
|
||||
# Check if user has interacted with the item
|
||||
user_item_data = data.dataset[
|
||||
(data.dataset["userId"] == internal_user_id)
|
||||
& (data.dataset["itemId"] == internal_item_id)
|
||||
]
|
||||
|
||||
if user_item_data.empty:
|
||||
continue
|
||||
|
||||
# Get the prediction score for this user
|
||||
if user_id in prediction_scores:
|
||||
total_score += prediction_scores[user_id]
|
||||
valid_users_count += 1
|
||||
|
||||
# Return 0 if no valid users found
|
||||
if valid_users_count == 0:
|
||||
return 0
|
||||
|
||||
# Calculate average score
|
||||
average_score = total_score / valid_users_count
|
||||
|
||||
# Normalize to [0,1] with 1% padding
|
||||
min_value, max_value = rating_scale
|
||||
range_value = max_value - min_value
|
||||
padded_range = range_value + (
|
||||
range_value / 50
|
||||
) # Add 2% to range (1% on each end)
|
||||
padded_min = min_value - (
|
||||
range_value / 100
|
||||
) # Subtract 1% of range from minimum
|
||||
|
||||
if padded_range == 0:
|
||||
return 0.0
|
||||
|
||||
normalized_score = (average_score - padded_min) / padded_range
|
||||
return float(normalized_score)
|
||||
|
||||
def calculate_item_intensity_score(
|
||||
self, item_id: Union[str, int], members: List[Union[str, int]], data: DataReader
|
||||
) -> float:
|
||||
"""
|
||||
Calculate what proportion of group members have interacted with the specified item.
|
||||
|
||||
Args
|
||||
item_id : ID of the item to calculate interaction rate for
|
||||
members : List of user IDs in the group
|
||||
data : DataReader object containing dataset and ID mapping methods
|
||||
|
||||
Returns
|
||||
Proportion of group members who have interacted with the item (range [0,1])
|
||||
0 means no group members have interacted with the item
|
||||
1 means all group members have interacted with the item
|
||||
"""
|
||||
# Convert item ID to internal format
|
||||
if data is None:
|
||||
print("Error: DataReader object is None. Cannot convert item_id.")
|
||||
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
|
||||
internal_item_id = data.get_new_item_id(item_id)
|
||||
|
||||
# Convert all user IDs to internal format
|
||||
internal_members = [data.get_new_user_id(user_id) for user_id in members]
|
||||
|
||||
# Count how many users have interacted with the item
|
||||
interaction_count = len(
|
||||
data.dataset[
|
||||
(data.dataset.itemId == internal_item_id)
|
||||
& data.dataset.userId.isin(internal_members)
|
||||
]
|
||||
)
|
||||
|
||||
# Calculate proportion of group members who interacted with item
|
||||
if not members:
|
||||
return 0 # Avoid division by zero if no members
|
||||
|
||||
interaction_rate = interaction_count / len(members)
|
||||
return interaction_rate
|
||||
|
||||
def calculate_rating_score(
|
||||
self,
|
||||
item_id: Union[str, int],
|
||||
members: List[Union[str, int]],
|
||||
data: DataReader,
|
||||
rating_scale: tuple = (0, 5),
|
||||
) -> float:
|
||||
"""
|
||||
Calculate the normalized average rating given to an item by group members.
|
||||
|
||||
Args
|
||||
item_id : ID of the item to calculate average rating for
|
||||
data : DataReader object containing dataset and ID mapping methods
|
||||
members : List of user IDs in the group
|
||||
rating_scale: Tuple indicating (min_rating, max_rating) for normalization
|
||||
|
||||
Returns
|
||||
Normalized average rating in range [0,1]
|
||||
|
||||
Notes
|
||||
- Considers all group members in the denominator even if some haven't rated the item
|
||||
- Normalizes the resulting average to [0,1] with 1% padding
|
||||
"""
|
||||
# Convert item ID to internal format
|
||||
if data is None:
|
||||
print("Error: DataReader object is None. Cannot convert item_id.")
|
||||
return 0.0
|
||||
internal_item_id = data.get_new_item_id(item_id)
|
||||
|
||||
# Convert all user IDs to internal format
|
||||
internal_members = [data.get_new_user_id(user_id) for user_id in members]
|
||||
|
||||
# Get ratings from users who have rated this item
|
||||
rating_data = data.dataset[
|
||||
(data.dataset.itemId == internal_item_id)
|
||||
& data.dataset.userId.isin(internal_members)
|
||||
]
|
||||
|
||||
# Calculate average rating (sum of ratings divided by total group size)
|
||||
if len(members) == 0:
|
||||
return 0 # Avoid division by zero if no members
|
||||
|
||||
total_rating = rating_data["rating"].sum()
|
||||
average_rating = total_rating / len(members)
|
||||
|
||||
# Normalize to [0,1] with 1% padding
|
||||
min_value, max_value = rating_scale
|
||||
range_value = max_value - min_value
|
||||
padded_range = range_value + (
|
||||
range_value / 50
|
||||
) # Add 2% to range (1% on each end)
|
||||
padded_min = min_value - (
|
||||
range_value / 100
|
||||
) # Subtract 1% of range from minimum
|
||||
|
||||
if padded_range == 0:
|
||||
return 0.0
|
||||
|
||||
normalized_rating = (average_rating - padded_min) / padded_range
|
||||
return float(normalized_rating)
|
||||
|
||||
def calculate_trending_score(
|
||||
self,
|
||||
members: List[Union[str, int]],
|
||||
item_id: Union[str, int],
|
||||
data: Optional[DataReader] = None,
|
||||
peak_norm_min_height: float = 0.1,
|
||||
peak_norm_min_prominence: float = 0.05,
|
||||
peak_min_distance: int = 3,
|
||||
peak_width_rel_height: float = 0.5,
|
||||
) -> tuple[float, Dict[Union[str, int], float], pd.DataFrame]:
|
||||
"""
|
||||
Calculates a trending score for a user, using normalized data for hype period detection.
|
||||
|
||||
Args
|
||||
members : List of user IDs in the group
|
||||
item_id : ID of the item to calculate trending score for
|
||||
data : DataReader object containing dataset and ID mapping methods
|
||||
peak_norm_min_height : Minimum height of peaks in normalized data to consider as significant
|
||||
peak_norm_min_prominence : Minimum prominence of peaks in normalized data
|
||||
peak_min_distance : Minimum distance between peaks in months
|
||||
peak_width_rel_height : Relative height for peak width calculation
|
||||
|
||||
Returns
|
||||
tuple: (average_trending_score, individual_scores, hype_periods_for_item)
|
||||
average_trending_score: Average trending score across all group members (0-1)
|
||||
individual_scores: Dictionary mapping user IDs to their individual trending scores
|
||||
hype_periods_for_item: DataFrame containing detected hype periods for the item
|
||||
"""
|
||||
|
||||
if not members:
|
||||
print("Error: No group members provided for trending score calculation.")
|
||||
return 0.0, {}, pd.DataFrame()
|
||||
|
||||
_df = pd.DataFrame()
|
||||
if data is not None and isinstance(data, DataReader):
|
||||
_df = data.dataset.copy()
|
||||
else:
|
||||
if data is not None:
|
||||
print(
|
||||
f"Warning: data was provided but is not a DataReader object (type: {type(data)})."
|
||||
)
|
||||
|
||||
if _df.empty:
|
||||
print(
|
||||
"Error: The DataFrame (_df) is empty. Cannot calculate score or plot."
|
||||
)
|
||||
return 0.0, {}, pd.DataFrame()
|
||||
|
||||
required_columns = [
|
||||
"userId",
|
||||
"itemId",
|
||||
"rating",
|
||||
"timestamp",
|
||||
]
|
||||
missing_columns = [col for col in required_columns if col not in _df.columns]
|
||||
if missing_columns:
|
||||
print(
|
||||
f"Error: Missing required columns in DataFrame: {', '.join(missing_columns)}"
|
||||
)
|
||||
return 0.0, {}, pd.DataFrame()
|
||||
|
||||
try:
|
||||
if "timestamp_dt" not in _df.columns or _df["timestamp_dt"].isnull().all():
|
||||
_df["timestamp_dt"] = pd.to_datetime(_df["timestamp"], unit="s")
|
||||
if "year_month" not in _df.columns or _df["year_month"].isnull().all():
|
||||
_df["year_month"] = _df["timestamp_dt"].dt.to_period("M")
|
||||
except Exception as e:
|
||||
print(f"Error during timestamp conversion or year-month extraction: {e}")
|
||||
return 0.0, {}, pd.DataFrame()
|
||||
|
||||
if data is None: # Should not happen if _df is not empty, but as a safeguard
|
||||
return 0.0, {}, pd.DataFrame()
|
||||
|
||||
# Convert item ID to internal format
|
||||
internal_item_id = data.get_new_item_id(item_id)
|
||||
|
||||
# Convert all user IDs to internal format
|
||||
internal_members = [data.get_new_user_id(user_id) for user_id in members]
|
||||
|
||||
# Filter data for the specific item ID only
|
||||
item_df = _df[_df["itemId"] == internal_item_id]
|
||||
if item_df.empty:
|
||||
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
|
||||
|
||||
# movie_ratings_per_month contains original rating counts
|
||||
movie_ratings_per_month = (
|
||||
item_df.groupby(["itemId", "year_month"], observed=False)
|
||||
.size()
|
||||
.reset_index(name="rating_count")
|
||||
)
|
||||
|
||||
if movie_ratings_per_month.empty:
|
||||
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
|
||||
|
||||
hype_periods_for_item = None
|
||||
|
||||
# Process the specific item for hype period detection
|
||||
group_sorted = movie_ratings_per_month.sort_values("year_month").reset_index(
|
||||
drop=True
|
||||
)
|
||||
original_ratings = group_sorted["rating_count"].to_numpy()
|
||||
|
||||
# Normalization Step
|
||||
min_rating = np.min(original_ratings)
|
||||
max_rating = np.max(original_ratings)
|
||||
|
||||
normalized_ratings = None
|
||||
if (
|
||||
max_rating > min_rating
|
||||
): # Avoid division by zero if all ratings are the same
|
||||
normalized_ratings = (original_ratings - min_rating) / (
|
||||
max_rating - min_rating
|
||||
)
|
||||
elif len(original_ratings) > 0:
|
||||
normalized_ratings = np.zeros_like(original_ratings, dtype=float)
|
||||
else: # No ratings for this item in group_sorted (should not happen if groupby is correct)
|
||||
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
|
||||
|
||||
# Peak Detection on Normalized Data
|
||||
peaks_indices, properties = find_peaks(
|
||||
normalized_ratings,
|
||||
height=peak_norm_min_height,
|
||||
distance=peak_min_distance,
|
||||
prominence=peak_norm_min_prominence,
|
||||
)
|
||||
|
||||
hype_periods_list = []
|
||||
if len(peaks_indices) > 0:
|
||||
widths, _, left_ips, right_ips = peak_widths(
|
||||
normalized_ratings, peaks_indices, rel_height=peak_width_rel_height
|
||||
)
|
||||
|
||||
for i, peak_idx in enumerate(peaks_indices):
|
||||
start_idx = max(0, int(round(left_ips[i])))
|
||||
end_idx = min(len(group_sorted) - 1, int(round(right_ips[i])))
|
||||
|
||||
if start_idx <= end_idx:
|
||||
start_month = group_sorted.iloc[start_idx]["year_month"]
|
||||
end_month = group_sorted.iloc[end_idx]["year_month"]
|
||||
|
||||
hype_periods_list.append(
|
||||
{
|
||||
"itemId": item_id,
|
||||
"hype_start_month": start_month,
|
||||
"hype_end_month": end_month,
|
||||
"peak_month": group_sorted.iloc[peak_idx]["year_month"],
|
||||
"peak_rating_count_original": original_ratings[peak_idx],
|
||||
"peak_rating_count_normalized": normalized_ratings[
|
||||
peak_idx
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
if hype_periods_list:
|
||||
hype_periods_for_item = pd.DataFrame(hype_periods_list)
|
||||
else:
|
||||
return 0.0, {user_id: 0.0 for user_id in members}, pd.DataFrame()
|
||||
|
||||
# Calculate trending scores for each user in the group
|
||||
individual_scores = {}
|
||||
valid_scores = []
|
||||
|
||||
for idx, user_id in enumerate(internal_members):
|
||||
user_ratings = item_df[item_df["userId"] == user_id].copy()
|
||||
|
||||
if user_ratings.empty:
|
||||
individual_scores[members[idx]] = 0.0
|
||||
continue
|
||||
|
||||
# Merge user ratings with hype periods
|
||||
user_ratings_merged = pd.merge(
|
||||
user_ratings, hype_periods_for_item, on="itemId", how="left"
|
||||
)
|
||||
|
||||
user_ratings_merged["is_match"] = (
|
||||
(
|
||||
user_ratings_merged["year_month"]
|
||||
>= user_ratings_merged["hype_start_month"]
|
||||
)
|
||||
& (
|
||||
user_ratings_merged["year_month"]
|
||||
<= user_ratings_merged["hype_end_month"]
|
||||
)
|
||||
& user_ratings_merged["hype_start_month"].notna()
|
||||
)
|
||||
|
||||
if (
|
||||
not user_ratings_merged.empty
|
||||
and "is_match" in user_ratings_merged.columns
|
||||
):
|
||||
is_event_trending = user_ratings_merged.groupby(
|
||||
["userId", "itemId", "timestamp_dt"]
|
||||
)["is_match"].any()
|
||||
num_trending_ratings = is_event_trending.sum()
|
||||
total_unique_rating_events = len(is_event_trending)
|
||||
else:
|
||||
num_trending_ratings = 0
|
||||
total_unique_rating_events = len(
|
||||
user_ratings.drop_duplicates(
|
||||
subset=["userId", "itemId", "timestamp_dt"]
|
||||
)
|
||||
)
|
||||
|
||||
if total_unique_rating_events == 0:
|
||||
individual_scores[members[idx]] = 0.0
|
||||
else:
|
||||
trending_score = num_trending_ratings / total_unique_rating_events
|
||||
individual_scores[members[idx]] = trending_score
|
||||
valid_scores.append(trending_score)
|
||||
|
||||
# Calculate average trending score across all group members
|
||||
# Include users with 0.0 scores (no ratings for the item) in the average
|
||||
all_scores = [individual_scores[user_id] for user_id in members]
|
||||
average_trending_score = sum(all_scores) / len(members) if members else 0.0
|
||||
|
||||
return average_trending_score, individual_scores, hype_periods_for_item
|
||||
|
||||
def generate_ranked_items(
|
||||
self,
|
||||
all_rated_items: List[Union[str, int]],
|
||||
data: DataReader,
|
||||
group_members: List[Union[str, int]],
|
||||
component_weights: Optional[Dict[str, float]] = None,
|
||||
) -> tuple[List[Union[str, int]], Dict]:
|
||||
"""
|
||||
Ranks items based on multiple scoring factors for a group of users.
|
||||
|
||||
Calculates a composite score for each item based on:
|
||||
- Item popularity
|
||||
- Group preference intensity
|
||||
- Predicted ratings
|
||||
- Relevance to the group
|
||||
- Trends in the group
|
||||
|
||||
Args:
|
||||
candidate_items: List of items that at least one group member has interacted with
|
||||
data: The DataReader object containing user-item interactions
|
||||
group_members: List of user identifiers in the group
|
||||
component_weights: Optional dictionary with weights for each component
|
||||
(popularity, intensity, rating, relevance, trend)
|
||||
|
||||
Returns:
|
||||
List of item IDs sorted in descending order by their composite scores
|
||||
"""
|
||||
if self.group_predictions is None:
|
||||
raise ValueError(
|
||||
"User predictions not set. Call set_group_recommender_values first."
|
||||
)
|
||||
if self.top_recommendation is None:
|
||||
raise ValueError(
|
||||
"Top recommendation not set. Call set_group_recommender_values first."
|
||||
)
|
||||
|
||||
# Default weights if not provided
|
||||
if component_weights is None:
|
||||
component_weights = {
|
||||
"popularity": 1.0,
|
||||
"intensity": 1.0,
|
||||
"rating": 1.0,
|
||||
"relevance": 1.0,
|
||||
"trend": 1.0,
|
||||
}
|
||||
|
||||
item_scores = {}
|
||||
item_metric_details = {}
|
||||
popularity_scores = self.calculate_item_popularity_score(all_rated_items, data)
|
||||
|
||||
relevance_mask = self.calculate_relevance_mask(self.top_recommendation)
|
||||
|
||||
for item_id in all_rated_items:
|
||||
# Calculate individual score components
|
||||
|
||||
popularity_score = popularity_scores[item_id]
|
||||
|
||||
intensity_score = self.calculate_item_intensity_score(
|
||||
item_id, group_members, data
|
||||
)
|
||||
rating_score = self.calculate_rating_score(item_id, group_members, data)
|
||||
relevance_score = self.calculate_relevance_score(
|
||||
item_id, data, relevance_mask, group_members
|
||||
)
|
||||
|
||||
trending_score, _, _ = self.calculate_trending_score(
|
||||
group_members,
|
||||
item_id,
|
||||
data,
|
||||
0.3,
|
||||
0.2,
|
||||
9,
|
||||
0.6,
|
||||
)
|
||||
|
||||
composite_score = (
|
||||
component_weights["popularity"] * popularity_score
|
||||
+ component_weights["intensity"] * intensity_score
|
||||
+ component_weights["rating"] * rating_score
|
||||
+ component_weights["relevance"] * relevance_score
|
||||
+ component_weights["trend"] * trending_score
|
||||
)
|
||||
|
||||
item_metric_details[item_id] = {
|
||||
"Popularity": popularity_score,
|
||||
"Intensity": intensity_score,
|
||||
"Rating": rating_score,
|
||||
"Relevance": relevance_score,
|
||||
"Trend": trending_score,
|
||||
"Composite Score": composite_score,
|
||||
}
|
||||
item_scores[item_id] = composite_score
|
||||
|
||||
# Sort items by score in descending order
|
||||
ranked_items = sorted(
|
||||
item_scores.items(), key=operator.itemgetter(1), reverse=True
|
||||
)
|
||||
|
||||
# Return the sorted item IDs and the detailed metrics
|
||||
return [item_id for item_id, _ in ranked_items], item_metric_details
|
||||
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
Some handy functions for pytroch model training ...
|
||||
"""
|
||||
|
||||
import torch
|
||||
from torch.optim import Optimizer
|
||||
|
||||
|
||||
# Checkpoints
|
||||
def save_checkpoint(model, model_dir):
|
||||
torch.save(model.state_dict(), model_dir)
|
||||
|
||||
|
||||
def resume_checkpoint(model, model_dir, device_id):
|
||||
device = f"cuda:{device_id}"
|
||||
state_dict = torch.load(model_dir, map_location=device)
|
||||
model.load_state_dict(state_dict)
|
||||
|
||||
|
||||
# Hyper params
|
||||
def use_cuda(enabled, device_id=0):
|
||||
if enabled:
|
||||
assert torch.cuda.is_available(), "CUDA is not available"
|
||||
torch.cuda.set_device(device_id)
|
||||
|
||||
|
||||
def use_optimizer(
|
||||
optimizer_name: str,
|
||||
network: torch.nn.Module,
|
||||
learning_rate: float,
|
||||
momentum: float = 0,
|
||||
weight_decay: float = 0,
|
||||
alpha: float = 0.99,
|
||||
) -> Optimizer:
|
||||
if optimizer_name == "sgd":
|
||||
optimizer = torch.optim.SGD(
|
||||
network.parameters(),
|
||||
lr=learning_rate,
|
||||
momentum=momentum,
|
||||
weight_decay=weight_decay,
|
||||
)
|
||||
|
||||
elif optimizer_name == "adam":
|
||||
optimizer = torch.optim.Adam(
|
||||
network.parameters(), lr=learning_rate, weight_decay=weight_decay
|
||||
)
|
||||
|
||||
elif optimizer_name == "rmsprop":
|
||||
optimizer = torch.optim.RMSprop(
|
||||
network.parameters(), lr=learning_rate, alpha=alpha, momentum=momentum
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Optimizer '{optimizer_name}' is not supported")
|
||||
|
||||
return optimizer
|
||||
Reference in New Issue
Block a user