# Repository file view: py-grex/pygrex/utils/sliding_window_ranker.py
# Timestamp: 2026-05-22 10:02:10 +02:00
# Size: 632 lines, 24 KiB (Python)

import operator
from typing import Any, Dict, List, Union, Optional
import numpy as np
import pandas as pd
from scipy.signal import (
find_peaks,
peak_widths,
)
from pygrex.data_reader import DataReader
class SlidingWindowRanker:
    """
    Rank the items rated by a group by combining several per-item signals.

    Every candidate item gets five component scores (popularity, intensity,
    rating, relevance and trend) which ``generate_ranked_items`` combines into
    one weighted composite score used for sorting.

    Reference:
    Stratigi, M., Bikakis, N., Stefanidis, K.: Counterfactual explanations for group
    recommendations. In: Proceedings of the 27th International Workshop on Design,
    Optimization, Languages and Analytical Processing of Big Data (DOLAP 2025)
    """

    # Weight given to each score component when the caller does not override it.
    _DEFAULT_COMPONENT_WEIGHTS = {
        "popularity": 1.0,
        "intensity": 1.0,
        "rating": 1.0,
        "relevance": 1.0,
        "trend": 1.0,
    }

    # Peak-detection settings that generate_ranked_items passes to
    # calculate_trending_score (stricter than that method's own defaults).
    _RANKING_PEAK_PARAMS = {
        "peak_norm_min_height": 0.3,
        "peak_norm_min_prominence": 0.2,
        "peak_min_distance": 9,
        "peak_width_rel_height": 0.6,
    }

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the SlidingWindowRanker.

        Args:
            config: Configuration parameters; stored on the instance as-is.
        """
        self.config = config
        # Both values stay None until set_group_recommender_values is called.
        self.group_predictions: Optional[
            Dict[Union[str, int], Dict[Union[str, int], float]]
        ] = None
        self.top_recommendation: Optional[Union[str, int]] = None

    def set_group_recommender_values(
        self,
        group_predictions: Dict[Union[str, int], Dict[Union[str, int], float]],
        top_recommendation: Union[str, int],
    ) -> None:
        """
        Store the group recommender output used by the scoring methods.

        Args:
            group_predictions: Dictionary mapping user IDs to their item predictions
            top_recommendation: ID of the item recommended to the group
        """
        self.group_predictions = group_predictions
        self.top_recommendation = top_recommendation

    def evaluate(self, data: "DataReader") -> Dict[str, Any]:
        """
        Placeholder for an evaluation step; currently returns an empty dict.

        Args:
            data: DataReader object containing dataset and transformation methods

        Returns:
            Dictionary with evaluation metrics (always empty for now)
        """
        # Implementation would go here
        return {}

    @staticmethod
    def _normalize_with_padding(
        value: float, min_value: float, max_value: float
    ) -> float:
        """Scale value against [min_value, max_value] widened by 1% on each side.

        Returns 0.0 when the range is empty, which avoids a division by zero.
        """
        range_value = max_value - min_value
        padded_range = range_value + (range_value / 50)  # 1% on each end
        if padded_range == 0:
            return 0.0
        padded_min = min_value - (range_value / 100)
        return float((value - padded_min) / padded_range)

    def calculate_item_popularity_score(
        self, items: List[Union[str, int]], data: "DataReader"
    ) -> Dict[Union[str, int], float]:
        """
        Calculate the normalized popularity of each item from its interaction count.

        Args:
            items: List of item IDs
            data: Data object containing the dataset and transformation methods

        Returns:
            Dictionary with item IDs as keys and normalized popularity as values.
            Counts are scaled against the min/max count among ``items`` with 1%
            padding; every item scores 0.0 when all counts are equal.
        """
        # One pass over the dataset instead of one filtered scan per item.
        interactions_per_item = data.dataset["itemId"].value_counts().to_dict()
        popularity_counts = {
            item_id: interactions_per_item.get(data.get_new_item_id(item_id), 0)
            for item_id in items
        }
        if not popularity_counts:
            return {}

        min_count = min(popularity_counts.values())
        max_count = max(popularity_counts.values())
        return {
            item_id: self._normalize_with_padding(count, min_count, max_count)
            for item_id, count in popularity_counts.items()
        }

    def calculate_relevance_mask(
        self,
        target_item_id: Union[str, int],
    ) -> Dict[Union[str, int], float]:
        """
        Map each user to their prediction score for a specific target item.

        Args:
            target_item_id: The ID of the item for which prediction scores are needed

        Returns:
            Dictionary mapping user IDs to their predicted scores for the target item.
            Users without a prediction for the target item get a value of 0.

        Raises:
            ValueError: If set_group_recommender_values has not been called.

        Examples:
            >>> user_preds = {'user1': {'item1': 4.5, 'item2': 3.2}, 'user2': {'item2': 2.8}}
            >>> evaluator.set_group_recommender_values(user_preds, 'item1')
            >>> evaluator.calculate_relevance_mask('item1')
            {'user1': 4.5, 'user2': 0}
        """
        if self.group_predictions is None:
            raise ValueError(
                "User predictions not set. Call set_group_recommender_values first."
            )
        return {
            user_id: predictions.get(target_item_id, 0)
            for user_id, predictions in self.group_predictions.items()
        }

    def calculate_relevance_score(
        self,
        item_id: Union[str, int],
        data: "DataReader",
        prediction_scores: Dict[Union[str, int], float],
        members: List[Union[str, int]],
        rating_scale: tuple = (0, 5),  # Default rating scale
    ) -> float:
        """
        Calculate the normalized average prediction score of the members who
        interacted with an item.

        Args:
            item_id: ID of the item to calculate relevance for
            data: DataReader object containing dataset and ID mapping methods
            prediction_scores: Dictionary mapping user IDs to a prediction score
                (generate_ranked_items passes the scores for the top recommendation)
            members: List of user IDs in the group
            rating_scale: Tuple indicating (min_rating, max_rating) for normalization

        Returns:
            Average score normalized against ``rating_scale`` with 1% padding.
            Returns 0.0 if no member both interacted with the item and has a
            prediction score, or if the rating scale is empty.

        Notes:
            Only integer member IDs are converted with ``data.get_new_user_id``;
            any other ID is compared against the dataset unchanged.
        """
        internal_item_id = data.get_new_item_id(item_id)
        dataset = data.dataset
        # The item filter does not depend on the member, so apply it once.
        users_who_rated_item = dataset.loc[
            dataset["itemId"] == internal_item_id, "userId"
        ]

        total_score = 0.0
        valid_users_count = 0
        for user_id in members:
            internal_user_id = (
                data.get_new_user_id(int(user_id))
                if isinstance(user_id, (int, np.integer))
                else user_id
            )
            # Skip members who never interacted with the item.
            if not (users_who_rated_item == internal_user_id).any():
                continue
            if user_id in prediction_scores:
                total_score += prediction_scores[user_id]
                valid_users_count += 1

        if valid_users_count == 0:
            return 0.0

        min_value, max_value = rating_scale
        return self._normalize_with_padding(
            total_score / valid_users_count, min_value, max_value
        )

    def calculate_item_intensity_score(
        self,
        item_id: Union[str, int],
        members: List[Union[str, int]],
        data: "DataReader",
    ) -> float:
        """
        Calculate what proportion of group members have interacted with an item.

        Args:
            item_id: ID of the item to calculate interaction rate for
            members: List of user IDs in the group
            data: DataReader object containing dataset and ID mapping methods

        Returns:
            Proportion of group members who have interacted with the item:
            0 means no member interacted with it, 1 means all of them did.
            Returns 0.0 when ``data`` is None or ``members`` is empty.
        """
        if data is None:
            print("Error: DataReader object is None. Cannot convert item_id.")
            # Must be a plain float: callers multiply this value by a weight.
            return 0.0
        if not members:
            return 0.0  # Avoid division by zero if no members

        internal_item_id = data.get_new_item_id(item_id)
        internal_members = [data.get_new_user_id(user_id) for user_id in members]

        dataset = data.dataset
        member_interactions = dataset[
            (dataset["itemId"] == internal_item_id)
            & dataset["userId"].isin(internal_members)
        ]
        # Count distinct users, not rows: a member with several interactions
        # must not push the proportion above 1.
        return float(member_interactions["userId"].nunique() / len(members))

    def calculate_rating_score(
        self,
        item_id: Union[str, int],
        members: List[Union[str, int]],
        data: "DataReader",
        rating_scale: tuple = (0, 5),
    ) -> float:
        """
        Calculate the normalized average rating given to an item by group members.

        Args:
            item_id: ID of the item to calculate average rating for
            members: List of user IDs in the group
            data: DataReader object containing dataset and ID mapping methods
            rating_scale: Tuple indicating (min_rating, max_rating) for normalization

        Returns:
            Average rating normalized against ``rating_scale`` with 1% padding.
            Returns 0.0 when ``data`` is None, ``members`` is empty or the
            rating scale is empty.

        Notes:
            The sum of ratings is divided by the full group size, so members
            who have not rated the item pull the average down.
        """
        if data is None:
            print("Error: DataReader object is None. Cannot convert item_id.")
            return 0.0
        if not members:
            return 0.0  # Avoid division by zero if no members

        internal_item_id = data.get_new_item_id(item_id)
        internal_members = [data.get_new_user_id(user_id) for user_id in members]

        dataset = data.dataset
        member_ratings = dataset[
            (dataset["itemId"] == internal_item_id)
            & dataset["userId"].isin(internal_members)
        ]
        average_rating = member_ratings["rating"].sum() / len(members)

        min_value, max_value = rating_scale
        return self._normalize_with_padding(average_rating, min_value, max_value)

    def calculate_trending_score(
        self,
        members: List[Union[str, int]],
        item_id: Union[str, int],
        data: Optional["DataReader"] = None,
        peak_norm_min_height: float = 0.1,
        peak_norm_min_prominence: float = 0.05,
        peak_min_distance: int = 3,
        peak_width_rel_height: float = 0.5,
    ) -> tuple[float, Dict[Union[str, int], float], pd.DataFrame]:
        """
        Calculate how often group members rated an item during one of its hype periods.

        Monthly rating counts of the item are min-max normalized, peaks are
        detected on that series, and each peak's width defines a hype period.
        A member's score is the fraction of their rating events for the item
        that fall inside any hype period.

        Args:
            members: List of user IDs in the group
            item_id: ID of the item to calculate trending score for
            data: DataReader object containing dataset and ID mapping methods
            peak_norm_min_height: Minimum height of peaks in normalized data
            peak_norm_min_prominence: Minimum prominence of peaks in normalized data
            peak_min_distance: Minimum distance between peaks, counted in
                entries of the monthly series (months without ratings are absent)
            peak_width_rel_height: Relative height for peak width calculation

        Returns:
            tuple: (average_trending_score, individual_scores, hype_periods_for_item)
                average_trending_score: Mean of the members' scores (0-1);
                    members without ratings for the item count as 0.0
                individual_scores: Dictionary mapping user IDs to their scores
                hype_periods_for_item: DataFrame of detected hype periods, empty
                    when none were found. Its "itemId" column holds ``item_id``
                    as passed in.
            On invalid input (no members, unusable data, missing columns or a
            timestamp conversion error) the result is (0.0, {}, empty DataFrame).
        """
        if not members:
            print("Error: No group members provided for trending score calculation.")
            return 0.0, {}, pd.DataFrame()

        if not isinstance(data, DataReader):
            if data is not None:
                print(
                    f"Warning: data was provided but is not a DataReader object (type: {type(data)})."
                )
            print(
                "Error: The DataFrame (_df) is empty. Cannot calculate score or plot."
            )
            return 0.0, {}, pd.DataFrame()

        dataset = data.dataset
        if dataset.empty:
            print(
                "Error: The DataFrame (_df) is empty. Cannot calculate score or plot."
            )
            return 0.0, {}, pd.DataFrame()

        required_columns = ["userId", "itemId", "rating", "timestamp"]
        missing_columns = [
            col for col in required_columns if col not in dataset.columns
        ]
        if missing_columns:
            print(
                f"Error: Missing required columns in DataFrame: {', '.join(missing_columns)}"
            )
            return 0.0, {}, pd.DataFrame()

        zero_scores = {user_id: 0.0 for user_id in members}
        internal_item_id = data.get_new_item_id(item_id)
        internal_members = [data.get_new_user_id(user_id) for user_id in members]

        # Filter before copying so only this item's rows are duplicated and
        # converted; this method runs once per candidate item.
        item_df = dataset[dataset["itemId"] == internal_item_id].copy()
        if item_df.empty:
            return 0.0, zero_scores, pd.DataFrame()

        try:
            if (
                "timestamp_dt" not in item_df.columns
                or item_df["timestamp_dt"].isnull().all()
            ):
                item_df["timestamp_dt"] = pd.to_datetime(
                    item_df["timestamp"], unit="s"
                )
            if (
                "year_month" not in item_df.columns
                or item_df["year_month"].isnull().all()
            ):
                item_df["year_month"] = item_df["timestamp_dt"].dt.to_period("M")
        except Exception as e:
            # Best effort: a malformed timestamp column yields a zero score
            # instead of aborting the ranking.
            print(f"Error during timestamp conversion or year-month extraction: {e}")
            return 0.0, {}, pd.DataFrame()

        # Number of ratings the item received in each month that has any.
        monthly_counts = (
            item_df.groupby("year_month", observed=False)
            .size()
            .reset_index(name="rating_count")
            .sort_values("year_month")
            .reset_index(drop=True)
        )
        if monthly_counts.empty:
            return 0.0, zero_scores, pd.DataFrame()

        original_ratings = monthly_counts["rating_count"].to_numpy()
        min_rating = np.min(original_ratings)
        max_rating = np.max(original_ratings)
        if max_rating > min_rating:
            normalized_ratings = (original_ratings - min_rating) / (
                max_rating - min_rating
            )
        else:
            # A flat series cannot be scaled; zeros keep it peak-free.
            normalized_ratings = np.zeros_like(original_ratings, dtype=float)

        peaks_indices, _ = find_peaks(
            normalized_ratings,
            height=peak_norm_min_height,
            distance=peak_min_distance,
            prominence=peak_norm_min_prominence,
        )
        if len(peaks_indices) == 0:
            return 0.0, zero_scores, pd.DataFrame()

        _, _, left_ips, right_ips = peak_widths(
            normalized_ratings, peaks_indices, rel_height=peak_width_rel_height
        )
        months = monthly_counts["year_month"]
        last_idx = len(monthly_counts) - 1
        hype_periods_list = []
        for peak_idx, left_ip, right_ip in zip(peaks_indices, left_ips, right_ips):
            # Interpolated width bounds are fractional; snap them to valid rows.
            start_idx = max(0, int(round(left_ip)))
            end_idx = min(last_idx, int(round(right_ip)))
            if start_idx > end_idx:
                continue
            hype_periods_list.append(
                {
                    "itemId": item_id,
                    "hype_start_month": months.iloc[start_idx],
                    "hype_end_month": months.iloc[end_idx],
                    "peak_month": months.iloc[peak_idx],
                    "peak_rating_count_original": original_ratings[peak_idx],
                    "peak_rating_count_normalized": normalized_ratings[peak_idx],
                }
            )
        if not hype_periods_list:
            return 0.0, zero_scores, pd.DataFrame()
        hype_periods_for_item = pd.DataFrame(hype_periods_list)

        # Flag each rating of the item that falls inside any hype period.
        # Comparing months directly avoids joining on "itemId", whose values in
        # the hype table (external ID) need not equal the dataset's internal ID.
        rating_months = item_df["year_month"]
        in_hype = np.zeros(len(item_df), dtype=bool)
        for period in hype_periods_list:
            in_hype |= (
                (rating_months >= period["hype_start_month"])
                & (rating_months <= period["hype_end_month"])
            ).to_numpy()
        item_df["_in_hype_period"] = in_hype

        individual_scores = {}
        for member_id, internal_user_id in zip(members, internal_members):
            user_ratings = item_df[item_df["userId"] == internal_user_id]
            if user_ratings.empty:
                individual_scores[member_id] = 0.0
                continue
            # One rating event per distinct timestamp for this user and item.
            trending_by_event = user_ratings.groupby("timestamp_dt")[
                "_in_hype_period"
            ].any()
            total_events = len(trending_by_event)
            individual_scores[member_id] = (
                float(trending_by_event.sum() / total_events) if total_events else 0.0
            )

        # Members without ratings for the item contribute 0.0 to the average.
        average_trending_score = sum(
            individual_scores[user_id] for user_id in members
        ) / len(members)
        return average_trending_score, individual_scores, hype_periods_for_item

    def generate_ranked_items(
        self,
        all_rated_items: List[Union[str, int]],
        data: "DataReader",
        group_members: List[Union[str, int]],
        component_weights: Optional[Dict[str, float]] = None,
    ) -> tuple[List[Union[str, int]], Dict]:
        """
        Rank items based on multiple scoring factors for a group of users.

        Calculates a composite score for each item as the weighted sum of:
            - Item popularity
            - Group preference intensity
            - Group rating
            - Relevance (members' predicted scores for the top recommendation)
            - Trend

        Args:
            all_rated_items: List of candidate item IDs to rank
            data: The DataReader object containing user-item interactions
            group_members: List of user identifiers in the group
            component_weights: Optional dictionary with weights for each component
                (popularity, intensity, rating, relevance, trend). Components
                that are not listed keep the default weight of 1.0.

        Returns:
            Tuple of (ranked_item_ids, item_metric_details): the item IDs sorted
            in descending order of composite score, and a dictionary mapping each
            item ID to its component scores and composite score.

        Raises:
            ValueError: If set_group_recommender_values has not been called.
        """
        if self.group_predictions is None:
            raise ValueError(
                "User predictions not set. Call set_group_recommender_values first."
            )
        if self.top_recommendation is None:
            raise ValueError(
                "Top recommendation not set. Call set_group_recommender_values first."
            )

        # Fill in defaults so a partial weights dict does not raise KeyError.
        weights = {**self._DEFAULT_COMPONENT_WEIGHTS, **(component_weights or {})}

        item_scores = {}
        item_metric_details = {}
        popularity_scores = self.calculate_item_popularity_score(all_rated_items, data)
        relevance_mask = self.calculate_relevance_mask(self.top_recommendation)

        for item_id in all_rated_items:
            popularity_score = popularity_scores[item_id]
            intensity_score = self.calculate_item_intensity_score(
                item_id, group_members, data
            )
            rating_score = self.calculate_rating_score(item_id, group_members, data)
            relevance_score = self.calculate_relevance_score(
                item_id, data, relevance_mask, group_members
            )
            trending_score, _, _ = self.calculate_trending_score(
                group_members, item_id, data, **self._RANKING_PEAK_PARAMS
            )

            composite_score = (
                weights["popularity"] * popularity_score
                + weights["intensity"] * intensity_score
                + weights["rating"] * rating_score
                + weights["relevance"] * relevance_score
                + weights["trend"] * trending_score
            )
            item_metric_details[item_id] = {
                "Popularity": popularity_score,
                "Intensity": intensity_score,
                "Rating": rating_score,
                "Relevance": relevance_score,
                "Trend": trending_score,
                "Composite Score": composite_score,
            }
            item_scores[item_id] = composite_score

        # Sort items by score in descending order
        ranked_items = sorted(
            item_scores.items(), key=operator.itemgetter(1), reverse=True
        )
        return [item_id for item_id, _ in ranked_items], item_metric_details