public code v1

2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
@@ -0,0 +1,4 @@
+from .recommender import Recommender
+from .group_recommender import GroupRecommender
+
+__all__ = ["Recommender", "GroupRecommender"]
@@ -0,0 +1,72 @@
+import numpy as np
+import pandas as pd
+from tqdm.autonotebook import tqdm
+
+
+class GenericRecommender:
+    def __init__(self, dataset_metadata, model, top_n: int = 10):
+        self.top_n = top_n
+        self.dataset = dataset_metadata.dataset
+        self.model = model
+        self.catalogue = set(self.dataset["itemId"])
+
+    def recommend_all(self):
+        """
+        Get all recommendations.
+        :param top_n:
+        :return: recommendations for any user.
+        """
+
+        ratings = self.dataset.groupby("userId")
+
+        recommendations = pd.DataFrame({"userId": [], "itemId": [], "rank": []})
+
+        with tqdm(
+            total=self.dataset["userId"].nunique(), desc="Recommending for users: "
+        ) as pbar:
+            for user_id, user_ratings in ratings:
+                # Replace .append() with pd.concat() - pandas 2.2.x +
+                recommendations = pd.concat(
+                    [recommendations, self.recommend_user(user_id, user_ratings)],  # type: ignore
+                    ignore_index=True,
+                )
+                pbar.update()
+
+        return recommendations
+
+    def rank_prediction(self, user_id, target_item_id, predictions):
+        # Ensure predictions are flattened if they're 2D
+        if isinstance(predictions, np.ndarray) and predictions.ndim > 1:
+            predictions = predictions.flatten()
+        recommendations = pd.DataFrame(
+            {"userId": user_id, "itemId": target_item_id, "prediction": predictions}
+        )
+
+        recommendations["rank"] = recommendations["prediction"].rank(
+            method="first", ascending=False
+        )
+
+        recommendations.sort_values(["userId", "rank"], inplace=True)
+
+        recommendations = recommendations[recommendations["rank"] <= self.top_n]
+
+        return recommendations[["userId", "itemId", "rank"]]
+
+    def get_unrated(self, user_ratings):
+        """
+        Extract the set of items a user has not rated.
+        :param user_ratings: list, items rated.
+        :return: list, items not rated.
+        """
+        unrated_item_id = self.catalogue - set(user_ratings)
+        unrated_item_id = list(unrated_item_id)
+        return unrated_item_id
+
+    def get_rated(self, user_id):
+        """
+        Extract the set of items a user has not rated.
+        :param user_id: userId rated.
+        :return: list, rated items.
+        """
+        rated = self.dataset[self.dataset["userId"] == user_id]
+        return rated
@@ -0,0 +1,391 @@
+from typing import Dict, List, Union, Optional
+
+import numpy as np
+
+from pygrex.data_reader.data_reader import DataReader
+from pygrex.models.recommender_model import RecommenderModel
+from pygrex.utils.aggregation_strategy import ScoreAggregator, AggregationStrategy
+from pygrex.utils.scale import Scale
+
+
+class GroupRecommender:
+    """
+    A class to represent a group recommender system that follows the workflow:
+    1. Setup and Candidate Selection
+    2. Individual Preference Collection
+    3. Score Aggregation
+    4. Final Recommendation List
+    """
+
+    def __init__(self, data: DataReader):
+        """Initialize the group recommender with data.
+
+        Args:
+            data: The dataset containing user-item interactions.
+        """
+        self.data = data
+        self._group_predictions = None
+        self._members = None
+        self._item_pool = None
+        self._model = None
+        self._aggregation_strategy = None
+        self._score_aggregator = None
+        self._aggregated_scores = None
+        self._top_recommendation = None
+
+    def setup_recommendation(
+        self,
+        model: RecommenderModel,
+        members: List[Union[str, int]],
+        data: DataReader,
+        aggregation_strategy: AggregationStrategy,  # type: ignore
+        most_respected_person: Optional[Union[str, int]] = None,
+    ) -> None:
+        """
+        Setup and Candidate Selection: Initialize the group recommendation process.
+               Args:
+                   model: The recommendation model to use
+                   members: List of user IDs representing the group members
+                   data: DataReader object containing the dataset
+                   aggregation_strategy: Strategy for aggregating individual predictions
+                   most_respected_person: User ID of most respected person (required for MRP strategy)
+        """
+        self._members = members
+        self._model = model
+        self._aggregation_strategy = aggregation_strategy
+
+        # Initialize score aggregator
+        self._score_aggregator = ScoreAggregator(
+            most_respected_person=most_respected_person
+        )
+
+        # get all item IDs from the dataset
+        item_ids = data.dataset["itemId"].unique()
+
+        # Get items that no group member has interacted with
+        self._item_pool = self.get_non_interacted_items_for_recommendation(
+            self.data,
+            item_ids,  # type: ignore
+            members,  # type: ignore
+        )
+        
+        # Filter item_pool to only include IDs that are valid for the model
+        # This prevents out-of-bounds errors when the model was trained with a different
+        # number of items than what's currently in the dataset
+        max_item_id = self._get_max_valid_item_id(model)
+        # Convert to int array and filter out invalid IDs
+        item_pool_int = self._item_pool.astype(int)
+        valid_mask = (item_pool_int >= 0) & (item_pool_int < max_item_id)
+        self._item_pool = item_pool_int[valid_mask]
+
+        # Individual Preference Collection: Generate predictions for each group member
+        self._group_predictions = self._generate_group_predictions()
+
+        # Score Aggregation: Aggregate individual predictions into collective scores
+        self._aggregated_scores = self._aggregate_group_scores()
+
+    def _generate_group_predictions(self) -> Dict[Union[str, int], Dict[int, float]]:
+        """
+        Individual Preference Collection: Generate predictions for all group members.
+
+        Returns:
+            A dictionary with user IDs as keys and their predictions as values
+        """
+        if not self._members or self._model is None or self._item_pool is None:
+            raise ValueError(
+                "You must call setup_recommendation before generating predictions"
+            )
+
+        predictions = {}
+        for member in self._members:
+            user_pred = self.generate_recommendation(
+                self._model,
+                member,
+                self._item_pool,  # type: ignore
+                self.data,  # type: ignore
+            )
+            predictions[member] = user_pred
+
+        return predictions
+
+    def _aggregate_group_scores(self) -> Dict[int, float]:
+        """
+        Score Aggregation: Aggregate individual predictions into collective scores.
+
+        Returns:
+            Dictionary mapping item IDs to aggregated scores
+        """
+        if (
+            self._group_predictions is None
+            or self._score_aggregator is None
+            or self._aggregation_strategy is None
+        ):
+            raise ValueError(
+                "You must call setup_recommendation before aggregating scores"
+            )
+
+        # For Borda Count, we need to create rankings from predictions
+        rankings = None
+        if self._aggregation_strategy == AggregationStrategy.BORDA_COUNT:
+            rankings = self._create_rankings_from_predictions()
+
+        # Use ScoreAggregator to aggregate scores
+        aggregated_scores = self._score_aggregator.aggregate_scores(
+            evaluations=self._group_predictions,  # type: ignore
+            strategy=self._aggregation_strategy,
+            rankings=rankings,  # type: ignore
+        )
+
+        # Sort items by their aggregated scores in descending order
+        sorted_scores = dict(
+            sorted(aggregated_scores.items(), key=lambda x: x[1], reverse=True)
+        )
+
+        return sorted_scores  # type: ignore
+
+    def _create_rankings_from_predictions(self) -> Dict[Union[str, int], List[int]]:
+        """
+        Create rankings from predictions for Borda Count aggregation.
+
+        Returns:
+            Dictionary mapping user IDs to ranked lists of item IDs
+        """
+        if self._group_predictions is None:
+            raise ValueError("Group predictions not available")
+
+        rankings = {}
+        for user_id, predictions in self._group_predictions.items():
+            # Sort items by prediction score in descending order
+            sorted_items = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
+            rankings[user_id] = [item_id for item_id, _ in sorted_items]
+
+        return rankings
+
+    def _get_max_valid_item_id(self, model: RecommenderModel) -> int:
+        """
+        Get the maximum valid item ID for the given model.
+        
+        Args:
+            model: The recommendation model
+            
+        Returns:
+            Maximum valid item ID (exclusive, so valid IDs are [0, max_item_id))
+        """
+        # For implicit models (MFImplicitModel), check item_factors shape
+        if hasattr(model, 'model') and model.model is not None:
+            if hasattr(model.model, 'item_factors'):
+                return model.model.item_factors.shape[0]
+        # Check if model has total_items attribute (set during fit)
+        if hasattr(model, 'total_items') and model.total_items is not None:
+            return model.total_items
+        # Fallback to data.num_item if model shape is not available
+        return self.data.num_item
+    
+    def get_non_interacted_items_for_recommendation(
+        self,
+        data: DataReader,
+        item_ids: List[Union[str, int]],
+        members: List[Union[str, int]],
+    ) -> np.ndarray:
+        """
+        Returns the list of item IDs that none of the specified group members have interacted with.
+
+        This method is typically used in recommendation systems to filter out items that have already
+        been interacted with by any member of the group, ensuring that recommendations focus on new or
+        unseen items.
+
+        Args:
+            data: The original dataset containing user-item interactions.
+            item_ids: A list of all available item IDs to consider.
+            members: A list of user IDs representing the group.
+
+        Returns:
+            np.ndarray: A list of item IDs that have not been interacted with by any member of the group.
+        """
+
+        consecutive_member_ids = [data.get_new_user_id(int(m)) for m in members]
+        consecutive_member_ids = [m for m in consecutive_member_ids if m is not None]
+
+        # Get all unique item IDs interacted with by users in the group
+        interacted_item_ids = data.dataset.loc[
+            data.dataset.userId.isin(consecutive_member_ids), "itemId"
+        ].unique()
+
+        # Use numpy set difference to get non-interacted item IDs
+        item_pool = np.setdiff1d(item_ids, interacted_item_ids, assume_unique=True)
+
+        return item_pool
+
+    def generate_recommendation(
+        self,
+        model: RecommenderModel,
+        member: Union[str, int],
+        item_pool: List[Union[str, int]],
+        data: DataReader,
+    ) -> Dict[int, float]:
+        """
+        Generate recommendations for a user based on the provided model.
+
+        Args:
+            model: A recommendation model that implements the RecommenderModel interface
+            member: The ID of the user
+            item_pool: List of item IDs to predict ratings/scores for
+            data: The dataset containing user-item interactions
+
+        Returns:
+            A dictionary mapping item IDs to predicted ratings/scores
+        """
+        member = int(member)
+        new_member_id = data.get_new_user_id(member)
+
+        if new_member_id is None:
+            return {}  # Return empty predictions for this user
+
+        # Additional safety check: filter item_pool to valid IDs before prediction
+        # This provides a second layer of protection in case filtering was missed earlier
+        max_valid_item_id = self._get_max_valid_item_id(model)
+        if isinstance(item_pool, np.ndarray):
+            item_pool = item_pool.astype(int)
+            item_pool = item_pool[(item_pool >= 0) & (item_pool < max_valid_item_id)]
+        elif isinstance(item_pool, list):
+            item_pool = [int(item) for item in item_pool if 0 <= int(item) < max_valid_item_id]
+        
+        if len(item_pool) == 0:
+            print(f"No valid items found for user {new_member_id}. Returning empty predictions.")
+            return {}  # Return empty predictions if no valid items
+
+        raw_predictions = model.predict(new_member_id, item_pool)  # type: ignore
+        if not isinstance(raw_predictions, (list, np.ndarray)):
+            raise TypeError(
+                f"Model's predict function returned an unexpected type: {type(raw_predictions)}"
+            )
+
+        # raw_predictions = []
+        # # Generate predictions for each item in the pool
+        # for item in item_pool:
+        #     item = int(item)
+        #     raw_predictions.append(model.predict(new_member_id, item))  # type: ignore
+
+        # Ensure raw_predictions is a numpy array
+        raw_predictions = np.array(raw_predictions)
+
+        # # Flatten the predictions if it's a 2D array (single user, multiple items)
+        # if raw_predictions.ndim == 2 and raw_predictions.shape[0] == 1:
+        #     raw_predictions = raw_predictions.flatten()
+
+        # # Check if the length of raw_predictions matches item_pool
+        # if len(raw_predictions) != len(item_pool):
+        #     raise ValueError(
+        #         "Mismatch between predictions and item IDs. Check the model's predict function."
+        #     )
+
+        # Apply scaling to normalize predictions to 1-5 range
+        scaled_linear = Scale.linear(
+            np.array(raw_predictions),
+            target_min=1,
+            target_max=5,
+        )
+        # Convert the scaled predictions into a dictionary with original item IDs as keys
+        predictions = {}
+        for item, scaled_pred in zip(item_pool, scaled_linear):
+            # Ensure item_id is treated as an integer
+            item_original_id = data.get_original_item_id(int(item))
+            if item_original_id is not None:
+                predictions[int(item_original_id)] = scaled_pred  # type: ignore
+
+        # Sort the predictions in descending order of scores
+        sorted_predictions = dict(
+            sorted(predictions.items(), key=lambda item: item[1], reverse=True)
+        )
+
+        return sorted_predictions
+
+    def get_group_recommendations(
+        self, top_k: Optional[int] = None
+    ) -> Union[int, List[int]]:
+        """
+        Final Recommendation List: Get recommendations for the group based on aggregated scores.
+
+        Args:
+            top_k: The number of recommendations to return.
+                  If None, returns all recommendations sorted by score.
+                  If 1, returns only the top recommendation as a single item ID.
+                  If > 1, returns the top k recommendations as a list of item IDs.
+
+        Returns:
+            If top_k is 1, a single item ID. Otherwise, a list of item IDs.
+        """
+        if self._aggregated_scores is None:
+            raise ValueError(
+                "You must call setup_recommendation before getting recommendations"
+            )
+
+        sorted_items = list(self._aggregated_scores.items())
+
+        # Return results based on top_k parameter
+        if top_k is None:
+            # Return all items as a list of item IDs
+            return [item_id for item_id, _ in sorted_items]
+        elif top_k == 1:
+            # Return only the top item ID
+            if sorted_items:
+                return sorted_items[0][0]
+            return None  # type: ignore
+        else:
+            # Return top k item IDs
+            return [
+                item_id for item_id, _ in sorted_items[: min(top_k, len(sorted_items))]
+            ]
+
+    def get_top_recommendation(self) -> int:
+        """
+        Get the top recommendation for the group.
+
+        Returns:
+            The item ID with the highest aggregated score across all group members.
+        """
+        if self._top_recommendation is None:
+            self._top_recommendation = self.get_group_recommendations(top_k=1)
+        return self._top_recommendation  # type: ignore
+
+    def get_recommendation_scores(self) -> Dict[int, float]:
+        """
+        Get the aggregated scores for all items across the group.
+
+        Returns:
+            A dictionary with item IDs as keys and their aggregated scores as values.
+        """
+        if self._aggregated_scores is None:
+            raise ValueError(
+                "You must call setup_recommendation before getting recommendation scores"
+            )
+        return self._aggregated_scores.copy()
+
+    def get_aggregation_strategy(self) -> Optional[AggregationStrategy]:
+        """
+        Get the current aggregation strategy.
+
+        Returns:
+            The aggregation strategy being used, or None if not set.
+        """
+        return self._aggregation_strategy
+
+    def get_group_members(self) -> Optional[List[Union[str, int]]]:
+        """
+        Get the current group members.
+
+        Returns:
+            List of group member IDs, or None if not set.
+        """
+        return self._members.copy() if self._members else None
+
+    def get_individual_predictions(
+        self,
+    ) -> Optional[Dict[Union[str, int], Dict[int, float]]]:
+        """
+        Get the individual predictions for all group members.
+
+        Returns:
+            Dictionary mapping user IDs to their individual predictions, or None if not available.
+        """
+        return self._group_predictions.copy() if self._group_predictions else None
@@ -0,0 +1,57 @@
+import pandas as pd
+from typing import Optional
+
+from .generic_recommender import GenericRecommender
+
+
+class Recommender(GenericRecommender):
+    def __init__(self, dataset_metadata, model, top_n: int = 10):
+        super(Recommender, self).__init__(dataset_metadata, model, top_n)
+
+    def get_predictions(
+        self,
+        user_id: int,
+        target_item_id: list,
+    ):
+        predictions = self.model.predict(user_id, target_item_id)
+        return predictions
+
+    def recommend(self, user_id: int, target_item_id: list):
+        """
+        Generate recommendations on specific itemId and userId
+        :param user_id: list, user Ids
+        :param target_item_id: list, item Ids
+        :param rated_items: list, of rated interactions.
+        :return: data.frame [userId, itemId, rank], recommendations ranking for the specified pairs of userId and itemId.
+        """
+        predictions = self.get_predictions(user_id, target_item_id)
+
+        return self.rank_prediction(user_id, target_item_id, predictions)
+
+    def recommend_user(
+        self, user_id: Optional[int] = None, user_ratings: Optional[pd.DataFrame] = None
+    ):
+        """
+        Get recommendations for a user.
+        :param user_id: int, a user Id
+        :param user_ratings: list, interactions on the user
+        :return: dataframe [userId, itemId, rank], recommendations ranking for the specified userId.
+        """
+        if user_ratings is None:
+            if user_id is None:
+                raise ValueError("Either 'user_id' or 'user_ratings' must be provided.")
+            user_ratings = self.get_rated(user_id=user_id)
+
+        if user_ratings is None:
+            return pd.DataFrame(
+                columns=["userId", "itemId", "rank"]
+            )  # Return empty recommendations
+
+        if user_id is None:
+            raise ValueError(
+                "Could not determine user_id from the provided user_ratings."
+            )
+
+        unrated_item_id = self.get_unrated(user_ratings["itemId"])
+
+        return self.recommend(user_id=user_id, target_item_id=unrated_item_id)