public code v1

This commit is contained in:
2026-05-22 10:02:10 +02:00
commit 46a9ecf065
166 changed files with 6982454 additions and 0 deletions
+162
View File
@@ -0,0 +1,162 @@
import streamlit as st
import pandas as pd
import os
from io import StringIO
# Library Imports
from pygrex.data_reader import DataReader, GroupInteractionHandler
# Page Configuration
st.set_page_config(
    page_title="Data Preparation",
    page_icon="📄",
    layout="wide",
)
st.title("📄 Data Preparation")

# Default File Paths: the bundled dataset used when nothing is uploaded.
DEFAULT_RATINGS_PATH = "datasets/stratigis/ratings.csv"
DEFAULT_GROUPS_PATH = "datasets/stratigis/groupsWithHighRatings5.txt"

# Session State Initialization: seed the shared keys once per browser session.
# "data_loaded" doubles as the marker that seeding has already happened.
if "data_loaded" not in st.session_state:
    for _state_key, _initial_value in (
        ("data_loaded", False),
        ("data_reader", None),
        ("group_handler", None),
        ("num_groups", 0),
    ):
        st.session_state[_state_key] = _initial_value
# DATA INPUT SECTION
# Collects the three user inputs (ratings source, group source, preprocessing
# options) that the "Load and Process" step consumes.

# Ratings Input
st.header("1. Ratings Data")
st.markdown(
    "You can upload your own ratings file or use the default **MovieLens 100k** dataset."
)
ratings_file_buffer = st.file_uploader(
    "Upload Your Ratings Data (Optional)", type=["csv"]
)

# Group Input
st.header("2. Group Data")
group_input_method = st.radio(
    "Choose group input method:",
    ("Enter groups manually", "Upload a group file"),
    horizontal=True,
)

# Pre-fill the text area with the bundled groups only while the bundled
# ratings are in use (no ratings upload).
default_group_text = ""
if ratings_file_buffer is None and os.path.exists(DEFAULT_GROUPS_PATH):
    # Explicit encoding keeps the read independent of the platform locale.
    with open(DEFAULT_GROUPS_PATH, "r", encoding="utf-8") as f:
        default_group_text = f.read()

# Bind both group inputs up front so each name exists whichever method is
# selected; only the one matching `group_input_method` is filled in below.
group_text_input = ""
groups_file_buffer = None
if group_input_method == "Enter groups manually":
    group_text_input = st.text_area(
        "Enter group members (one group per line, members separated by '_')",
        value=default_group_text,
        height=150,
    )
else:
    groups_file_buffer = st.file_uploader(
        "Upload Your Group Data (Optional)", type=["txt"]
    )

# Preprocessing Options
st.header("3. Preprocessing")
binarize_data = st.checkbox(
    "Binarize ratings (for implicit feedback models)", value=True
)
# Always bound; only read when `binarize_data` is ticked. The placeholder
# matches the widget's own default.
binary_threshold = 1.0
if binarize_data:
    binary_threshold = st.number_input(
        "Rating threshold for binarization", min_value=0.0, value=1.0, step=0.5
    )
# Main Loading Logic
st.header("4. Load and Process")

# Columns every ratings source must provide, in the order handed to DataReader.
RATING_COLUMNS = ["userId", "itemId", "rating", "timestamp"]


def _read_uploaded_ratings(uploaded_file):
    """Parse an uploaded ratings CSV into a frame holding exactly RATING_COLUMNS.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``; only its
            ``getvalue()`` bytes are read.

    Returns:
        A DataFrame with the columns of ``RATING_COLUMNS``, in that order.
        Any other column in the file is ignored.

    Raises:
        ValueError: If one or more required columns are absent from the file.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order mark.
    csv_text = uploaded_file.getvalue().decode("utf-8-sig")
    frame = pd.read_csv(
        StringIO(csv_text),
        sep=",",
        usecols=lambda column: column in RATING_COLUMNS,
    )
    # Report missing columns by name instead of letting the column selection
    # below fail with a bare KeyError.
    missing = [name for name in RATING_COLUMNS if name not in frame.columns]
    if missing:
        raise ValueError(
            "Uploaded ratings file is missing required column(s): "
            + ", ".join(missing)
        )
    return frame[RATING_COLUMNS]


if st.button("Load and Process Data", type="primary"):
    with st.spinner("Processing data..."):
        try:
            # Determine which ratings file to use
            if ratings_file_buffer is not None:
                ratings_df = _read_uploaded_ratings(ratings_file_buffer)
            else:
                if not os.path.exists(DEFAULT_RATINGS_PATH):
                    st.error(
                        f"Default ratings file not found at: `{DEFAULT_RATINGS_PATH}`"
                    )
                    st.stop()
                # The bundled file's own header row is skipped and replaced by
                # the canonical column names.
                ratings_df = pd.read_csv(
                    DEFAULT_RATINGS_PATH,
                    sep=",",
                    names=RATING_COLUMNS,
                    skiprows=1,
                )
                ratings_df = ratings_df[RATING_COLUMNS]

            # Determine which group data to use and prepare it for the handler
            # NOTE(review): this path is fixed, so concurrent sessions would
            # overwrite each other's groups - confirm single-user deployment.
            temp_dir = "temp/group_data"
            os.makedirs(temp_dir, exist_ok=True)
            groups_filepath = os.path.join(temp_dir, "current_groups.txt")

            if group_input_method == "Enter groups manually":
                with open(groups_filepath, "w", encoding="utf-8") as f:
                    f.write(group_text_input)  # type: ignore
                st.session_state.group_filename = os.path.basename(groups_filepath)
            else:  # File upload method
                if groups_file_buffer:  # type: ignore
                    with open(groups_filepath, "wb") as f:
                        f.write(groups_file_buffer.getbuffer())
                    # NOTE(review): the upload is saved as current_groups.txt
                    # but the *uploaded* name is what gets passed to
                    # read_groups() - verify the handler does not resolve
                    # `filename` on disk.
                    st.session_state.group_filename = groups_file_buffer.name
                else:  # Fallback to default if no file is uploaded
                    if not os.path.exists(DEFAULT_GROUPS_PATH):
                        st.error(
                            f"Default groups file not found at: `{DEFAULT_GROUPS_PATH}`"
                        )
                        st.stop()
                    groups_filepath = DEFAULT_GROUPS_PATH
                    st.session_state.group_filename = os.path.basename(groups_filepath)

            # Instantiate library classes and process data
            data_reader = DataReader(dataframe=ratings_df)
            group_handler = GroupInteractionHandler(filepath_or_buffer=groups_filepath)
            if binarize_data:
                data_reader.binarize(binary_threshold=binary_threshold)  # type: ignore
            data_reader.make_consecutive_ids_in_dataset()
            available_groups = group_handler.read_groups(
                filename=st.session_state.group_filename
            )

            # Store results in session state for the other pages
            st.session_state.data_reader = data_reader
            st.session_state.group_handler = group_handler
            st.session_state.num_groups = len(available_groups)
            st.session_state.data_loaded = True
            st.success("✅ Data loaded and processed successfully!")
        except Exception as e:
            # Page-level boundary: surface the problem and mark the data as
            # unavailable so downstream pages refuse to run.
            st.error(f"An error occurred: {e}")
            st.session_state.data_loaded = False
# Enhanced Data Summary
# Shown on every rerun once a load has succeeded during this session.
if st.session_state.data_loaded:
    st.markdown("")
    st.header("Data Summary")

    reader = st.session_state.data_reader
    left_column, right_column = st.columns(2)

    # Left: size of the user and item vocabularies.
    left_column.metric("👥 Unique Users", format(reader.num_user, ","))  # type: ignore
    left_column.metric("📦 Unique Items", format(reader.num_item, ","))  # type: ignore

    # Right: interaction volume and number of groups read.
    rating_count = len(reader.get_raw_dataset())  # type: ignore
    right_column.metric("⭐ Total Ratings", format(rating_count, ","))
    right_column.metric(
        "👨‍👩‍👧‍👦 Number of Groups", format(st.session_state.num_groups, ",")
    )

    preview_panel = st.expander("Processed Ratings DataFrame Head:", expanded=True)
    with preview_panel:
        st.dataframe(reader.dataset.head(), hide_index=True)  # type: ignore
+956
View File
@@ -0,0 +1,956 @@
import streamlit as st
import time
# Library Imports
from pygrex.models import (
ALS,
BPR,
ExplAutoencoderTorch,
EMFModel,
GMFModel,
MLPModel,
SVD,
KNNBasic,
)
from pygrex.evaluator import (
run_leave_one_out_evaluation,
run_evaluation_with_proper_split,
)
st.set_page_config(
    page_title="Model Training",
    page_icon="🧠",
    layout="wide",
)
st.title("🧠 Model Selection & Training")

# Check if data is loaded; nothing below makes sense without it.
_data_is_ready = st.session_state.get("data_loaded", False)
if not _data_is_ready:
    st.warning("⚠️ Please load data on the **📄 Data Preparation** page first.")
    st.stop()  # Stop execution if no data is loaded

# Model Selection
st.header("1. Select a Model")
# As you add more models to your library, you can add them to this tuple.
_AVAILABLE_MODELS = ("ALS", "BPR", "Autoencoder", "EMF", "GMF", "MLP", "KNN", "SVD")
model_option = st.selectbox("Choose a recommendation model:", _AVAILABLE_MODELS)
# Hyperparameter Configuration
# Each model's form is declared as data (rows of widget specs) and drawn by a
# single renderer, so adding a model means adding one entry to _MODEL_FORMS.
st.header("2. Configure Hyperparameters")

# Defaults of the SVD form.
N_FACTORS = 64
N_EPOCHS = 30
LEARNING_RATE = 0.005
REGULARIZATION = 0.08
RANDOM_STATE = 42

_OPTIMIZERS = ["adam", "sgd", "rmsprop"]
_LOSS_STEP_HELP = (
    "The step size at each iteration while moving toward a minimum of the loss function."
)


def _number(param, label, low, high, default, step, help_text, fmt=None):
    """Describe a ``st.number_input`` whose value is stored under ``param``.

    ``low``/``high``/``default``/``step`` must all be ints or all be floats.
    ``fmt`` is the printf-style display format; ``None`` keeps the widget's
    default.
    """
    options = {
        "min_value": low,
        "max_value": high,
        "value": default,
        "step": step,
        "format": fmt,
        "help": help_text,
    }
    return (param, st.number_input, label, options)


def _toggle(param, label, default, help_text):
    """Describe a ``st.checkbox`` whose value is stored under ``param``."""
    return (param, st.checkbox, label, {"value": default, "help": help_text})


def _choice(param, label, options, index, help_text):
    """Describe a ``st.selectbox`` whose value is stored under ``param``."""
    widget_options = {"options": options, "index": index, "help": help_text}
    return (param, st.selectbox, label, widget_options)


def _render_hyperparameters(rows):
    """Draw the widget specs and collect what the user entered.

    Args:
        rows: A list of rows; each row holds up to three specs produced by
            ``_number``, ``_toggle`` or ``_choice``. Every row is laid out in
            its own three-column grid, and unused columns stay empty.

    Returns:
        A dict mapping each spec's parameter name to its current widget value.
    """
    values = {}
    for row in rows:
        for column, (param, widget, label, options) in zip(st.columns(3), row):
            with column:
                values[param] = widget(label, **options)
    return values


# model name -> (sub-header, rows of widget specs)
_MODEL_FORMS = {
    "ALS": (
        "ALS (Alternating Least Squares) Parameters",
        [
            [
                _number("latent_dim", "Latent Dimensions (factors)", 1, 500, 100, 10,
                        "The number of latent factors to compute."),
                _number("reg_term", "Regularization Term", 0.001, 1.0, 0.001, 0.001,
                        "The regularization factor.", fmt="%.3f"),
                _number("epochs", "Epochs (iterations)", 1, 200, 10, 5,
                        "The number of ALS iterations."),
            ],
        ],
    ),
    "BPR": (
        "BPR (Bayesian Personalised Ranking) Parameters",
        [
            [
                _number("latent_dim", "Latent Dimensions (factors)", 1, 500, 100, 10,
                        "The number of latent factors to compute."),
                _number("reg_term", "Regularization Term", 0.001, 1.0, 0.001, 0.001,
                        "The regularization factor.", fmt="%.3f"),
                _number("epochs", "Epochs (iterations)", 1, 200, 10, 5,
                        "The number of BPR iterations."),
            ],
            [
                _number("learning_rate", "Learning Rate", 0.0, 0.1, 0.01, 0.01,
                        _LOSS_STEP_HELP, fmt="%.2f"),
            ],
        ],
    ),
    "Autoencoder": (
        "Autoencoder Parameters",
        [
            [
                _number("learning_rate", "Learning Rate", 0.0001, 0.1, 0.005, 0.001,
                        _LOSS_STEP_HELP, fmt="%.4f"),
                _number("weight_decay", "Weight Decay", 0.0000001, 0.0001, 0.0000001,
                        0.0000001,
                        "The regularization factor to prevent overfitting by penalizing large weights.",
                        fmt="%.7f"),
                _number("hidden_layer_features", "Hidden Layer Features", 4, 128, 8, 4,
                        "The number of features in the hidden layers of the neural network."),
            ],
            [
                _number("epochs", "Epochs (iterations)", 1, 200, 30, 5,
                        "The number of complete passes through the entire training dataset."),
                _toggle("cuda", "Use CUDA (GPU)", False,
                        "Check to use NVIDIA CUDA for GPU acceleration if available."),
                _choice("optimizer_name", "Optimizer", _OPTIMIZERS, 0,
                        "The optimization algorithm to use for training the model."),
            ],
            [
                _number("positive_threshold", "Positive Threshold", 1, 5, 3, 1,
                        "The minimum rating value considered as a 'positive' interaction."),
                _number("knn", "K-Nearest Neighbors (KNN)", 1, 50, 10, 1,
                        "The number of nearest neighbors to consider for KNN-based models."),
                _toggle("expl", "Enable Explanations", True,
                        "Check to enable model explanations or interpretability features."),
            ],
        ],
    ),
    "EMF": (
        "EMF (Explainable Matrix Factorisation) Parameters",
        [
            [
                _number("learning_rate", "Learning Rate", 0.0001, 0.1, 0.01, 0.001,
                        "The step size at each iteration for the EMF model.", fmt="%.4f"),
                _number("reg_term", "Regularization Term", 0.0001, 1.0, 0.001, 0.001,
                        "The regularization factor for the main matrix factorization components.",
                        fmt="%.4f"),
                _number("expl_reg_term", "Explanation Regularization Term", 0.0, 1.0, 0.0,
                        0.001,
                        "The regularization factor for the explanation components in EMF.",
                        fmt="%.4f"),
            ],
            [
                _number("latent_dim", "Latent Dimension", 10, 200, 80, 10,
                        "The number of latent factors used in the matrix factorization."),
                _number("epochs", "Epochs (iterations)", 1, 200, 10, 5,
                        "The number of complete passes through the entire training dataset for EMF."),
                _number("positive_threshold", "Positive Threshold", 1, 5, 3, 1,
                        "The minimum rating value considered as a 'positive' interaction for EMF."),
            ],
            [
                _number("knn", "K-Nearest Neighbors (KNN)", 1, 50, 10, 1,
                        "The number of nearest neighbors to consider for KNN-based aspects of EMF."),
            ],
        ],
    ),
    "GMF": (
        "GMF (Generalised Matrix Factorisation) Parameters",
        [
            [
                _number("learning_rate", "Learning Rate", 0.0001, 0.1, 0.005, 0.001,
                        "The step size at each iteration for the GMF model.", fmt="%.4f"),
                _number("weight_decay", "Weight Decay", 0.0000001, 0.0001, 0.0000001,
                        0.0000001,
                        "The regularization factor to prevent overfitting in GMF.",
                        fmt="%.7f"),
                _number("latent_dim", "Latent Dimension", 4, 128, 8, 4,
                        "The number of latent factors for users and items in GMF."),
            ],
            [
                _number("epochs", "Epochs (iterations)", 1, 200, 30, 5,
                        "The number of complete passes through the training data for GMF."),
                _number("num_negative", "Number of Negative Samples", 1, 100, 10, 1,
                        "The number of negative samples per positive interaction during training."),
                _number("batch_size", "Batch Size", 64, 4096, 1024, 64,
                        "The number of samples per gradient update."),
            ],
            [
                _toggle("cuda", "Use CUDA (GPU)", False,
                        "Check to use NVIDIA CUDA for GPU acceleration if available for GMF."),
                _choice("optimizer_name", "Optimizer", _OPTIMIZERS, 0,
                        "The optimization algorithm to use for training the GMF model."),
            ],
        ],
    ),
    "MLP": (
        "MLP (Multi-Layer Perceptron) Parameters",
        [
            [
                _number("learning_rate", "Learning Rate", 0.0001, 0.1, 0.005, 0.001,
                        "The step size at each iteration for the MLP model.", fmt="%.4f"),
                _number("weight_decay", "Weight Decay", 0.0000001, 0.0001, 0.0000001,
                        0.0000001,
                        "The regularization factor to prevent overfitting in MLP.",
                        fmt="%.7f"),
                _number("latent_dim", "Latent Dimension", 4, 128, 8, 4,
                        "The number of latent factors for users and items in MLP."),
            ],
            [
                _number("epochs", "Epochs (iterations)", 1, 200, 30, 5,
                        "The number of complete passes through the training data for MLP."),
                _number("num_negative", "Number of Negative Samples", 1, 100, 10, 1,
                        "The number of negative samples per positive interaction during MLP training."),
                _number("batch_size", "Batch Size", 64, 4096, 1024, 64,
                        "The number of samples per gradient update for MLP."),
            ],
            [
                _toggle("cuda", "Use CUDA (GPU)", False,
                        "Check to use NVIDIA CUDA for GPU acceleration if available for MLP."),
                _choice("optimizer_name", "Optimizer", _OPTIMIZERS, 0,
                        "The optimization algorithm to use for training the MLP model."),
            ],
        ],
    ),
    "KNN": (
        "KNN (K-Nearest Neighbors) Parameters",
        [
            [
                # Key kept as "k_neighbors": the training step renames it to
                # "k" before constructing the model.
                _number("k_neighbors", "Number of Neighbors (k)", 1, 100, 50, 1,
                        "The number of nearest neighbors to consider for making predictions."),
                _number("min_k_neighbors", "Minimum Number of Neighbors", 1, 20, 3, 1,
                        "The minimum number of neighbors required to make a prediction."),
                _choice("similarity_type", "Similarity Metric", ["cosine", "pearson"], 1,
                        "The similarity metric to use for finding nearest neighbors."),
            ],
            [
                _toggle("boolean_user_based", "User-Based Collaborative Filtering", True,
                        "Check to use user-based collaborative filtering; uncheck for item-based."),
            ],
        ],
    ),
    "SVD": (
        "SVD Parameters",
        [
            [
                _number("n_factors", "Latent Dimensions (factors)", 1, 100, N_FACTORS, 1,
                        "The number of latent factors to compute."),
                _number("n_epochs", "Epochs (iterations)", 1, 35, N_EPOCHS, 1,
                        "The number of model iterations."),
                # Explicit step/format: without them the widget falls back to
                # a 0.01 step and two displayed decimals, too coarse for a
                # 0.001-0.050 range. Key kept as "learning_rater": the
                # training step renames it to "lr".
                _number("learning_rater", "Learning Rate", 0.001, 0.050, LEARNING_RATE,
                        0.001, _LOSS_STEP_HELP, fmt="%.3f"),
            ],
            [
                _toggle("early_stopping", "Enable Early Stopping", False,
                        "Check to stop training when validation performance degrades."),
                _number("reg", "Regularization Term", 0.01, 0.50, REGULARIZATION, 0.01,
                        "The regularization factor.", fmt="%.3f"),
                _number("init_mean", "Initialization Mean", 0.0, 0.5, 0.0, 0.01,
                        "The mean for initializing latent factors.", fmt="%.2f"),
            ],
            [
                # NOTE(review): a default standard deviation of 0.0 would make
                # every initial factor equal to the mean if the model samples
                # from N(mean, std) - confirm this default is intended.
                _number("init_std", "Initialization Standard Deviation", 0.00, 0.5, 0.0,
                        0.01,
                        "The standard deviation for initializing latent factors.",
                        fmt="%.2f"),
                _number("random_state", "Random State (Seed)", 1, 100, RANDOM_STATE, 1,
                        "The seed for random number generation to ensure reproducibility."),
            ],
        ],
    ),
}

# Keyword arguments for the selected model, keyed by the names used above.
model_params = {}
if model_option in _MODEL_FORMS:
    _form_title, _form_rows = _MODEL_FORMS[model_option]
    st.subheader(_form_title)
    model_params = _render_hyperparameters(_form_rows)
else:
    st.info(f"Configuration for **{model_option}** is not yet implemented.")
    st.stop()
# Model Training
st.header("3. Train the Model")

# Selectbox label -> model class.
_MODEL_CLASSES = {
    "ALS": ALS,
    "BPR": BPR,
    "Autoencoder": ExplAutoencoderTorch,
    "EMF": EMFModel,
    "GMF": GMFModel,
    "MLP": MLPModel,
    "KNN": KNNBasic,
    "SVD": SVD,
}
# Form keys whose name differs from the constructor keyword.
_PARAM_RENAMES = {
    "KNN": {"k_neighbors": "k"},
    "SVD": {"learning_rater": "lr"},
}
# Never forwarded to a constructor, even if present in the form values.
_EXCLUDED_PARAMS = ("num_users", "num_items")

# A blank line separates the method list from the metrics heading so the
# heading is not rendered as a continuation of the last bullet.
_EVALUATION_INTRO = """
Choose your evaluation method:
- **Leave-One-Out**: More thorough but slower (recommended for final evaluation)
- **Train/Test Split**: Faster and practical for iterative testing

**Metrics Explained:**
- **Hit Ratio @10**: Percentage of users for whom we found at least one relevant item in top-10
- **NDCG @10**: Measures ranking quality - higher values mean better ranking of relevant items
"""


def _build_model(name, params):
    """Instantiate the model registered under ``name`` from form values.

    Args:
        name: One of the keys of ``_MODEL_CLASSES``.
        params: Mapping of form keys to values. It is not modified.

    Returns:
        A new, unfitted model instance, or ``None`` when ``name`` is unknown.
    """
    model_cls = _MODEL_CLASSES.get(name)
    if model_cls is None:
        return None
    renames = _PARAM_RENAMES.get(name, {})
    kwargs = {
        renames.get(key, key): value
        for key, value in params.items()
        if key not in _EXCLUDED_PARAMS
    }
    return model_cls(**kwargs)


if st.button("Train Model", type="primary"):
    with st.spinner(f"Training **{model_option}** model... This may take a moment."):
        try:
            # Retrieve the data_reader object from session state
            data_reader = st.session_state.data_reader

            # 1. Instantiate the model with user-defined hyperparameters
            model = _build_model(model_option, model_params)
            if model is not None:
                # 2. Fit the model using the processed dataset
                start_time = time.perf_counter()
                model.fit(data_reader)
                training_time = time.perf_counter() - start_time

                # 3. Store the trained model and the exact configuration it
                #    was built from, for evaluation and for the next page.
                st.session_state.trained_model = model
                st.session_state.model_name = model_option
                st.session_state.trained_model_params = dict(model_params)
                # Scores computed for a previously trained model no longer
                # describe the current one.
                if "evaluation_scores" in st.session_state:
                    del st.session_state.evaluation_scores
                st.success(
                    f"✅ **{model_option}** model trained successfully in {training_time:.2f} seconds!"
                )
        except Exception as e:
            st.error(f"An error occurred during model training: {e}")
            if "trained_model" in st.session_state:
                del st.session_state.trained_model

if "trained_model" in st.session_state:
    st.markdown("")
    st.header("4. Offline Model Evaluation")
    with st.expander("🔬 Run Model Evaluation", expanded=True):
        st.markdown(_EVALUATION_INTRO)

        # Evaluation method selection
        eval_method = st.radio(
            "Select Evaluation Method:",
            ["Train/Test Split (Fast)", "Leave-One-Out (Thorough)"],
            index=0,
        )

        # Parameters
        col1, col2 = st.columns(2)
        with col1:
            test_size = 0.2  # Default value
            if eval_method == "Train/Test Split (Fast)":
                test_size = st.slider("Test Set Size (%)", 10, 30, 20) / 100
            eval_top_n = st.number_input("Top-N for evaluation", 1, 20, 10)
        with col2:
            if eval_method == "Leave-One-Out (Thorough)":
                st.info("Leave-one-out will use 1 item per user for testing")

        # Run evaluation button (one key per method)
        eval_button_key = f"run_eval_{eval_method.replace(' ', '_').replace('(', '').replace(')', '')}"
        if st.button("Run Evaluation", key=eval_button_key, type="primary"):
            with st.spinner(
                f"Running {eval_method.lower()} evaluation... Please wait."
            ):
                try:
                    # Rebuild the model that was *trained*, not whatever the
                    # form widgets currently show.
                    model_name = st.session_state.model_name
                    data_reader = st.session_state.data_reader
                    trained_params = st.session_state.get(
                        "trained_model_params", model_params
                    )
                    eval_model = _build_model(model_name, trained_params)
                    if eval_model is None:
                        st.error(f"Evaluation not implemented for {model_name}")
                        st.stop()

                    # Run the appropriate evaluation
                    if eval_method == "Leave-One-Out (Thorough)":
                        evaluation_scores = run_leave_one_out_evaluation(
                            data_reader=data_reader,
                            model=eval_model,
                            top_n=eval_top_n,
                        )
                    else:  # Train/Test Split
                        evaluation_scores = run_evaluation_with_proper_split(
                            data_reader=data_reader,
                            model=eval_model,
                            test_size=test_size,
                            top_n=eval_top_n,
                        )

                    # Store results together with the settings that produced
                    # them, so later reruns label them correctly.
                    st.session_state.evaluation_scores = evaluation_scores
                    st.session_state.eval_method = eval_method
                    st.session_state.eval_top_n = eval_top_n
                except Exception as e:
                    st.error(f"Evaluation failed: {str(e)}")
                    st.exception(e)

        # Display results if available
        if "evaluation_scores" in st.session_state:
            st.markdown("")
            st.subheader("📊 Evaluation Results")
            scores = st.session_state.evaluation_scores
            method = st.session_state.get("eval_method", "")
            # Top-N the stored scores were computed with (falls back to the
            # live widget for scores stored without it).
            shown_top_n = st.session_state.get("eval_top_n", eval_top_n)
            hit_ratio = scores.get("Hit Ratio", 0.0)
            ndcg_value = scores.get("NDCG", scores.get("eNDCG", 0.0))

            # Metrics display
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric(
                    label=f"Hit Ratio @{shown_top_n}",
                    value=f"{hit_ratio:.2%}",
                    help=f"Percentage of test users for whom at least one relevant item was found in top-{shown_top_n}",
                )
            with col2:
                st.metric(
                    label=f"NDCG @{shown_top_n}",
                    value=f"{ndcg_value:.4f}",
                    help="Normalized Discounted Cumulative Gain - measures ranking quality",
                )
            with col3:
                st.metric(
                    label="Evaluation Time",
                    value=f"{scores.get('evaluation_time', 0):.1f}s",
                    help="Time taken to complete the evaluation",
                )

            # Additional info
            if "test_interactions" in scores:
                st.info(
                    f"📈 Evaluated on {scores['test_interactions']:,} test interactions using {method}"
                )

            # Performance interpretation: both metrics must clear a tier's
            # thresholds for that tier's message to be shown.
            st.markdown("### 🎯 Performance Interpretation")
            if hit_ratio > 0.15 and ndcg_value > 0.08:
                st.success(
                    "🎉 Excellent performance! Your model shows strong recommendation capability."
                )
            elif hit_ratio > 0.08 and ndcg_value > 0.04:
                st.success("✅ Good performance! Your model is working well.")
            elif hit_ratio > 0.03 and ndcg_value > 0.02:
                st.warning(
                    "⚠️ Moderate performance. Consider tuning hyperparameters or trying a different model."
                )
            else:
                st.error(
                    "❌ Poor performance. The model may need significant improvements."
                )

    st.info("Navigate to the **🎯 Group Recommendation** page to continue.")
+156
View File
@@ -0,0 +1,156 @@
import streamlit as st
import pandas as pd
from pygrex.recommender import GroupRecommender
from pygrex.utils import AggregationStrategy
st.set_page_config(page_title="Group Recommendation", page_icon="🎯", layout="wide")
st.title("🎯 Group Recommendation")

# Session State Checks
# Ensure data is loaded and a model is trained before proceeding.
if not st.session_state.get("data_loaded", False):
    st.warning("⚠️ Please load data on the **📄 Data Preparation** page first.")
    st.stop()
# Test for presence rather than truthiness: a model object is free to define
# its own __bool__/__len__, which must not make it look "untrained".
if st.session_state.get("trained_model") is None:
    st.warning("⚠️ Please train a model on the **🧠 Model Training** page first.")
    st.stop()

# Retrieve the objects stored by the earlier pages
data_reader = st.session_state.data_reader
group_handler = st.session_state.group_handler
model = st.session_state.trained_model
model_name = st.session_state.model_name
# Recommendation Setup
# Lets the user pick a group and an aggregation strategy. Leaves
# `selected_group_id`, `group_members`, `agg_strategy_enum` and `mrp_id`
# bound for the generation step, whatever the user selected.
st.header("1. Select a Group and Strategy")
group_filename = st.session_state.group_filename

# Only the file read is reported as a "could not read groups" failure.
try:
    available_groups = group_handler.read_groups(filename=group_filename)
except Exception as e:
    st.error(f"Could not read groups from file '{group_filename}'. Error: {e}")
    st.stop()

# Always bound, so later code can test it even when no group is selectable.
group_members = []
col1, col2 = st.columns(2)
with col1:
    selected_group_id = st.selectbox(
        "Choose a group:",
        options=available_groups,
        help="These groups were loaded from your group data file.",
    )
    # Parse and display members of the selected group
    if selected_group_id:
        try:
            group_members = group_handler.parse_group_members(selected_group_id)
        except Exception as e:
            st.error(
                f"Could not parse members of group '{selected_group_id}'. Error: {e}"
            )
            st.stop()
        st.write("👥 **Group Members:**", ", ".join(map(str, group_members)))
with col2:
    # Use the AggregationStrategy Enum to populate the selectbox
    agg_strategy_enum = st.selectbox(
        "Choose an aggregation strategy:",
        options=list(AggregationStrategy),
        format_func=lambda x: x.name.replace("_", " ").title(),
        help="Select the method for combining individual member preferences.",
    )
    # Conditional Input for Most Respected Person
    mrp_id = None
    if agg_strategy_enum == AggregationStrategy.MOST_RESPECTED_PERSON:
        if group_members:
            mrp_id = st.selectbox(
                "Select the Most Respected Person:",
                options=group_members,
                help="This user's preferences will solely determine the group recommendation.",
            )
        else:
            # Without members there is nobody to choose from.
            st.warning("Select a group before choosing the Most Respected Person.")
# Top-K Configuration
st.header("2. Specify Number of Recommendations")
# Length of the recommendation list requested from the recommender.
_TOP_K_SLIDER_OPTIONS = {
    "min_value": 1,
    "max_value": 50,
    "value": 10,
    "help": "Adjust the slider to change the length of the final recommendation list.",
}
top_k = st.slider("Number of items to recommend (Top-K):", **_TOP_K_SLIDER_OPTIONS)
# Generate Recommendations
st.header("3. Generate and View Recommendations")
_generate_clicked = st.button("Generate Group Recommendations", type="primary")

if _generate_clicked and not selected_group_id:
    # Nothing to recommend for without a group.
    st.warning("Please select a group first.")
elif _generate_clicked:
    with st.spinner("Generating recommendations..."):
        try:
            # 1. Instantiate the GroupRecommender
            group_recommender = GroupRecommender(data=data_reader)

            # 2. Setup the recommendation process
            setup_arguments = {
                "model": model,
                "members": group_members,  # type: ignore
                "data": data_reader,
                "aggregation_strategy": agg_strategy_enum,
                "most_respected_person": mrp_id,
            }
            group_recommender.setup_recommendation(**setup_arguments)

            # 3. Get the final recommendation list
            recommended_items = group_recommender.get_group_recommendations(
                top_k=top_k
            )

            # Keep both the recommender and its output for the explanation page
            st.session_state.group_recommender = group_recommender
            st.session_state.recommended_items = recommended_items
            st.success("✅ Recommendations generated successfully!")
        except Exception as e:
            st.error(f"An error occurred while generating recommendations: {e}")
# Display Results
# Renders the most recently generated recommendations kept in session state.
if "recommended_items" in st.session_state:
    st.markdown("")
    recommender = st.session_state.group_recommender
    recommended_items = list(st.session_state.recommended_items)  # type: ignore

    # Title the table by what is actually stored: the Top-K slider may have
    # been moved since these recommendations were generated.
    st.subheader(f"Top {len(recommended_items)} Recommended Items")
    scores = recommender.get_recommendation_scores()

    # One row per recommended item, ranked from 1; an item missing from
    # `scores` is shown with 0.0.
    rec_data = [
        {
            "Rank": rank,
            "Item ID": item_id,
            "Aggregated Score": scores.get(item_id, 0.0),
        }
        for rank, item_id in enumerate(recommended_items, start=1)
    ]
    if not rec_data:
        st.info("No recommendations were generated for this group.")
    else:
        st.dataframe(pd.DataFrame(rec_data), use_container_width=True, hide_index=True)

    # Show detailed individual predictions
    with st.expander("🔍 View Individual Predictions"):
        individual_preds = recommender.get_individual_predictions()
        if individual_preds:
            # Transpose so that each user becomes a row.
            df_preds = pd.DataFrame(individual_preds).T
            df_preds.index.name = "User ID"
            st.write(
                "Predicted scores (1-5 scale) for each user on items in the candidate pool:"
            )
            st.dataframe(df_preds.head(10))
        else:
            st.write("No individual predictions available.")

    st.info(
        "Navigate to the **💬 Explanation & Evaluation** page to analyze these recommendations."
    )
File diff suppressed because it is too large Load Diff