public code v1
This commit is contained in:
@@ -0,0 +1,162 @@
|
||||
import streamlit as st
|
||||
import pandas as pd
|
||||
import os
|
||||
from io import StringIO
|
||||
|
||||
# Library Imports
|
||||
from pygrex.data_reader import DataReader, GroupInteractionHandler
|
||||
|
||||
# Page chrome for the data-preparation step.
st.set_page_config(
    layout="wide",
    page_icon="📄",
    page_title="Data Preparation",
)
st.title("📄 Data Preparation")

# Bundled files used whenever the user does not provide their own.
DEFAULT_RATINGS_PATH = "datasets/stratigis/ratings.csv"
DEFAULT_GROUPS_PATH = "datasets/stratigis/groupsWithHighRatings5.txt"

# Seed the session slots once; the presence of "data_loaded" marks a
# session that has already been initialised.
if "data_loaded" not in st.session_state:
    _initial_state = {
        "data_loaded": False,
        "data_reader": None,
        "group_handler": None,
        "num_groups": 0,
    }
    for _slot, _initial_value in _initial_state.items():
        st.session_state[_slot] = _initial_value
|
||||
|
||||
# DATA INPUT SECTION

# Step 1: optional ratings upload. When nothing is uploaded the buffer is
# None and the loading step falls back to DEFAULT_RATINGS_PATH.
_RATINGS_BLURB = (
    "You can upload your own ratings file or use the default **MovieLens 100k** dataset."
)
st.header("1. Ratings Data")
st.markdown(_RATINGS_BLURB)
ratings_file_buffer = st.file_uploader(
    label="Upload Your Ratings Data (Optional)",
    type=["csv"],
)
|
||||
|
||||
# Group Input
#
# The user either types groups into a text area or uploads a group file.
# Both holders are bound up front so that later code can reference either
# name regardless of which input method is active.
st.header("2. Group Data")
group_input_method = st.radio(
    "Choose group input method:",
    ("Enter groups manually", "Upload a group file"),
    horizontal=True,
)

group_text_input = ""
groups_file_buffer = None

# Pre-fill the text area with the bundled groups, but only while the default
# ratings are in use (no ratings file uploaded).
default_group_text = ""
if os.path.exists(DEFAULT_GROUPS_PATH) and ratings_file_buffer is None:
    with open(DEFAULT_GROUPS_PATH, "r", encoding="utf-8") as f:
        default_group_text = f.read()

if group_input_method == "Enter groups manually":
    group_text_input = st.text_area(
        "Enter group members (one group per line, members separated by '_')",
        value=default_group_text,
        height=150,
    )
else:
    groups_file_buffer = st.file_uploader(
        "Upload Your Group Data (Optional)", type=["txt"]
    )
|
||||
|
||||
# Preprocessing Options
#
# `binary_threshold` is always bound (None when binarization is off) so the
# loading step never touches an undefined name.
st.header("3. Preprocessing")
binarize_data = st.checkbox(
    "Binarize ratings (for implicit feedback models)", value=True
)
binary_threshold = None
if binarize_data:
    binary_threshold = st.number_input(
        "Rating threshold for binarization", min_value=0.0, value=1.0, step=0.5
    )
|
||||
|
||||
# Main Loading Logic
#
# On button press: read the ratings (uploaded or default), materialise the
# group definitions on disk, build the pygrex reader/handler, and publish the
# results through st.session_state for the following pages.
st.header("4. Load and Process")
if st.button("Load and Process Data", type="primary"):
    with st.spinner("Processing data..."):
        try:
            desired_columns = ["userId", "itemId", "rating", "timestamp"]

            # Determine which ratings file to use
            if ratings_file_buffer is not None:
                ratings_df = pd.read_csv(
                    StringIO(ratings_file_buffer.getvalue().decode("utf-8")),
                    sep=",",
                    usecols=lambda column: column in desired_columns,
                )
                # Report missing columns explicitly instead of letting the
                # column selection below fail with an opaque KeyError.
                missing_columns = [
                    column
                    for column in desired_columns
                    if column not in ratings_df.columns
                ]
                if missing_columns:
                    raise ValueError(
                        "Uploaded ratings file is missing required column(s): "
                        + ", ".join(missing_columns)
                    )
            else:
                if not os.path.exists(DEFAULT_RATINGS_PATH):
                    st.error(
                        f"Default ratings file not found at: `{DEFAULT_RATINGS_PATH}`"
                    )
                    st.stop()
                # The default file's own header row is skipped and replaced
                # by the expected column names.
                ratings_df = pd.read_csv(
                    DEFAULT_RATINGS_PATH,
                    sep=",",
                    names=desired_columns,
                    skiprows=1,
                )
            # Enforce a fixed column order for the reader.
            ratings_df = ratings_df[desired_columns]

            # Determine which group data to use and prepare it for the handler
            # NOTE(review): this path is shared by every session of the app;
            # concurrent users would overwrite each other's groups - confirm
            # whether a per-session file is needed.
            temp_dir = "temp/group_data"
            os.makedirs(temp_dir, exist_ok=True)
            groups_filepath = os.path.join(temp_dir, "current_groups.txt")

            if group_input_method == "Enter groups manually":
                if not group_text_input.strip():
                    raise ValueError(
                        "No groups were entered. Provide at least one group "
                        "(members separated by '_')."
                    )
                with open(groups_filepath, "w", encoding="utf-8") as f:
                    f.write(group_text_input)
                st.session_state.group_filename = os.path.basename(groups_filepath)
            else:  # File upload method
                if groups_file_buffer is not None:
                    with open(groups_filepath, "wb") as f:
                        f.write(groups_file_buffer.getbuffer())
                    # NOTE(review): the bytes are stored as current_groups.txt
                    # while read_groups() below receives the uploaded file's
                    # original name - confirm the handler expects that.
                    st.session_state.group_filename = groups_file_buffer.name
                else:  # Fallback to default if no file is uploaded
                    if not os.path.exists(DEFAULT_GROUPS_PATH):
                        st.error(
                            f"Default groups file not found at: `{DEFAULT_GROUPS_PATH}`"
                        )
                        st.stop()
                    groups_filepath = DEFAULT_GROUPS_PATH
                    st.session_state.group_filename = os.path.basename(groups_filepath)

            # Instantiate library classes and process data
            data_reader = DataReader(dataframe=ratings_df)
            group_handler = GroupInteractionHandler(filepath_or_buffer=groups_filepath)

            if binarize_data:
                data_reader.binarize(binary_threshold=binary_threshold)
            data_reader.make_consecutive_ids_in_dataset()

            available_groups = group_handler.read_groups(
                filename=st.session_state.group_filename
            )

            # Store results in session state
            st.session_state.data_reader = data_reader
            st.session_state.group_handler = group_handler
            st.session_state.num_groups = len(available_groups)
            st.session_state.data_loaded = True

            # Anything trained or evaluated against the previous dataset no
            # longer matches the freshly loaded reader, so drop it and make
            # the user retrain.
            for stale_key in (
                "trained_model",
                "model_name",
                "evaluation_scores",
                "eval_method",
            ):
                if stale_key in st.session_state:
                    del st.session_state[stale_key]

            st.success("✅ Data loaded and processed successfully!")

        except Exception as e:
            # Top-level UI boundary: surface the problem and mark the data as
            # unavailable so the other pages refuse to continue.
            st.error(f"An error occurred: {e}")
            st.session_state.data_loaded = False
|
||||
|
||||
|
||||
# Summary panel: rendered only once a dataset has been loaded successfully.
if st.session_state.data_loaded:
    st.markdown("")
    st.header("Data Summary")

    reader = st.session_state.data_reader

    # Two columns of headline numbers: users/items on the left,
    # ratings/groups on the right.
    left_col, right_col = st.columns(2)
    left_col.metric("👥 Unique Users", f"{reader.num_user:,}")
    left_col.metric("📦 Unique Items", f"{reader.num_item:,}")
    right_col.metric("⭐ Total Ratings", f"{len(reader.get_raw_dataset()):,}")
    right_col.metric("👨👩👧👦 Number of Groups", f"{st.session_state.num_groups:,}")

    # Preview of the first rows of the reader's dataset.
    preview_panel = st.expander("Processed Ratings DataFrame Head:", expanded=True)
    with preview_panel:
        st.dataframe(reader.dataset.head(), hide_index=True)
|
||||
@@ -0,0 +1,956 @@
|
||||
import streamlit as st
|
||||
import time
|
||||
|
||||
# Library Imports
|
||||
from pygrex.models import (
|
||||
ALS,
|
||||
BPR,
|
||||
ExplAutoencoderTorch,
|
||||
EMFModel,
|
||||
GMFModel,
|
||||
MLPModel,
|
||||
SVD,
|
||||
KNNBasic,
|
||||
)
|
||||
from pygrex.evaluator import (
|
||||
run_leave_one_out_evaluation,
|
||||
run_evaluation_with_proper_split,
|
||||
)
|
||||
|
||||
# Page chrome for the model-training step.
st.set_page_config(
    layout="wide",
    page_icon="🧠",
    page_title="Model Training",
)
st.title("🧠 Model Selection & Training")

# Gate: nothing below makes sense without a loaded dataset.
data_is_ready = st.session_state.get("data_loaded", False)
if not data_is_ready:
    st.warning("⚠️ Please load data on the **📄 Data Preparation** page first.")
    st.stop()  # Halt this run; the user must visit the data page first.

# Step 1: pick the model. Extend this tuple when new models become available.
_MODEL_CHOICES = ("ALS", "BPR", "Autoencoder", "EMF", "GMF", "MLP", "KNN", "SVD")
st.header("1. Select a Model")
model_option = st.selectbox("Choose a recommendation model:", _MODEL_CHOICES)

# Step 2: hyperparameters. The branch matching `model_option` fills this in.
st.header("2. Configure Hyperparameters")
model_params = dict()
|
||||
|
||||
if model_option == "ALS":
    st.subheader("ALS (Alternating Least Squares) Parameters")

    # One row, three inputs; each value lands in the dict under the keyword
    # name that is later forwarded to the ALS constructor.
    als_params = {}
    factors_col, reg_col, epochs_col = st.columns(3)

    als_params["latent_dim"] = factors_col.number_input(
        "Latent Dimensions (factors)",
        min_value=1,
        max_value=500,
        value=100,
        step=10,
        help="The number of latent factors to compute.",
    )
    als_params["reg_term"] = reg_col.number_input(
        "Regularization Term",
        min_value=0.001,
        max_value=1.0,
        value=0.001,
        step=0.001,
        format="%.3f",
        help="The regularization factor.",
    )
    als_params["epochs"] = epochs_col.number_input(
        "Epochs (iterations)",
        min_value=1,
        max_value=200,
        value=10,
        step=5,
        help="The number of ALS iterations.",
    )

    model_params = als_params
|
||||
|
||||
|
||||
elif model_option == "BPR":
|
||||
st.subheader("BPR (Bayesian Personalised Ranking) Parameters")
|
||||
|
||||
# First Row
|
||||
col1_r1, col2_r1, col3_r1 = st.columns(3)
|
||||
with col1_r1:
|
||||
latent_dim = st.number_input(
|
||||
"Latent Dimensions (factors)",
|
||||
min_value=1,
|
||||
max_value=500,
|
||||
value=100,
|
||||
step=10,
|
||||
help="The number of latent factors to compute.",
|
||||
)
|
||||
with col2_r1:
|
||||
reg_term = st.number_input(
|
||||
"Regularization Term",
|
||||
min_value=0.001,
|
||||
max_value=1.0,
|
||||
value=0.001,
|
||||
step=0.001,
|
||||
format="%.3f",
|
||||
help="The regularization factor.",
|
||||
)
|
||||
with col3_r1:
|
||||
epochs = st.number_input(
|
||||
"Epochs (iterations)",
|
||||
min_value=1,
|
||||
max_value=200,
|
||||
value=10,
|
||||
step=5,
|
||||
help="The number of ALS iterations.",
|
||||
)
|
||||
|
||||
# Second Row
|
||||
col1_r2, col2_r2, col3_r2 = st.columns(3)
|
||||
|
||||
with col1_r2:
|
||||
learning_rate = st.number_input(
|
||||
"Learning Rate",
|
||||
min_value=0.0,
|
||||
max_value=0.1,
|
||||
value=0.01,
|
||||
step=0.01,
|
||||
format="%.2f",
|
||||
help="The step size at each iteration while moving toward a minimum of the loss function.",
|
||||
)
|
||||
model_params = {
|
||||
"latent_dim": latent_dim,
|
||||
"reg_term": reg_term,
|
||||
"epochs": epochs,
|
||||
"learning_rate": learning_rate,
|
||||
}
|
||||
|
||||
elif model_option == "Autoencoder":
|
||||
st.subheader("Autoencoder Parameters")
|
||||
# First Row
|
||||
col1_r1, col2_r1, col3_r1 = st.columns(3)
|
||||
|
||||
with col1_r1:
|
||||
learning_rate = st.number_input(
|
||||
"Learning Rate",
|
||||
min_value=0.0001,
|
||||
max_value=0.1,
|
||||
value=0.005,
|
||||
step=0.001,
|
||||
format="%.4f",
|
||||
help="The step size at each iteration while moving toward a minimum of the loss function.",
|
||||
)
|
||||
|
||||
with col2_r1:
|
||||
weight_decay = st.number_input(
|
||||
"Weight Decay",
|
||||
min_value=0.0000001,
|
||||
max_value=0.0001,
|
||||
value=0.0000001,
|
||||
step=0.0000001,
|
||||
format="%.7f",
|
||||
help="The regularization factor to prevent overfitting by penalizing large weights.",
|
||||
)
|
||||
|
||||
with col3_r1:
|
||||
hidden_layer_features = st.number_input(
|
||||
"Hidden Layer Features",
|
||||
min_value=4,
|
||||
max_value=128,
|
||||
value=8,
|
||||
step=4,
|
||||
help="The number of features in the hidden layers of the neural network.",
|
||||
)
|
||||
|
||||
# Second Row
|
||||
col1_r2, col2_r2, col3_r2 = st.columns(3)
|
||||
|
||||
with col1_r2:
|
||||
epochs = st.number_input(
|
||||
"Epochs (iterations)",
|
||||
min_value=1,
|
||||
max_value=200,
|
||||
value=30,
|
||||
step=5,
|
||||
help="The number of complete passes through the entire training dataset.",
|
||||
)
|
||||
|
||||
with col2_r2:
|
||||
cuda = st.checkbox(
|
||||
"Use CUDA (GPU)",
|
||||
value=False,
|
||||
help="Check to use NVIDIA CUDA for GPU acceleration if available.",
|
||||
)
|
||||
|
||||
with col3_r2:
|
||||
optimizer_name = st.selectbox(
|
||||
"Optimizer",
|
||||
options=["adam", "sgd", "rmsprop"],
|
||||
index=0, # 'adam'
|
||||
help="The optimization algorithm to use for training the model.",
|
||||
)
|
||||
|
||||
# Third Row
|
||||
col1_r3, col2_r3, col3_r3 = st.columns(3)
|
||||
|
||||
with col1_r3:
|
||||
positive_threshold = st.number_input(
|
||||
"Positive Threshold",
|
||||
min_value=1,
|
||||
max_value=5,
|
||||
value=3,
|
||||
step=1,
|
||||
help="The minimum rating value considered as a 'positive' interaction.",
|
||||
)
|
||||
|
||||
with col2_r3:
|
||||
knn = st.number_input(
|
||||
"K-Nearest Neighbors (KNN)",
|
||||
min_value=1,
|
||||
max_value=50,
|
||||
value=10,
|
||||
step=1,
|
||||
help="The number of nearest neighbors to consider for KNN-based models.",
|
||||
)
|
||||
|
||||
with col3_r3:
|
||||
expl = st.checkbox(
|
||||
"Enable Explanations",
|
||||
value=True,
|
||||
help="Check to enable model explanations or interpretability features.",
|
||||
)
|
||||
model_params = {
|
||||
"learning_rate": learning_rate,
|
||||
"weight_decay": weight_decay,
|
||||
"hidden_layer_features": hidden_layer_features,
|
||||
"epochs": epochs,
|
||||
"cuda": cuda,
|
||||
"optimizer_name": optimizer_name,
|
||||
"positive_threshold": positive_threshold,
|
||||
"knn": knn,
|
||||
"expl": expl,
|
||||
}
|
||||
|
||||
elif model_option == "EMF":
|
||||
st.subheader("EMF (Explainable Matrix Factorisation) Parameters")
|
||||
|
||||
# First Row
|
||||
col1_r1, col2_r1, col3_r1 = st.columns(3)
|
||||
|
||||
with col1_r1:
|
||||
learning_rate = st.number_input(
|
||||
"Learning Rate",
|
||||
min_value=0.0001,
|
||||
max_value=0.1,
|
||||
value=0.01,
|
||||
step=0.001,
|
||||
format="%.4f",
|
||||
help="The step size at each iteration for the EMF model.",
|
||||
)
|
||||
|
||||
with col2_r1:
|
||||
reg_term = st.number_input(
|
||||
"Regularization Term",
|
||||
min_value=0.0001,
|
||||
max_value=1.0,
|
||||
value=0.001,
|
||||
step=0.001,
|
||||
format="%.4f",
|
||||
help="The regularization factor for the main matrix factorization components.",
|
||||
)
|
||||
|
||||
with col3_r1:
|
||||
expl_reg_term = st.number_input(
|
||||
"Explanation Regularization Term",
|
||||
min_value=0.0,
|
||||
max_value=1.0,
|
||||
value=0.0,
|
||||
step=0.001,
|
||||
format="%.4f",
|
||||
help="The regularization factor for the explanation components in EMF.",
|
||||
)
|
||||
|
||||
# Second Row
|
||||
col1_r2, col2_r2, col3_r2 = st.columns(3)
|
||||
|
||||
with col1_r2:
|
||||
latent_dim = st.number_input(
|
||||
"Latent Dimension",
|
||||
min_value=10,
|
||||
max_value=200,
|
||||
value=80,
|
||||
step=10,
|
||||
help="The number of latent factors used in the matrix factorization.",
|
||||
)
|
||||
|
||||
with col2_r2:
|
||||
epochs = st.number_input(
|
||||
"Epochs (iterations)",
|
||||
min_value=1,
|
||||
max_value=200,
|
||||
value=10,
|
||||
step=5,
|
||||
help="The number of complete passes through the entire training dataset for EMF.",
|
||||
)
|
||||
|
||||
with col3_r2:
|
||||
positive_threshold = st.number_input(
|
||||
"Positive Threshold",
|
||||
min_value=1,
|
||||
max_value=5,
|
||||
value=3,
|
||||
step=1,
|
||||
help="The minimum rating value considered as a 'positive' interaction for EMF.",
|
||||
)
|
||||
|
||||
# Third Row
|
||||
col1_r3, col2_r3, col3_r3 = st.columns(3)
|
||||
|
||||
with col1_r3:
|
||||
knn = st.number_input(
|
||||
"K-Nearest Neighbors (KNN)",
|
||||
min_value=1,
|
||||
max_value=50,
|
||||
value=10,
|
||||
step=1,
|
||||
help="The number of nearest neighbors to consider for KNN-based aspects of EMF.",
|
||||
)
|
||||
model_params = {
|
||||
"learning_rate": learning_rate,
|
||||
"reg_term": reg_term,
|
||||
"expl_reg_term": expl_reg_term,
|
||||
"latent_dim": latent_dim,
|
||||
"epochs": epochs,
|
||||
"positive_threshold": positive_threshold,
|
||||
"knn": knn,
|
||||
}
|
||||
|
||||
elif model_option == "GMF":
|
||||
st.subheader("GMF (Generalised Matrix Factorisation) Parameters")
|
||||
|
||||
# First Row
|
||||
col1_r1, col2_r1, col3_r1 = st.columns(3)
|
||||
|
||||
with col1_r1:
|
||||
learning_rate = st.number_input(
|
||||
"Learning Rate",
|
||||
min_value=0.0001,
|
||||
max_value=0.1,
|
||||
value=0.005,
|
||||
step=0.001,
|
||||
format="%.4f",
|
||||
help="The step size at each iteration for the GMF model.",
|
||||
)
|
||||
|
||||
with col2_r1:
|
||||
weight_decay = st.number_input(
|
||||
"Weight Decay",
|
||||
min_value=0.0000001,
|
||||
max_value=0.0001,
|
||||
value=0.0000001,
|
||||
step=0.0000001,
|
||||
format="%.7f",
|
||||
help="The regularization factor to prevent overfitting in GMF.",
|
||||
)
|
||||
|
||||
with col3_r1:
|
||||
latent_dim = st.number_input(
|
||||
"Latent Dimension",
|
||||
min_value=4,
|
||||
max_value=128,
|
||||
value=8,
|
||||
step=4,
|
||||
help="The number of latent factors for users and items in GMF.",
|
||||
)
|
||||
|
||||
# Second Row
|
||||
col1_r2, col2_r2, col3_r2 = st.columns(3)
|
||||
|
||||
with col1_r2:
|
||||
epochs = st.number_input(
|
||||
"Epochs (iterations)",
|
||||
min_value=1,
|
||||
max_value=200,
|
||||
value=30,
|
||||
step=5,
|
||||
help="The number of complete passes through the training data for GMF.",
|
||||
)
|
||||
|
||||
with col2_r2:
|
||||
num_negative = st.number_input(
|
||||
"Number of Negative Samples",
|
||||
min_value=1,
|
||||
max_value=100,
|
||||
value=10,
|
||||
step=1,
|
||||
help="The number of negative samples per positive interaction during training.",
|
||||
)
|
||||
|
||||
with col3_r2:
|
||||
batch_size = st.number_input(
|
||||
"Batch Size",
|
||||
min_value=64,
|
||||
max_value=4096,
|
||||
value=1024,
|
||||
step=64,
|
||||
help="The number of samples per gradient update.",
|
||||
)
|
||||
|
||||
# Third Row
|
||||
col1_r3, col2_r3, col3_r3 = st.columns(3)
|
||||
|
||||
with col1_r3:
|
||||
cuda = st.checkbox(
|
||||
"Use CUDA (GPU)",
|
||||
value=False,
|
||||
help="Check to use NVIDIA CUDA for GPU acceleration if available for GMF.",
|
||||
)
|
||||
|
||||
with col2_r3:
|
||||
optimizer_name = st.selectbox(
|
||||
"Optimizer",
|
||||
options=["adam", "sgd", "rmsprop"],
|
||||
index=0, # 'adam'
|
||||
help="The optimization algorithm to use for training the GMF model.",
|
||||
)
|
||||
|
||||
# col3_r3 is left empty here if no further parameters for GMF
|
||||
model_params = {
|
||||
"learning_rate": learning_rate,
|
||||
"weight_decay": weight_decay,
|
||||
"latent_dim": latent_dim,
|
||||
"epochs": epochs,
|
||||
"num_negative": num_negative,
|
||||
"batch_size": batch_size,
|
||||
"cuda": cuda,
|
||||
"optimizer_name": optimizer_name,
|
||||
}
|
||||
|
||||
elif model_option == "MLP":
|
||||
st.subheader("MLP (Multi-Layer Perceptron) Parameters")
|
||||
|
||||
# First Row
|
||||
col1_r1, col2_r1, col3_r1 = st.columns(3)
|
||||
|
||||
with col1_r1:
|
||||
learning_rate = st.number_input(
|
||||
"Learning Rate",
|
||||
min_value=0.0001,
|
||||
max_value=0.1,
|
||||
value=0.005,
|
||||
step=0.001,
|
||||
format="%.4f",
|
||||
help="The step size at each iteration for the MLP model.",
|
||||
)
|
||||
|
||||
with col2_r1:
|
||||
weight_decay = st.number_input(
|
||||
"Weight Decay",
|
||||
min_value=0.0000001,
|
||||
max_value=0.0001,
|
||||
value=0.0000001,
|
||||
step=0.0000001,
|
||||
format="%.7f",
|
||||
help="The regularization factor to prevent overfitting in MLP.",
|
||||
)
|
||||
|
||||
with col3_r1:
|
||||
latent_dim = st.number_input(
|
||||
"Latent Dimension",
|
||||
min_value=4,
|
||||
max_value=128,
|
||||
value=8,
|
||||
step=4,
|
||||
help="The number of latent factors for users and items in MLP.",
|
||||
)
|
||||
|
||||
# Second Row
|
||||
col1_r2, col2_r2, col3_r2 = st.columns(3)
|
||||
|
||||
with col1_r2:
|
||||
epochs = st.number_input(
|
||||
"Epochs (iterations)",
|
||||
min_value=1,
|
||||
max_value=200,
|
||||
value=30,
|
||||
step=5,
|
||||
help="The number of complete passes through the training data for MLP.",
|
||||
)
|
||||
|
||||
with col2_r2:
|
||||
num_negative = st.number_input(
|
||||
"Number of Negative Samples",
|
||||
min_value=1,
|
||||
max_value=100,
|
||||
value=10,
|
||||
step=1,
|
||||
help="The number of negative samples per positive interaction during MLP training.",
|
||||
)
|
||||
|
||||
with col3_r2:
|
||||
batch_size = st.number_input(
|
||||
"Batch Size",
|
||||
min_value=64,
|
||||
max_value=4096,
|
||||
value=1024,
|
||||
step=64,
|
||||
help="The number of samples per gradient update for MLP.",
|
||||
)
|
||||
|
||||
# Third Row
|
||||
col1_r3, col2_r3, col3_r3 = st.columns(3)
|
||||
|
||||
with col1_r3:
|
||||
cuda = st.checkbox(
|
||||
"Use CUDA (GPU)",
|
||||
value=False,
|
||||
help="Check to use NVIDIA CUDA for GPU acceleration if available for MLP.",
|
||||
)
|
||||
|
||||
with col2_r3:
|
||||
optimizer_name = st.selectbox(
|
||||
"Optimizer",
|
||||
options=["adam", "sgd", "rmsprop"],
|
||||
index=0, # 'adam'
|
||||
help="The optimization algorithm to use for training the MLP model.",
|
||||
)
|
||||
|
||||
# col3_r3 is left empty here if no further parameters for MLP
|
||||
model_params = {
|
||||
"learning_rate": learning_rate,
|
||||
"weight_decay": weight_decay,
|
||||
"latent_dim": latent_dim,
|
||||
"epochs": epochs,
|
||||
"num_negative": num_negative,
|
||||
"batch_size": batch_size,
|
||||
"cuda": cuda,
|
||||
"optimizer_name": optimizer_name,
|
||||
}
|
||||
|
||||
elif model_option == "KNN":
|
||||
st.subheader("KNN (K-Nearest Neighbors) Parameters")
|
||||
|
||||
# First Row
|
||||
col1_r1, col2_r1, col3_r1 = st.columns(3)
|
||||
|
||||
with col1_r1:
|
||||
k_neighbors = st.number_input(
|
||||
"Number of Neighbors (k)",
|
||||
min_value=1,
|
||||
max_value=100,
|
||||
value=50,
|
||||
step=1,
|
||||
help="The number of nearest neighbors to consider for making predictions.",
|
||||
)
|
||||
|
||||
with col2_r1:
|
||||
min_k_neighbors = st.number_input(
|
||||
"Minimum Number of Neighbors",
|
||||
min_value=1,
|
||||
max_value=20,
|
||||
value=3,
|
||||
step=1,
|
||||
help="The minimum number of neighbors required to make a prediction.",
|
||||
)
|
||||
|
||||
with col3_r1:
|
||||
similarity_type = st.selectbox(
|
||||
"Similarity Metric",
|
||||
options=["cosine", "pearson"],
|
||||
index=1, # 'pearson'
|
||||
help="The similarity metric to use for finding nearest neighbors.",
|
||||
)
|
||||
|
||||
# Second Row
|
||||
col1_r2, col2_r2, col3_r2 = st.columns(3)
|
||||
|
||||
with col1_r2:
|
||||
boolean_user_based = st.checkbox(
|
||||
"User-Based Collaborative Filtering",
|
||||
value=True,
|
||||
help="Check to use user-based collaborative filtering; uncheck for item-based.",
|
||||
)
|
||||
|
||||
# col2_r2 and col3_r2 is left empty here if no further parameters for KNN
|
||||
model_params = {
|
||||
"k_neighbors": k_neighbors,
|
||||
"min_k_neighbors": min_k_neighbors,
|
||||
"similarity_type": similarity_type,
|
||||
"boolean_user_based": boolean_user_based,
|
||||
}
|
||||
|
||||
elif model_option == "SVD":
|
||||
st.subheader("SVD Parameters")
|
||||
N_FACTORS = 64
|
||||
N_EPOCHS = 30
|
||||
LEARNING_RATE = 0.005
|
||||
REGULARIZATION = 0.08
|
||||
RANDOM_STATE = 42
|
||||
# First Row
|
||||
col1_r1, col2_r1, col3_r1 = st.columns(3)
|
||||
with col1_r1:
|
||||
n_factors = st.number_input(
|
||||
"Latent Dimensions (factors)",
|
||||
min_value=1,
|
||||
max_value=100,
|
||||
value=64,
|
||||
step=1,
|
||||
help="The number of latent factors to compute.",
|
||||
)
|
||||
|
||||
with col2_r1:
|
||||
n_epochs = st.number_input(
|
||||
"Epochs (iterations)",
|
||||
min_value=1,
|
||||
max_value=35,
|
||||
value=30,
|
||||
step=1,
|
||||
help="The number of model iterations.",
|
||||
)
|
||||
|
||||
with col3_r1:
|
||||
learning_rater = st.number_input(
|
||||
"Learning Rate",
|
||||
min_value=0.001,
|
||||
max_value=0.050,
|
||||
value=0.005,
|
||||
help="The step size at each iteration while moving toward a minimum of the loss function.",
|
||||
)
|
||||
|
||||
# Second Row
|
||||
col1_r2, col2_r2, col3_r2 = st.columns(3)
|
||||
|
||||
with col1_r2:
|
||||
early_stopping = st.checkbox(
|
||||
"Enable Early Stopping",
|
||||
value=False,
|
||||
help="Check to stop training when validation performance degrades.",
|
||||
)
|
||||
with col2_r2:
|
||||
reg = st.number_input(
|
||||
"Regularization Term",
|
||||
min_value=0.01,
|
||||
max_value=0.50,
|
||||
value=0.08,
|
||||
step=0.01,
|
||||
format="%.3f",
|
||||
help="The regularization factor.",
|
||||
)
|
||||
with col3_r2:
|
||||
init_mean = st.number_input(
|
||||
"Initialization Mean",
|
||||
min_value=0.0,
|
||||
max_value=0.5,
|
||||
value=0.0,
|
||||
step=0.01,
|
||||
format="%.2f",
|
||||
help="The mean for initializing latent factors.",
|
||||
)
|
||||
# Third Row
|
||||
|
||||
col1_r3, col2_r3, col3_r3 = st.columns(3)
|
||||
with col1_r3:
|
||||
init_std = st.number_input(
|
||||
"Initialization Standard Deviation",
|
||||
min_value=0.00,
|
||||
max_value=0.5,
|
||||
value=0.0,
|
||||
step=0.01,
|
||||
format="%.2f",
|
||||
help="The standard deviation for initializing latent factors.",
|
||||
)
|
||||
with col2_r3:
|
||||
random_state = st.number_input(
|
||||
"Random State (Seed)",
|
||||
min_value=1,
|
||||
max_value=100,
|
||||
value=42,
|
||||
step=1,
|
||||
help="The seed for random number generation to ensure reproducibility.",
|
||||
)
|
||||
# col3_r3 is left empty here if no further parameters for SVD
|
||||
model_params = {
|
||||
"n_factors": n_factors,
|
||||
"n_epochs": n_epochs,
|
||||
"learning_rater": learning_rater,
|
||||
"reg": reg,
|
||||
"init_mean": init_mean,
|
||||
"init_std": init_std,
|
||||
"random_state": random_state,
|
||||
"early_stopping": early_stopping,
|
||||
}
|
||||
else:
|
||||
st.info(f"Configuration for **{model_option}** is not yet implemented.")
|
||||
st.stop()
|
||||
|
||||
# Model Training
#
# On button press: build the selected model from the form values, fit it on
# the loaded data, and publish it through st.session_state for later pages.
st.header("3. Train the Model")

if st.button("Train Model", type="primary"):
    with st.spinner(f"Training **{model_option}** model... This may take a moment."):
        try:
            # Retrieve the data_reader object from session state
            data_reader = st.session_state.data_reader

            # Constructor for every selectable model.
            model_classes = {
                "ALS": ALS,
                "BPR": BPR,
                "Autoencoder": ExplAutoencoderTorch,
                "EMF": EMFModel,
                "GMF": GMFModel,
                "MLP": MLPModel,
                "KNN": KNNBasic,
                "SVD": SVD,
            }
            # Form keys whose name differs from the constructor keyword.
            renamed_kwargs = {
                "KNN": {"k_neighbors": "k"},
                "SVD": {"learning_rater": "lr"},
            }

            # 1. Instantiate the model with user-defined hyperparameters.
            #    The kwargs are built as a copy so `model_params` itself is
            #    left untouched for the rest of this script run.
            model = None
            model_class = model_classes.get(model_option)
            if model_class is not None:
                renames = renamed_kwargs.get(model_option, {})
                ctor_kwargs = {
                    renames.get(name, name): value
                    for name, value in model_params.items()
                    if name not in ("num_users", "num_items")
                }
                model = model_class(**ctor_kwargs)

            # Explicit None test: a model object must not be skipped merely
            # because it happens to evaluate as falsy.
            if model is not None:
                start_time = time.perf_counter()
                # 2. Fit the model using the processed dataset
                model.fit(data_reader)
                training_time = time.perf_counter() - start_time

                # 3. Store the trained model in session state for the next page
                st.session_state.trained_model = model
                st.session_state.model_name = model_option

                # Scores computed for a previously trained model would
                # otherwise still be displayed next to the new one.
                for stale_key in ("evaluation_scores", "eval_method"):
                    if stale_key in st.session_state:
                        del st.session_state[stale_key]

                st.success(
                    f"✅ **{model_option}** model trained successfully in {training_time:.2f} seconds!"
                )

        except Exception as e:
            # Top-level UI boundary: report the failure and make sure no
            # trained model is left behind in session state.
            st.error(f"An error occurred during model training: {e}")
            if "trained_model" in st.session_state:
                del st.session_state.trained_model
|
||||
|
||||
# Offline evaluation of the trained model.
#
# A fresh, unfitted instance of the trained model's class is built from the
# current form values and handed to one of the pygrex evaluation helpers.
# Scores, the evaluation method, and the Top-N used are kept in
# st.session_state so the results stay visible across reruns.
if "trained_model" in st.session_state:
    st.markdown("")
    st.header("4. Offline Model Evaluation")

    with st.expander("🔬 Run Model Evaluation", expanded=True):
        st.markdown(
            "\n".join(
                [
                    "Choose your evaluation method:",
                    "- **Leave-One-Out**: More thorough but slower (recommended for final evaluation)",
                    "- **Train/Test Split**: Faster and practical for iterative testing",
                    "",
                    "**Metrics Explained:**",
                    "- **Hit Ratio @N**: Percentage of users for whom we found at least one relevant item in top-N",
                    "- **NDCG @N**: Measures ranking quality - higher values mean better ranking of relevant items",
                ]
            )
        )

        # Evaluation method selection
        eval_method = st.radio(
            "Select Evaluation Method:",
            ["Train/Test Split (Fast)", "Leave-One-Out (Thorough)"],
            index=0,
        )

        # Parameters
        col1, col2 = st.columns(2)
        with col1:
            test_size = 0.2  # Default value
            if eval_method == "Train/Test Split (Fast)":
                test_size = st.slider("Test Set Size (%)", 10, 30, 20) / 100
            eval_top_n = st.number_input("Top-N for evaluation", 1, 20, 10)

        with col2:
            if eval_method == "Leave-One-Out (Thorough)":
                st.info("Leave-one-out will use 1 item per user for testing")

        # Run evaluation button (the key depends on the chosen method)
        eval_button_key = f"run_eval_{eval_method.replace(' ', '_').replace('(', '').replace(')', '')}"

        if st.button("Run Evaluation", key=eval_button_key, type="primary"):
            model_name = st.session_state.get("model_name", model_option)

            if model_name != model_option:
                # `model_params` always belongs to the model currently
                # selected in the form; evaluating with it while a different
                # model is the trained one would mix the two up.
                st.warning(
                    f"The trained model is **{model_name}** but **{model_option}** is "
                    "currently selected. Select the trained model again, or train the "
                    "selected one, before running the evaluation."
                )
            else:
                with st.spinner(
                    f"Running {eval_method.lower()} evaluation... Please wait."
                ):
                    try:
                        data_reader = st.session_state.data_reader

                        # Constructor for every selectable model.
                        model_classes = {
                            "ALS": ALS,
                            "BPR": BPR,
                            "Autoencoder": ExplAutoencoderTorch,
                            "EMF": EMFModel,
                            "GMF": GMFModel,
                            "MLP": MLPModel,
                            "KNN": KNNBasic,
                            "SVD": SVD,
                        }
                        # Form keys whose name differs from the constructor keyword.
                        renamed_kwargs = {
                            "KNN": {"k_neighbors": "k"},
                            "SVD": {"learning_rater": "lr"},
                        }

                        model_class = model_classes.get(model_name)
                        if model_class is None:
                            st.error(f"Evaluation not implemented for {model_name}")
                            st.stop()

                        # Re-instantiate the model from the form values.
                        # NOTE(review): these are the values currently shown
                        # in the form; if they were edited after training they
                        # differ from the trained model's hyperparameters.
                        renames = renamed_kwargs.get(model_name, {})
                        ctor_kwargs = {
                            renames.get(name, name): value
                            for name, value in model_params.items()
                            if name not in ("num_users", "num_items")
                        }
                        eval_model = model_class(**ctor_kwargs)

                        # Run the appropriate evaluation
                        if eval_method == "Leave-One-Out (Thorough)":
                            evaluation_scores = run_leave_one_out_evaluation(
                                data_reader=data_reader,
                                model=eval_model,
                                top_n=eval_top_n,
                            )
                        else:  # Train/Test Split
                            evaluation_scores = run_evaluation_with_proper_split(
                                data_reader=data_reader,
                                model=eval_model,
                                test_size=test_size,
                                top_n=eval_top_n,
                            )

                        # Store results together with the N they were
                        # computed for, so the labels below stay truthful
                        # when the Top-N widget is changed afterwards.
                        st.session_state.evaluation_scores = evaluation_scores
                        st.session_state.eval_method = eval_method
                        st.session_state.eval_top_n = eval_top_n

                    except Exception as e:
                        st.error(f"Evaluation failed: {str(e)}")
                        st.exception(e)

        # Display results if available
        if "evaluation_scores" in st.session_state:
            st.markdown("")
            st.subheader("📊 Evaluation Results")

            scores = st.session_state.evaluation_scores
            method = st.session_state.get("eval_method", "")
            # Fall back to the live widget value for scores stored without N.
            shown_top_n = st.session_state.get("eval_top_n", eval_top_n)

            # Accept either key for the ranking metric.
            ndcg_value = scores.get("NDCG", scores.get("eNDCG", 0.0))
            hit_ratio = scores.get("Hit Ratio", 0.0)

            # Metrics display
            col1, col2, col3 = st.columns(3)

            with col1:
                st.metric(
                    label=f"Hit Ratio @{shown_top_n}",
                    value=f"{hit_ratio:.2%}",
                    help=f"Percentage of test users for whom at least one relevant item was found in top-{shown_top_n}",
                )

            with col2:
                st.metric(
                    label=f"NDCG @{shown_top_n}",
                    value=f"{ndcg_value:.4f}",
                    help="Normalized Discounted Cumulative Gain - measures ranking quality",
                )

            with col3:
                st.metric(
                    label="Evaluation Time",
                    value=f"{scores.get('evaluation_time', 0):.1f}s",
                    help="Time taken to complete the evaluation",
                )

            # Additional info
            if "test_interactions" in scores:
                st.info(
                    f"📈 Evaluated on {scores['test_interactions']:,} test interactions using {method}"
                )

            # Performance interpretation: both metrics must clear a tier's
            # thresholds for that tier's message to be shown.
            st.markdown("### 🎯 Performance Interpretation")

            if hit_ratio > 0.15 and ndcg_value > 0.08:
                st.success(
                    "🎉 Excellent performance! Your model shows strong recommendation capability."
                )
            elif hit_ratio > 0.08 and ndcg_value > 0.04:
                st.success("✅ Good performance! Your model is working well.")
            elif hit_ratio > 0.03 and ndcg_value > 0.02:
                st.warning(
                    "⚠️ Moderate performance. Consider tuning hyperparameters or trying a different model."
                )
            else:
                st.error(
                    "❌ Poor performance. The model may need significant improvements."
                )

    st.info("Navigate to the **🎯 Group Recommendation** page to continue.")
|
||||
@@ -0,0 +1,156 @@
|
||||
import streamlit as st
|
||||
import pandas as pd
|
||||
|
||||
from pygrex.recommender import GroupRecommender
|
||||
from pygrex.utils import AggregationStrategy
|
||||
|
||||
st.set_page_config(
    page_title="Group Recommendation",
    page_icon="🎯",
    layout="wide",
)
st.title("🎯 Group Recommendation")

# Session State Checks
# Each entry pairs a session-state flag with the warning shown when it is
# missing or falsy; the page halts at the first unmet prerequisite.
_PREREQUISITES = (
    ("data_loaded", "⚠️ Please load data on the **📄 Data Preparation** page first."),
    ("trained_model", "⚠️ Please train a model on the **🧠 Model Training** page first."),
)
for _state_key, _warning_text in _PREREQUISITES:
    if not st.session_state.get(_state_key, False):
        st.warning(_warning_text)
        st.stop()

# Retrieve objects from session state
_session = st.session_state
data_reader = _session.data_reader
group_handler = _session.group_handler
model = _session.trained_model
model_name = _session.model_name
|
||||
|
||||
# Recommendation Setup
st.header("1. Select a Group and Strategy")

group_filename = st.session_state.group_filename

# Bind group_members up front. It used to be assigned only when a group was
# selected, so choosing the "most respected person" strategy with no group
# raised a NameError that the handler below misreported as a file-read failure.
group_members = []

try:
    available_groups = group_handler.read_groups(filename=group_filename)

    col1, col2 = st.columns(2)
    with col1:
        selected_group_id = st.selectbox(
            "Choose a group:",
            options=available_groups,
            help="These groups were loaded from your group data file.",
        )

        # Parse and display members of the selected group
        if selected_group_id:
            group_members = group_handler.parse_group_members(selected_group_id)
            st.write("👥 **Group Members:**", ", ".join(map(str, group_members)))

    with col2:
        # Use the AggregationStrategy Enum to populate the selectbox
        agg_strategy_enum = st.selectbox(
            "Choose an aggregation strategy:",
            options=list(AggregationStrategy),
            format_func=lambda x: x.name.replace("_", " ").title(),
            help="Select the method for combining individual member preferences.",
        )

        # Conditional Input for Most Respected Person
        # With no group selected the option list is empty and mrp_id stays None.
        mrp_id = None
        if agg_strategy_enum == AggregationStrategy.MOST_RESPECTED_PERSON:
            mrp_id = st.selectbox(
                "Select the Most Respected Person:",
                options=group_members,
                help="This user's preferences will solely determine the group recommendation.",
            )

except Exception as e:
    # Page-level boundary: surface the problem to the user and halt the script
    # so later sections never run with half-initialised selections.
    st.error(f"Could not read groups from file '{group_filename}'. Error: {e}")
    st.stop()
|
||||
|
||||
# Top-K Configuration
st.header("2. Specify Number of Recommendations")

# Bounds and default of the Top-K slider, kept together for easy tweaking.
_top_k_range = {"min_value": 1, "max_value": 50, "value": 10}
top_k = st.slider(
    "Number of items to recommend (Top-K):",
    help="Adjust the slider to change the length of the final recommendation list.",
    **_top_k_range,
)
|
||||
|
||||
# Generate Recommendations
st.header("3. Generate and View Recommendations")

if st.button("Generate Group Recommendations", type="primary"):
    if selected_group_id:
        with st.spinner("Generating recommendations..."):
            try:
                # Step 1: build a recommender bound to the loaded dataset.
                group_recommender = GroupRecommender(data=data_reader)

                # Step 2: configure it with the trained model, the chosen group
                # and the aggregation settings picked above.
                setup_options = {
                    "model": model,
                    "members": group_members,
                    "data": data_reader,
                    "aggregation_strategy": agg_strategy_enum,
                    "most_respected_person": mrp_id,
                }
                group_recommender.setup_recommendation(**setup_options)

                # Step 3: ask for the final ranked list.
                top_items = group_recommender.get_group_recommendations(top_k=top_k)

                # Keep both the recommender and its output in session state so
                # they remain available after this script run.
                st.session_state.group_recommender = group_recommender
                st.session_state.recommended_items = top_items

                st.success("✅ Recommendations generated successfully!")
            except Exception as e:
                st.error(f"An error occurred while generating recommendations: {e}")
    else:
        st.warning("Please select a group first.")
|
||||
|
||||
|
||||
# Display Results
if "recommended_items" in st.session_state:
    st.markdown("")

    # Work on a concrete list; a stored None is treated as "no recommendations"
    # instead of crashing the iteration below.
    stored_items = st.session_state.recommended_items
    stored_items = [] if stored_items is None else list(stored_items)

    # Label the table with the number of items actually stored. The live slider
    # value may have changed since the list was generated, so it is only used
    # as a fallback when the list is empty.
    shown_count = len(stored_items) or top_k
    st.subheader(f"Top {shown_count} Recommended Items")

    recommender = st.session_state.group_recommender
    scores = recommender.get_recommendation_scores()

    # Create a DataFrame for nice display; items without a score show 0.0.
    rec_data = [
        {
            "Rank": rank,
            "Item ID": item_id,
            "Aggregated Score": scores.get(item_id, 0.0),
        }
        for rank, item_id in enumerate(stored_items, start=1)
    ]

    if not rec_data:
        st.info("No recommendations were generated for this group.")
    else:
        st.dataframe(pd.DataFrame(rec_data), use_container_width=True, hide_index=True)

    # Show detailed individual predictions
    with st.expander("🔍 View Individual Predictions"):
        individual_preds = recommender.get_individual_predictions()
        if individual_preds:
            # Convert to a more readable DataFrame
            df_preds = pd.DataFrame(
                individual_preds
            ).T  # Transpose to have users as rows
            df_preds.index.name = "User ID"
            st.write(
                "Predicted scores (1-5 scale) for each user on items in the candidate pool:"
            )
            # Only the first ten rows are shown to keep the preview compact.
            st.dataframe(df_preds.head(10))
        else:
            st.write("No individual predictions available.")

    st.info(
        "Navigate to the **💬 Explanation & Evaluation** page to analyze these recommendations."
    )
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user