Source code for graforvfl.network.gfo_rvfl_comparator

#!/usr/bin/env python
# Created by "Thieu" at 15:44, 03/04/2025 ----------%                                                                               
#       Email: nguyenthieu2102@gmail.com            %                                                    
#       Github: https://github.com/thieu1995        %                         
# --------------------------------------------------%

import time
import pickle
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from graforvfl.network.gfo_rvfl_cv import GfoRvflCV


class GfoRvflComparator:
    """
    Compare several optimizers on the same data using the GfoRvflCV model.

    For every optimizer in ``optim_list`` the comparator trains ``n_trials``
    GfoRvflCV models (one per random seed), scores each on a test set, and can
    write the results to CSV/pickle files that the ``plot_*`` methods read back.

    Attributes:
        problem_type (str): Forwarded to GfoRvflCV; the value "classification"
            also switches the y-axis label of the loss plots to "Accuracy".
        bounds: Hyper-parameter bounds, forwarded unchanged to GfoRvflCV.
        optim_list (list): Optimizers to compare.
        optim_params_list (list): Parameters for each optimizer, aligned
            one-to-one with ``optim_list``.
        scoring (str): Scoring metric forwarded to GfoRvflCV.
        cv: Cross-validation setting forwarded to GfoRvflCV.
        seed: Seed of the generator used to draw the per-trial seeds.
        verbose (bool): Verbosity flag forwarded to GfoRvflCV.
        kwargs (dict): Extra keyword arguments forwarded to GfoRvflCV.
        generator (numpy.random.Generator): Generator built from ``seed``.
        optimizer_names (list): Optimizer names (``model.optim.name``) recorded
            by the most recent call to ``run``.
    """

    def __init__(self, problem_type="regression", bounds=None, optim_list=None,
                 optim_params_list=None, scoring="MSE", cv=None, seed=None,
                 verbose=True, **kwargs):
        """
        Store the comparison settings and create the seed generator.

        Args:
            problem_type (str): Type of problem, 'regression' or 'classification'.
            bounds: Bounds for the hyperparameters (passed through to GfoRvflCV).
            optim_list (list): Optimizers to compare. Required.
            optim_params_list (list): Parameters for each optimizer. Required and
                must have the same length as ``optim_list``.
            scoring (str): Scoring metric to evaluate the model.
            cv: Cross-validation setting (passed through to GfoRvflCV).
            seed: Random seed for reproducibility.
            verbose (bool): Verbosity mode.
            **kwargs: Additional keyword arguments for GfoRvflCV.

        Raises:
            ValueError: If either list is missing or their lengths differ.
        """
        # The signature defaults are None; fail with a clear message instead of
        # the opaque "object of type 'NoneType' has no len()" TypeError.
        if optim_list is None or optim_params_list is None:
            raise ValueError("Both optim_list and optim_params_list must be provided.")
        if len(optim_list) != len(optim_params_list):
            raise ValueError("Length of optim_list and optim_params_list must be the same.")
        self.problem_type = problem_type
        self.bounds = bounds
        self.optim_list = optim_list
        self.optim_params_list = optim_params_list
        self.scoring = scoring
        self.cv = cv
        self.seed = seed
        self.verbose = verbose
        self.kwargs = kwargs
        self.generator = np.random.default_rng(seed)
        self.optimizer_names = []

    def _draw_seeds(self, n_trials):
        """Draw ``n_trials`` distinct trial seeds from the comparator's generator."""
        # Seeds come from 1..99 as before; the pool only grows when more than
        # 99 trials are requested, where sampling without replacement would
        # otherwise raise inside NumPy.
        upper = max(100, n_trials + 1)
        return self.generator.choice(range(1, upper), n_trials, replace=False)

    @staticmethod
    def _resolve_fig_size(fig_size):
        """Return ``fig_size`` if it is a tuple or list, otherwise the default (7, 5)."""
        return fig_size if isinstance(fig_size, (tuple, list)) else (7, 5)

    @staticmethod
    def _finish_figure(fig, path_save, stem, exts, verbose):
        """Save ``fig`` once per extension, then either show it or close it."""
        for ext in exts:
            fig.savefig(f"{path_save}/{stem}{ext}")
        if verbose:
            plt.show()
        else:
            # Figures that are never shown would otherwise stay registered in
            # pyplot and accumulate across calls.
            plt.close(fig)

    def run(self, X_train, y_train, X_test, y_test, n_trials=3,
            list_metrics=("MSE", "NSE", "KGE", "R", "MAE"),
            save_results=True, save_models=False, path_save="history"):
        """
        Run the comparison across all optimizers.

        Every optimizer is trained ``n_trials`` times; the same list of seeds is
        reused for each optimizer. ``optimizer_names`` is reset at the start of
        every call so it always describes the latest run.

        Files written (all inside ``path_save``):
            * ``<optimizer>-loss_train.csv`` per optimizer (if ``save_results``),
              one column per trial; shorter histories are padded with NaN.
            * ``<optimizer>-trial_<i>-model.pkl`` per trial (if ``save_models``).
            * ``metric_results_full.csv``, ``metric_results_mean.csv``,
              ``metric_results_std.csv`` and ``metric_results_unfold.csv``
              (if ``save_results`` and at least one trial was run).
        File names are keyed by optimizer name only, so two entries that resolve
        to the same name overwrite each other's files.

        Args:
            X_train (array-like): Training data features.
            y_train (array-like): Training data labels.
            X_test (array-like): Testing data features.
            y_test (array-like): Testing data labels.
            n_trials (int): Trials per optimizer; values below 1 are treated as 1.
            list_metrics (tuple): Metrics to evaluate on the test set.
            save_results (bool): Whether to save the results to files.
            save_models (bool): Whether to pickle the trained models.
            path_save (str): Directory for results and models. It is created
                only when something is actually going to be saved.

        Returns:
            tuple: ``(list_trained_models, list_loss_train, results)`` where the
            first two hold one entry per optimizer (a list of models and a dict
            ``{"trial_<i>": loss history}``) and ``results`` is a DataFrame with
            one row per trial.
        """
        if save_results or save_models:
            Path(path_save).mkdir(parents=True, exist_ok=True)
        if n_trials < 1:
            n_trials = 1
        seed_list = self._draw_seeds(n_trials)

        # Start from a clean slate so repeated runs do not duplicate names
        # (which would duplicate every curve in the loss plots).
        self.optimizer_names = []

        metric_results = []     # one nested record per trial (full detail)
        metric_unfold = []      # one flat record per trial (CSV friendly)
        list_trained_models = []
        list_loss_train = []

        for optim, params in zip(self.optim_list, self.optim_params_list):
            print(f"Running optimizer: {optim} with params: {params}")
            trial_models = []
            trial_loss_train = {}
            optim_name = None
            for idx_trial, seed in enumerate(seed_list):
                print(f"\tTrial {idx_trial+1}/{n_trials} with seed: {seed}...")

                # Time model construction and fitting together.
                start_time = time.perf_counter()

                # Build the model with the current optimizer.
                model = GfoRvflCV(problem_type=self.problem_type, bounds=self.bounds,
                                  optim=optim, optim_params=params,
                                  scoring=self.scoring, cv=self.cv, seed=seed,
                                  verbose=self.verbose, **self.kwargs)
                # Train the model.
                model.fit(X_train, y_train)

                elapsed_time = time.perf_counter() - start_time
                scores = model.best_estimator.scores(X_test, y_test, list_metrics=list_metrics)
                optim_name = model.optim.name

                # Store the full result of this trial.
                metric_results.append({
                    "optimizer": optim_name,
                    "trial": idx_trial,
                    "best_scores": scores,
                    "best_params": model.best_params,
                    "optimizer_params": params,
                    "time_seconds": elapsed_time,
                })
                trial_models.append(model)
                trial_loss_train[f"trial_{idx_trial}"] = model.loss_train

                # Flattened record: one column per metric, for the CSV files.
                metric_unfold.append({"optimizer": optim_name, "trial": idx_trial,
                                      "time_seconds": elapsed_time, **scores})

                # Remember the optimizer name once, for the plot methods.
                if idx_trial == 0:
                    self.optimizer_names.append(optim_name)

            list_trained_models.append(trial_models)
            list_loss_train.append(trial_loss_train)

            if save_results:
                # Wrapping each history in a Series lets histories of different
                # lengths share one frame (missing tail values become NaN).
                loss_frame = pd.DataFrame({key: pd.Series(values)
                                           for key, values in trial_loss_train.items()})
                loss_frame.to_csv(f"{path_save}/{optim_name}-loss_train.csv", index=False)
            if save_models:
                for idx, trained in enumerate(trial_models):
                    with open(f"{path_save}/{trained.optim.name}-trial_{idx}-model.pkl", "wb") as ff:
                        pickle.dump(trained, ff)

        # With no optimizers there is nothing to aggregate, and grouping an
        # empty frame by "optimizer" would raise a KeyError.
        if save_results and metric_unfold:
            met = list(list_metrics) + ["time_seconds"]
            pd.DataFrame(metric_results).to_csv(f"{path_save}/metric_results_full.csv", index=False)
            df = pd.DataFrame(metric_unfold)
            df_mean = df.groupby("optimizer")[met].mean().reset_index()
            df_std = df.groupby("optimizer")[met].std().reset_index()
            df_mean.to_csv(f"{path_save}/metric_results_mean.csv", index=False)
            df_std.to_csv(f"{path_save}/metric_results_std.csv", index=False)
            df.to_csv(f"{path_save}/metric_results_unfold.csv", index=False)

        return list_trained_models, list_loss_train, pd.DataFrame(metric_results)

    def plot_loss_train_per_trial(self, path_read="history", path_save="history",
                                  fig_size=(7, 5), exts=(".png", ".pdf"), verbose=False):
        """
        Plot one figure per trial comparing the training curves of all optimizers.

        Reads ``<optimizer>-loss_train.csv`` for every name in
        ``optimizer_names`` and saves ``loss_train_trial_<i><ext>``.

        Args:
            path_read (str): Directory holding the loss_train CSV files.
            path_save (str): Directory where the figures are saved (created if missing).
            fig_size (tuple): Figure size; anything but a tuple/list falls back to (7, 5).
            exts (tuple): File extensions to save each figure with.
            verbose (bool): If True, also display each figure with ``plt.show()``.

        Raises:
            ValueError: If ``optimizer_names`` is empty (``run`` not called yet).
        """
        if not self.optimizer_names:
            raise ValueError("No optimizer results available; call run() before plotting.")
        y_label = "Accuracy" if self.problem_type == "classification" else "Loss"
        Path(path_save).mkdir(parents=True, exist_ok=True)
        fig_size = self._resolve_fig_size(fig_size)

        dfs = [pd.read_csv(f"{path_read}/{optim_name}-loss_train.csv")
               for optim_name in self.optimizer_names]

        # Number of trials, taken from the first optimizer (assumes every
        # optimizer's file has the same trial columns).
        n_trials = dfs[0].shape[1]

        for idx in range(n_trials):
            fig = plt.figure(figsize=fig_size)
            for df, name in zip(dfs, self.optimizer_names):
                plt.plot(df[f"trial_{idx}"], label=name)
            plt.title(f"Trial {idx}")
            plt.xlabel("Iteration")
            plt.ylabel(y_label)
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            self._finish_figure(fig, path_save, f"loss_train_trial_{idx}", exts, verbose)

    def plot_loss_train_average(self, path_read="history", path_save="history",
                                fig_size=(7, 5), exts=(".png", ".pdf"), verbose=False):
        """
        Plot the training curve of each optimizer averaged over its trials.

        Reads ``<optimizer>-loss_train.csv`` for every name in
        ``optimizer_names`` and saves ``loss_train_average<ext>``.

        Args:
            path_read (str): Directory holding the loss_train CSV files.
            path_save (str): Directory where the figure is saved (created if missing).
            fig_size (tuple): Figure size; anything but a tuple/list falls back to (7, 5).
            exts (tuple): File extensions to save the figure with.
            verbose (bool): If True, also display the figure with ``plt.show()``.
        """
        Path(path_save).mkdir(parents=True, exist_ok=True)
        y_label = "Average Accuracy" if self.problem_type == "classification" else "Average Loss"
        fig_size = self._resolve_fig_size(fig_size)

        fig = plt.figure(figsize=fig_size)
        for optim_name in self.optimizer_names:
            df = pd.read_csv(f"{path_read}/{optim_name}-loss_train.csv")
            # Row-wise mean: average over the trial columns at each iteration.
            mean_loss = df.mean(axis=1)
            plt.plot(mean_loss, label=optim_name)
        plt.title(f"{y_label} Of Training Set Over Trials")
        plt.xlabel("Iteration")
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        self._finish_figure(fig, path_save, "loss_train_average", exts, verbose)

    def plot_metric_boxplot(self, path_read="history", path_save="history",
                            fig_size=(7, 5), exts=(".png", ".pdf"), verbose=False):
        """
        Plot one boxplot per metric, grouped by optimizer.

        Reads ``metric_results_unfold.csv``; every column other than
        "optimizer", "trial" and "time_seconds" is treated as a metric. Saves
        ``metric_boxplot_<metric><ext>``.

        Args:
            path_read (str): Directory holding ``metric_results_unfold.csv``.
            path_save (str): Directory where the figures are saved (created if missing).
            fig_size (tuple): Figure size; anything but a tuple/list falls back to (7, 5).
            exts (tuple): File extensions to save each figure with.
            verbose (bool): If True, also display each figure with ``plt.show()``.
        """
        Path(path_save).mkdir(parents=True, exist_ok=True)
        df = pd.read_csv(f"{path_read}/metric_results_unfold.csv")
        metrics = df.columns.difference(["optimizer", "trial", "time_seconds"])
        fig_size = self._resolve_fig_size(fig_size)

        for metric in metrics:
            # Draw on our own axes: without ``ax`` a grouped boxplot opens a
            # new figure, which ignores fig_size and leaves an empty one behind.
            fig, ax = plt.subplots(figsize=fig_size)
            df.boxplot(column=metric, by="optimizer", ax=ax)
            ax.set_title(f"Boxplot for {metric}")
            fig.suptitle("")    # drop the automatic "grouped by" super-title
            ax.set_xlabel("Optimizer")
            ax.set_ylabel(metric)
            fig.tight_layout()
            self._finish_figure(fig, path_save, f"metric_boxplot_{metric}", exts, verbose)

    def plot_average_runtime(self, path_read="history", path_save="history",
                             fig_size=(7, 5), exts=(".png", ".pdf"), verbose=False):
        """
        Plot a bar chart of the mean runtime per optimizer.

        Reads ``metric_results_unfold.csv`` and averages its "time_seconds"
        column per optimizer. Saves ``average_runtime_chart<ext>``.

        Args:
            path_read (str): Directory holding ``metric_results_unfold.csv``.
            path_save (str): Directory where the figure is saved (created if missing).
            fig_size (tuple): Figure size; anything but a tuple/list falls back to (7, 5).
            exts (tuple): File extensions to save the figure with.
            verbose (bool): If True, also display the figure with ``plt.show()``.
        """
        Path(path_save).mkdir(parents=True, exist_ok=True)
        df = pd.read_csv(f"{path_read}/metric_results_unfold.csv")
        avg_runtime = df.groupby("optimizer")["time_seconds"].mean()
        fig_size = self._resolve_fig_size(fig_size)

        fig, ax = plt.subplots(figsize=fig_size)
        avg_runtime.plot(kind="bar", ax=ax)
        ax.set_xlabel("Optimizer")
        ax.set_ylabel("Average Runtime (seconds)")
        ax.set_title("Average Runtime Comparison")
        fig.tight_layout()
        self._finish_figure(fig, path_save, "average_runtime_chart", exts, verbose)