Source code for matclassification.methods.core.HSClassifier

# -*- coding: utf-8 -*-
'''
MAT-Tools: Python Framework for Multiple Aspect Trajectory Data Mining

The present package offers a tool to support the user in the task of data analysis of multiple aspect trajectories. It integrates into a unique framework methods for multiple aspect trajectories and, in general, for multidimensional sequence data mining.
Copyright (C) 2022, MIT license (this portion of code is subject to licensing from source project distribution)

Created on Dec, 2021
Copyright (C) 2022, License GPL Version 3 or superior (see LICENSE file)

Authors:
    - Tarlis Portela
'''
import os 
import pandas as pd
from datetime import datetime

from tqdm.auto import tqdm
# --------------------------------------------------------------------------------
from matclassification.methods.core import AbstractClassifier
# -------------------------------------------------------------------------------- 

# Hyperparameter Optimization Classifier - For Trajectory input data
class HSClassifier(AbstractClassifier):
    """
    Hyperparameter Optimization Classifier for Trajectory Input Data.

    This class extends `AbstractClassifier` to include functionality for training and testing
    machine learning models with hyperparameter optimization. It is designed for trajectory
    data inputs and handles multiple configurations to find the best-performing model.

    Check: help(AbstractClassifier)

    Parameters:
    -----------
    name : str, optional
        Classifier name (default: 'NN').
    save_results : bool, optional
        Flag to enable saving results to disk (default: False).
    n_jobs : int, optional
        Number of parallel jobs to run (default: -1 for using all processors).
    verbose : bool, optional
        Flag for verbosity (default: False).
    random_state : int, optional
        Random seed for reproducibility (default: 42).
    filterwarnings : str, optional
        Warning filter level (default: 'ignore').

    Methods:
    --------
    train(dir_validation='.'):
        Trains the model using a single hyperparameter configuration. If validation is
        enabled, it evaluates on a validation set; otherwise, it evaluates on the test set.
        Results are optionally saved to a CSV file.

        Parameters:
        -----------
        dir_validation : str, optional
            Directory where validation results will be saved (default: current directory).

        Returns:
        --------
        pd.DataFrame
            A DataFrame containing the training report with evaluation metrics for the model.

    test(rounds=1, dir_evaluation='.'):
        Tests the model over a specified number of rounds, each with a different random seed,
        to simulate multiple model evaluations.

        Parameters:
        -----------
        rounds : int, optional
            The number of evaluation rounds (default: 1).
        dir_evaluation : str, optional
            Directory where evaluation results will be saved (default: current directory).

        Returns:
        --------
        pd.DataFrame, np.array
            A DataFrame containing the evaluation report and the predicted labels for the
            test data.
    """

    def __init__(self,
                 name='NN',
                 save_results=False,
                 n_jobs=-1,
                 verbose=False,
                 random_state=42,
                 filterwarnings='ignore'):

        super().__init__(name=name, n_jobs=n_jobs, verbose=verbose,
                         random_state=random_state, filterwarnings=filterwarnings)

        self.save_results = save_results

    ## Override the train and test methods to enable hyperparameter optimization
    ## (in this case, it trains only one default configuration).
    def train(self, dir_validation='.'):
        """
        Trains the model using the hyperparameter configurations (this base implementation
        trains a single default configuration). If validation is enabled, it evaluates on a
        validation set; otherwise, it evaluates on the test set. Results are optionally saved
        to a CSV file.

        Parameters:
        -----------
        dir_validation : str, optional
            Directory where validation results will be saved (default: current directory).

        Returns:
        --------
        pd.DataFrame
            A DataFrame containing the training report with evaluation metrics for the model.
        """
        # This implementation trains only one model
        # (but you may overwrite this method following the same structure).

        self.start_time = datetime.now()

        X_train = self.X_train
        y_train = self.y_train

        if self.validate:
            X_val = self.X_val
            y_val = self.y_val
        else:
            X_val = self.X_test
            y_val = self.y_test

        if self.isverbose:
            print('['+self.name+':] Training hyperparameter model')

        data = []

        # TODO: Hyperparameter config training...
        ## This part you may want to run for each configuration (as a progress bar):
        #for config in pbar:
        filename = os.path.join(dir_validation, 'val_'+self.name.lower()+'.csv')

        if os.path.exists(filename):
            print('Skip ---> {}'.format(filename))
        else:
            self.model = self.create()  # pass the config dict()
            self.fit(X_train, y_train, X_val, y_val)  #, config)

            validation_report, y_pred = self.predict(X_val, y_val)
            validation_report['clstime'] = self.duration()

            if self.save_results:
                validation_report.to_csv(filename, index=False)

            data.append(validation_report)
            # self.model.free()

        # Guard against an empty list when the validation file already exists (skipped run).
        self.report = pd.concat(data) if data else pd.DataFrame()
        self.report.reset_index(drop=True, inplace=True)

        # Use sorting if each train is a different model for hyperparameter search,
        # and you are looking for the best model accuracy:
        #self.report.sort_values('acc', ascending=False, inplace=True)

        return self.report
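    # Sketch (illustrative only): a subclass performing an actual hyperparameter search
    # could override train() and wrap the create()/fit()/predict() calls above in a loop
    # over a list of configuration dicts (the name 'configs' below is hypothetical, and it
    # is assumed here that create() accepts a config dict, as the comment above suggests):
    #
    #     for config in tqdm(configs, desc=self.name + ' Training'):
    #         self.model = self.create(config)
    #         self.fit(X_train, y_train, X_val, y_val)
    #         validation_report, _ = self.predict(X_val, y_val)
    #         data.append(validation_report)
    #
    # The resulting self.report could then be sorted by accuracy (see the commented
    # sort_values call) to select the best configuration.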
    def test(self,
             rounds=1,
             dir_evaluation='.'):
        """
        Tests the single best model trained, over a specified number of rounds, each with a
        different random seed, to simulate multiple model evaluations.

        Parameters:
        -----------
        rounds : int, optional
            The number of evaluation rounds (default: 1).
        dir_evaluation : str, optional
            Directory where evaluation results will be saved (default: current directory).

        Returns:
        --------
        pd.DataFrame, np.array
            A DataFrame containing the evaluation report and the predicted labels for the
            test data.
        """
        X_train = self.X_train
        y_train = self.y_train

        if self.validate:
            X_val = self.X_val
            y_val = self.y_val
        else:
            X_val = self.X_test
            y_val = self.y_test

        X_test = self.X_test
        y_test = self.y_test

        filename = os.path.join(dir_evaluation, 'eval_'+self.name.lower()+'.csv')

        if os.path.exists(filename):
            if self.isverbose:
                print('['+self.name+':] Model previously built.')
            # TODO read
            #return self.read_report(filename, prefix='eval_')
        else:
            if self.isverbose:
                print('['+self.name+':] Creating a model for the test set')

        if self.isverbose:
            pbar = tqdm(range(rounds), desc="Model Testing")
        else:
            pbar = list(range(rounds))

        random_state = self.config['random_state']

        evaluate_report = []
        for e in pbar:
            re = (random_state + e)
            self.config['random_state'] = re

            self.message(pbar, 'Round {} of {} (random {})'.format(e, rounds, re))

            self.model = self.create()
            self.fit(X_train, y_train, X_val, y_val)

            eval_report, y_pred = self.predict(X_test, y_test)
            eval_report['clstime'] = self.duration()

            evaluate_report.append(eval_report)

        self.config['random_state'] = random_state

        self.test_report = pd.concat(evaluate_report)
        self.test_report.reset_index(drop=True, inplace=True)

        if self.isverbose:
            print('['+self.name+':] Processing time: {} milliseconds. Done.'.format(self.duration()))

        return self.test_report, y_pred
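# --------------------------------------------------------------------------------
# Usage sketch (hypothetical, for illustration): a concrete subclass is typically
# expected to provide create(); the subclass name, the scikit-learn estimator, and
# the assumption that the inherited fit()/predict() wrappers accept such an estimator
# are assumptions, not part of this module. The data attributes (X_train, y_train,
# X_val, y_val, X_test, y_test) and the 'validate' flag are assumed to be populated
# by the surrounding AbstractClassifier pipeline before train() is called.
#
#     from sklearn.ensemble import RandomForestClassifier
#
#     class MyTrajectoryClassifier(HSClassifier):          # hypothetical subclass
#         def create(self):
#             # Build a fresh estimator, honoring the configured random seed.
#             return RandomForestClassifier(random_state=self.config['random_state'])
#
#     model = MyTrajectoryClassifier(name='RF', save_results=True)
#     # ... assign the trajectory data splits to the model here ...
#     report = model.train(dir_validation='results')
#     test_report, y_pred = model.test(rounds=3, dir_evaluation='results')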