Source code for matclassification.methods.core.MClassifier

# -*- coding: utf-8 -*-
'''
MAT-Tools: Python Framework for Multiple Aspect Trajectory Data Mining

This package offers a tool to support the user in the task of data analysis of multiple aspect trajectories. It integrates, into a single framework, data mining methods for multiple aspect trajectories and, more generally, for multidimensional sequence data.
Copyright (C) 2022, MIT license (this portion of code is subject to licensing from source project distribution)

Created on Dec, 2021
Copyright (C) 2022, License GPL Version 3 or superior (see LICENSE file)

Authors:
    - Tarlis Portela
'''
# --------------------------------------------------------------------------------
from matclassification.methods.core import *
# --------------------------------------------------------------------------------  

# Generic Movelet Classifier

[docs]
class MClassifier(AbstractClassifier):
    """
    Generic Classifier for Movelet Input.

    This class extends `AbstractClassifier` to handle trajectory data in the form of movelets. 
    
    Check: help(AbstractClassifier)
    
    Parameters:
    -----------
    name : str, optional
        Name of the classifier model (default: 'NN').
    n_jobs : int, optional
        Number of parallel jobs to run (default: -1 for using all processors).
    verbose : int, optional
        Level of verbosity (default: 0 for no output).
    random_state : int, optional
        Random seed for reproducibility (default: 42).
    filterwarnings : str, optional
        Warning filter level (default: 'ignore').

    """
    
    def __init__(self, 
                 name='NN',
                 n_jobs=-1,
                 verbose=0,
                 random_state=42,
                 filterwarnings='ignore'):
        # All configuration is forwarded unchanged to the parent classifier.
        super().__init__(name=name, n_jobs=n_jobs, verbose=verbose, random_state=random_state, filterwarnings=filterwarnings)

    def xy(self,
           train, test,
           
           tid_col='tid', 
           class_col='label',
           validate = False,
           encode_labels=True):
        """
        Prepares the feature and label data for the classifier by (optionally) splitting 
        the training set, encoding labels, and scaling the features.

        The input DataFrames are not modified: the trajectory-id column is dropped on 
        copies and the feature matrices are extracted as independent arrays.

        Features and label are taken by position: every column but the last one (after 
        removing `tid_col`) is read as a feature, and the last column is read as the label.

        Parameters:
        -----------
        train : pd.DataFrame
            The training dataset.
        test : pd.DataFrame
            The test dataset.
        tid_col : str, optional
            Column name representing the trajectory ID (default: 'tid').
        class_col : str, optional
            Column name representing the class label (default: 'label').
        validate : bool, optional
            If True, splits the training data into training and validation sets (default: False) >> #TODO Under Dev.
        encode_labels : bool, optional
            If True, encodes the labels using `LabelEncoder` and one-hot encoding (default: True).

        Returns:
        --------
        num_classes : int
            Number of unique class labels found in `train`.
        num_features : int
            Number of features in the dataset excluding the class label.
        le : LabelEncoder or None
            LabelEncoder instance used to transform the class labels, if `encode_labels` is True.
        X_set : list
            List containing the feature matrices: (training, test), or 
            (training, validation, test) when `validate` is True.
        y_set : list
            List containing the labels, in the same order as `X_set`. One-hot encoded 
            when `encode_labels` is True, raw label values otherwise.

        Raises:
        -------
        AssertionError
            If `train` and `test` do not have the same set of column names.
        """
        
        assert (len( set(test.columns).symmetric_difference(set(train.columns)) ) == 0), '['+self.name+':] ERROR. Divergence in train and test columns: ' + str(len(train.columns)) + ' train and ' + str(len(test.columns)) + ' test'
        
        if validate:
            df_train = train.copy()
            if tid_col not in df_train.columns:
                # The split is done per trajectory id; fall back to the row index.
                df_train[tid_col] = df_train.index
            
            df_train, df_val = trainTestSplit(df_train, train_size=0.75, 
                                                 tid_col=tid_col, class_col=class_col, 
                                                 random_num=self.config['random_state'], outformats=[])
            
            data = [df_train, df_val, test]
        else:
            data = [train, test]
        
        # Drop the trajectory id on new frames, so the caller's DataFrames keep their columns.
        data = [df.drop(columns=[tid_col], errors="ignore") for df in data]
        
        num_classes = len(train[class_col].unique())
        # Column count instead of inspecting a row, so single-row datasets also work.
        num_features = data[0].shape[1] - 1 # Minus label

        # Scaling y and transforming to keras format
        le = None
        one_hot_size = None
        if encode_labels:
            le = LabelEncoder()
            le.fit(train[class_col])
            # Fixed one-hot width: a split that lacks the highest encoded class must 
            # still produce as many columns as the training labels do.
            one_hot_size = len(le.classes_)
        
        # For Scaling data (fitted once, on the first dataset, i.e. the training split)
        min_max_scaler = None
            
        X_set = []
        y_set = []

        for dataset in data:
            # Separating attribute data (X) than class attribute (y).
            # An explicit copy keeps the in-place replacements below from writing into 
            # (or failing on a read-only view of) the source DataFrame.
            X = dataset.iloc[:, 0:num_features].to_numpy(copy=True)
            y = dataset.iloc[:, num_features].to_numpy()
            
            # Replace distance 0 for presence 1
            # and distance 2 to non presence 0
            X[X == 0] = 1
            X[X == 2] = 0
            
            # Scaling data
            if min_max_scaler is None:
                min_max_scaler = MinMaxScaler()
                min_max_scaler.fit(X)
                
            X = min_max_scaler.transform(X)
            
            if encode_labels:
                y = le.transform(y)
                y = to_categorical(y, num_classes=one_hot_size)
            
            X_set.append(X)
            y_set.append(y)
        
        return num_classes, num_features, le, X_set, y_set

    def prepare_input(self,
                      train, test,
                      
                      tid_col='tid', 
                      class_col='label',
                      validate = False):
        """
        Prepares the input datasets (training, validation, and test) for the classifier by invoking 
        the `xy()` method, storing the processed data, and setting the classifier configuration.

        After this call the instance holds `X_train`, `y_train`, `X_test`, `y_test`, 
        the label encoder in `le`, the flag `validate`, and (only when a validation 
        split was produced) `X_val` and `y_val`.

        Parameters:
        -----------
        train : pd.DataFrame
            The training dataset.
        test : pd.DataFrame
            The test dataset.
        tid_col : str, optional
            Column name representing the trajectory ID (default: 'tid').
        class_col : str, optional
            Column name representing the class label (default: 'label').
        validate : bool, optional
            If True, splits the training data into training and validation sets (default: False)>> #TODO Under Dev.

        Returns:
        --------
        X_set : list
            List containing the feature matrices: (training, test), or 
            (training, validation, test) when a validation split was produced.
        y_set : list
            List containing the labels, in the same order as `X_set`.
        num_features : int
            The number of features in the dataset, excluding the class label.
        num_classes : int
            The number of unique classes in the dataset.
        """
        
        num_classes, num_features, le, X_set, y_set = self.xy(train, test, tid_col, class_col, validate)
        
        self.add_config(num_classes=num_classes,
                        num_features=num_features)
        self.le = le
        
        # Training data is always first and test data always last; a validation 
        # split, when present, sits in between.
        self.X_train = X_set[0]
        self.y_train = y_set[0]
        self.X_test = X_set[-1]
        self.y_test = y_set[-1]
        
        if len(X_set) > 2:
            self.X_val = X_set[1]
            self.y_val = y_set[1]
            self.validate = True
        else:
            self.validate = False
            
        return X_set, y_set, num_features, num_classes