Source code for matclassification.methods.mat.Bituler

# -*- coding: utf-8 -*-
'''
MAT-Tools: Python Framework for Multiple Aspect Trajectory Data Mining

The present package offers a tool to support the user in the task of data analysis of multiple aspect trajectories. It integrates, into a unique framework, methods for multiple aspect trajectories and, more generally, for multidimensional sequence data mining.
Copyright (C) 2022, MIT license (this portion of code is subject to licensing from source project distribution)

Created on Dec, 2021
Copyright (C) 2022, License GPL Version 3 or later (this portion of code is subject to licensing from source project distribution)

Authors:
    - Tarlis Portela
    - Original source:
        - Nicksson C. A. de Freitas, 
        - Ticiana L. Coelho da Silva, 
        - Jose António Fernandes de Macêdo, 
        - Leopoldo Melo Junior, 
        - Matheus Gomes Cordeiro
    - Adapted from: https://github.com/nickssonfreitas/ICAART2021
'''
# --------------------------------------------------------------------------------
import time
import pandas as pd
import numpy as np
from numpy import argmax

from tqdm.auto import tqdm

import itertools
# --------------------------------------------------------------------------------
from tensorflow.keras.layers import Dense, LSTM, GRU, Bidirectional, Concatenate, Add, Average, Embedding, Dropout, Input
from tensorflow.keras.initializers import he_normal, he_uniform
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.regularizers import l1
# --------------------------------------------------------------------------------
from matclassification.methods._lib.datahandler import prepareTrajectories

from matclassification.methods.core import THSClassifier

class Bituler(THSClassifier):
    """
    Gao et al. (2017) proposed BiTULER, a model that uses word embeddings and a
    Bidirectional Recurrent Neural Network, but it is limited to the sequence of
    check-in identifiers, not supporting other dimensions.

    Bituler is a trajectory classification model that extends the THSClassifier.
    It is designed for handling multiple aspect trajectory data, utilizing deep
    learning techniques such as Recurrent Neural Networks (RNNs) for feature
    extraction and classification. The model can be configured with various
    neural network architectures, embedding sizes, and optimization settings.

    Parameters
    ----------
    rnn : list, optional
        List of RNN architectures to use. Currently, only 'bilstm' is supported (default: ['bilstm']).
    units : list, optional
        List of integers specifying the number of hidden units in each layer (default: [100, 200, 250, 300]).
    stack : list, optional
        List of integers specifying the number of RNN layers to stack (default: [1]).
    dropout : list, optional
        List of floats specifying the dropout rate for regularization (default: [0.5]).
    embedding_size : list, optional
        List of integers specifying the size of the embedding layer (default: [100, 200, 300, 400]).
    batch_size : list, optional
        List of batch sizes for training (default: [64]).
    epochs : list, optional
        List of the number of epochs for training (default: [1000]).
    patience : list, optional
        List of integers specifying the number of epochs to wait for early stopping (default: [20], currently unused).
    monitor : list, optional
        List of metrics to monitor for early stopping (default: ['val_acc']).
    optimizer : list, optional
        List of optimizers to use during training (default: ['ada']).
    learning_rate : list, optional
        List of learning rates for the optimizer (default: [0.001]).
    save_results : bool, optional
        If True, saves the results of the training process (default: False).
    n_jobs : int, optional
        Number of parallel jobs to run (default: -1).
    verbose : int, optional
        Verbosity level of the training process (default: 0).
    random_state : int, optional
        Seed for random number generation (default: 42).
    filterwarnings : str, optional
        Configures warning filtering (default: 'ignore').

    Methods
    -------
    xy(train, test, tid_col='tid', class_col='label', space_geohash=False, geo_precision=30, features=['poi'], validate=False)
        Prepares the trajectory data for model training and testing.
    prepare_input(train, test, tid_col='tid', class_col='label', space_geohash=False, geo_precision=30, features=['poi'], validate=False)
        Prepares the input data by configuring the model parameters and splitting the data into training and testing sets.
    create(config)
        Creates and returns the RNN model architecture based on the provided configuration.
    fit(X_train, y_train, X_val, y_val, config=None)
        Trains the model on the provided training data and evaluates it on the validation data.
    clear()
        Clears the model session to free memory.
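
    Examples
    --------
    A minimal, illustrative sketch of the intended workflow (assumes
    ``train_df`` and ``test_df`` are pandas DataFrames containing ``tid``,
    ``label`` and ``poi`` columns; the explicit ``cfg`` tuple below follows
    the hyper-parameter order passed to ``grid_search`` and is a hypothetical
    choice, not a tuned setting):

    >>> model = Bituler(units=[100], embedding_size=[100], epochs=[100])
    >>> model.prepare_input(train_df, test_df, features=['poi'])
    >>> cfg = ('bilstm', 100, 1, 0.5, 100, 64, 100, 20, 'val_acc', 0.001)
    >>> history = model.fit(model.X_train, model.y_train,
    ...                     model.X_test, model.y_test, config=cfg)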
""" def __init__(self, # num_classes = -1, # max_lenght = -1, # vocab_size = -1, rnn= ['bilstm'], #Unused units = [100, 200, 250, 300], stack = [1], dropout =[0.5], embedding_size = [100, 200, 300, 400], batch_size = [64], epochs = [1000], patience = [20], #Unused monitor = ['val_acc'], optimizer = ['ada'], learning_rate = [0.001], save_results=False, n_jobs=-1, verbose=0, random_state=42, filterwarnings='ignore'): super().__init__('Bituler', save_results=save_results, n_jobs=n_jobs, verbose=verbose, random_state=random_state, filterwarnings=filterwarnings) self.add_config(rnn=rnn, units=units, stack=stack, dropout=dropout, embedding_size=embedding_size, batch_size=batch_size, epochs=epochs, patience=patience, monitor=monitor, optimizer=optimizer, learning_rate=learning_rate) # Moved to prepare_input: # self.grid = list(itertools.product()) self.model = None
    def xy(self,
           train, test,
           tid_col='tid',
           class_col='label',
           space_geohash=False,  # True: Geohash, False: indexgrid
           geo_precision=30,     # Geohash: precision OR IndexGrid: meters
           features=['poi'],
           validate=False):

        # RETURN: X, y, features, num_classes, space, dic_parameters
        return prepareTrajectories(train.copy(), test.copy(),
                                   tid_col=tid_col,
                                   class_col=class_col,
                                   # space_geohash, True: Geohash, False: indexgrid
                                   space_geohash=space_geohash,
                                   # Geohash: precision OR IndexGrid: meters
                                   geo_precision=geo_precision,
                                   features=features,
                                   features_encoding=True,
                                   y_one_hot_encodding=False,
                                   split_test_validation=validate,
                                   data_preparation=2,
                                   verbose=self.isverbose)
    def prepare_input(self,
                      train, test,
                      tid_col='tid',
                      class_col='label',
                      space_geohash=False,  # True: Geohash, False: indexgrid
                      geo_precision=30,     # Geohash: precision OR IndexGrid: meters
                      features=['poi'],
                      validate=False):

        ## Rewriting the method to change default params
        X, y, features, num_classes, space, dic_parameters = self.xy(train, test, tid_col, class_col,
                                                                     space_geohash, geo_precision,
                                                                     features, validate)

        self.add_config(space=space,
                        features=features,
                        num_classes=num_classes,
                        dic_parameters=dic_parameters)

        if 'encode_y' in dic_parameters.keys():
            self.le = dic_parameters['encode_y']

        if len(X) == 2:
            self.X_train = X[0]
            self.X_test = X[1]
            self.y_train = y[0]
            self.y_test = y[1]
            self.validate = False
        if len(X) > 2:
            self.X_train = X[0]
            self.X_val = X[1]
            self.X_test = X[2]
            self.y_train = y[0]
            self.y_val = y[1]
            self.y_test = y[2]
            self.validate = True

        num_classes = self.config['num_classes'] = dic_parameters['num_classes']
        max_lenght = self.config['max_lenght'] = dic_parameters['max_lenght']
        vocab_size = self.config['vocab_size'] = dic_parameters['vocab_size'][features[0]]  # ['poi']

        rnn = self.config['rnn']
        units = self.config['units']
        stack = self.config['stack']
        dropout = self.config['dropout']
        embedding_size = self.config['embedding_size']
        batch_size = self.config['batch_size']
        epochs = self.config['epochs']
        patience = self.config['patience']
        monitor = self.config['monitor']
        optimizer = self.config['optimizer']
        learning_rate = self.config['learning_rate']

        self.grid_search(rnn, units, stack, dropout, embedding_size,
                         batch_size, epochs, patience, monitor, learning_rate)
        # self.grid = list(itertools.product(rnn, units, stack, dropout, embedding_size,
        #                                    batch_size, epochs, patience, monitor, learning_rate))

        return X, y, features, num_classes, space, dic_parameters
    def create(self, config):

        max_lenght = self.config['max_lenght']
        num_classes = self.config['num_classes']
        vocab_size = self.config['vocab_size']

        rnn_units = config[1]
        stack = config[2]
        dropout = config[3]
        embedding_size = config[4]

        # Initializing Neural Network
        input_model = Input(shape=(max_lenght,), name='spatial_poi')
        embedding_layer = Embedding(input_dim=vocab_size,
                                    output_dim=embedding_size,
                                    name='embedding_poi',
                                    input_length=max_lenght)(input_model)

        rnn_cell = Bidirectional(LSTM(units=rnn_units))(embedding_layer)
        hidden_dropout = Dropout(dropout)(rnn_cell)
        output_model = Dense(num_classes, activation='softmax')(hidden_dropout)

        return Model(inputs=input_model, outputs=output_model)
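
    # Architecture produced by create() (shapes assume sequences padded to
    # max_lenght time steps over a single feature, e.g. 'poi'):
    #   Input (max_lenght,) -> Embedding (max_lenght, embedding_size)
    #   -> Bidirectional LSTM (output size 2 * rnn_units, concat merge)
    #   -> Dropout -> Dense softmax over num_classes.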
    def fit(self,
            X_train,
            y_train,
            X_val,
            y_val,
            config=None):

        if not config:
            config = self.best_config
        if not self.model:
            self.model = self.create(config)

        batch_size = config[5]
        epochs = config[6]
        learning_rate = config[9]

        ## setting parameters
        optimizer = Adam(learning_rate=learning_rate)
        loss = ['sparse_categorical_crossentropy']
        metric = ['acc']
        monitor = 'val_acc'

        self.model.compile(optimizer=optimizer, loss=loss, metrics=metric)

        early_stop = EarlyStopping(monitor='val_acc',
                                   min_delta=0,
                                   patience=50,
                                   verbose=0,  # without print
                                   mode='auto',
                                   restore_best_weights=True)

        my_callbacks = [early_stop]

        return self.model.fit(X_train, y_train,
                              epochs=epochs,
                              callbacks=my_callbacks,
                              validation_data=(X_val, y_val),
                              verbose=1,
                              shuffle=True,
                              use_multiprocessing=True,
                              batch_size=batch_size)
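
    # Note on the hyper-parameter tuple consumed by create() and fit():
    # grid_search() (inherited from THSClassifier) is expected to combine the
    # configured lists in the order
    # (rnn, units, stack, dropout, embedding_size, batch_size, epochs,
    #  patience, monitor, learning_rate), e.g. a single candidate such as
    # ('bilstm', 100, 1, 0.5, 100, 64, 1000, 20, 'val_acc', 0.001).
    # create() reads indices 1-4 (units, stack, dropout, embedding_size) and
    # fit() reads indices 5, 6 and 9 (batch_size, epochs, learning_rate).
    # The example values above are illustrative defaults, not tuned settings.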
    def clear(self):
        super().clear()
        K.clear_session()