Source code for matclassification.methods.mat.Bituler

# -*- coding: utf-8 -*-
'''
MAT-Tools: Python Framework for Multiple Aspect Trajectory Data Mining

The present package offers a tool to support the user in the task of data analysis of multiple aspect trajectories. It integrates, into a unique framework, methods for multiple aspect trajectories and, more generally, for multidimensional sequence data mining.
Copyright (C) 2022, MIT license (this portion of code is subject to licensing from source project distribution)

Created on Dec, 2021
Copyright (C) 2022, License GPL Version 3 or later (this portion of code is subject to licensing from source project distribution)

Authors:
    - Tarlis Portela
    - Original source:
        - Nicksson C. A. de Freitas, 
        - Ticiana L. Coelho da Silva, 
        - Jose António Fernandes de Macêdo, 
        - Leopoldo Melo Junior, 
        - Matheus Gomes Cordeiro
    - Adapted from: https://github.com/nickssonfreitas/ICAART2021
'''
# --------------------------------------------------------------------------------
import time
import pandas as pd
import numpy as np
from numpy import argmax

from tqdm.auto import tqdm

import itertools
# --------------------------------------------------------------------------------
from tensorflow.keras.layers import Dense, LSTM, GRU, Bidirectional, Concatenate, Add, Average, Embedding, Dropout, Input
from tensorflow.keras.initializers import he_normal, he_uniform
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.regularizers import l1
# --------------------------------------------------------------------------------
from matclassification.methods._lib.datahandler import prepareTrajectories

from matclassification.methods.core import THSClassifier

class Bituler(THSClassifier):
    """
    Gao et al. (2017) proposed BiTULER, a model that uses word embeddings and a
    Bidirectional Recurrent Neural Network, but it is limited to the sequence of
    check-in identifiers, not supporting other dimensions.

    Bituler is a trajectory classification model that extends the THSClassifier.
    It is designed for handling multiple aspect trajectory data, utilizing deep
    learning techniques such as Recurrent Neural Networks (RNNs) for feature
    extraction and classification. The model can be configured with various
    neural network architectures, embedding sizes, and optimization settings.

    Parameters
    ----------
    rnn : list, optional
        List of RNN architectures to use. Currently, only 'bilstm' is supported (default: ['bilstm']).
    units : list, optional
        List of integers specifying the number of hidden units in each layer (default: [100, 200, 250, 300]).
    stack : list, optional
        List of integers specifying the number of RNN layers to stack (default: [1]).
    dropout : list, optional
        List of floats specifying the dropout rate for regularization (default: [0.5]).
    embedding_size : list, optional
        List of integers specifying the size of the embedding layer (default: [100, 200, 300, 400]).
    batch_size : list, optional
        List of batch sizes for training (default: [64]).
    epochs : list, optional
        List of the number of epochs for training (default: [1000]).
    patience : list, optional
        List of integers specifying the number of epochs to wait for early stopping (default: [20], currently unused).
    monitor : list, optional
        List of metrics to monitor for early stopping (default: ['val_acc']).
    optimizer : list, optional
        List of optimizers to use during training (default: ['ada']).
    learning_rate : list, optional
        List of learning rates for the optimizer (default: [0.001]).
    save_results : bool, optional
        If True, saves the results of the training process (default: False).
    n_jobs : int, optional
        Number of parallel jobs to run (default: -1).
    verbose : int, optional
        Verbosity level of the training process (default: 0).
    random_state : int, optional
        Seed for random number generation (default: 42).
    filterwarnings : str, optional
        Configures warning filtering (default: 'ignore').

    Methods
    -------
    xy(train, test, tid_col='tid', class_col='label', space_geohash=False, geo_precision=30, features=['poi'], validate=False)
        Prepares the trajectory data for model training and testing.
    prepare_input(train, test, tid_col='tid', class_col='label', space_geohash=False, geo_precision=30, features=['poi'], validate=False)
        Prepares the input data by configuring the model parameters and splitting the data into training and testing sets.
    create(config)
        Creates and returns the RNN model architecture based on the provided configuration.
    fit(X_train, y_train, X_val, y_val, config=None)
        Trains the model on the provided training data and evaluates it on the validation data.
    clear()
        Clears the model session to free memory.
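
    Examples
    --------
    A minimal, illustrative sketch of the intended workflow (assumes
    ``train_df`` and ``test_df`` are pandas DataFrames containing ``tid``,
    ``label`` and ``poi`` columns; the explicit ``cfg`` tuple below follows
    the hyper-parameter order passed to ``grid_search`` and is a hypothetical
    choice, not a tuned setting):

    >>> model = Bituler(units=[100], embedding_size=[100], epochs=[100])
    >>> model.prepare_input(train_df, test_df, features=['poi'])
    >>> cfg = ('bilstm', 100, 1, 0.5, 100, 64, 100, 20, 'val_acc', 0.001)
    >>> history = model.fit(model.X_train, model.y_train,
    ...                     model.X_test, model.y_test, config=cfg)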
""" def __init__(self, # num_classes = -1, # max_lenght = -1, # vocab_size = -1, rnn= ['bilstm'], #Unused units = [100, 200, 250, 300], stack = [1], dropout =[0.5], embedding_size = [100, 200, 300, 400], batch_size = [64], epochs = [1000], patience = [20], #Unused monitor = ['val_acc'], optimizer = ['ada'], learning_rate = [0.001], save_results=False, n_jobs=-1, verbose=0, random_state=42, filterwarnings='ignore'): super().__init__('Bituler', save_results=save_results, n_jobs=n_jobs, verbose=verbose, random_state=random_state, filterwarnings=filterwarnings) self.add_config(rnn=rnn, units=units, stack=stack, dropout=dropout, embedding_size=embedding_size, batch_size=batch_size, epochs=epochs, patience=patience, monitor=monitor, optimizer=optimizer, learning_rate=learning_rate) # Moved to prepare_input: # self.grid = list(itertools.product()) self.model = None
    def xy(self,
           train, test,
           tid_col='tid',
           class_col='label',
           space_geohash=False,  # True: Geohash, False: indexgrid
           geo_precision=30,     # Geohash: precision OR IndexGrid: meters
           features=['poi'],
           validate=False):

        # RETURN: X, y, features, num_classes, space, dic_parameters
        return prepareTrajectories(train.copy(), test.copy(),
                                   tid_col=tid_col,
                                   class_col=class_col,
                                   # space_geohash, True: Geohash, False: indexgrid
                                   space_geohash=space_geohash,
                                   # Geohash: precision OR IndexGrid: meters
                                   geo_precision=geo_precision,
                                   features=features,
                                   features_encoding=True,
                                   y_one_hot_encodding=False,
                                   split_test_validation=validate,
                                   data_preparation=2,
                                   verbose=self.isverbose)
    def prepare_input(self,
                      train, test,
                      tid_col='tid',
                      class_col='label',
                      space_geohash=False,  # True: Geohash, False: indexgrid
                      geo_precision=30,     # Geohash: precision OR IndexGrid: meters
                      features=['poi'],
                      validate=False):

        ## Rewriting the method to change default params
        X, y, features, num_classes, space, dic_parameters = self.xy(train, test, tid_col, class_col,
                                                                     space_geohash, geo_precision,
                                                                     features, validate)

        self.add_config(space=space,
                        features=features,
                        num_classes=num_classes,
                        dic_parameters=dic_parameters)

        if 'encode_y' in dic_parameters.keys():
            self.le = dic_parameters['encode_y']

        if len(X) == 2:
            self.X_train = X[0]
            self.X_test = X[1]
            self.y_train = y[0]
            self.y_test = y[1]
            self.validate = False
        if len(X) > 2:
            self.X_train = X[0]
            self.X_val = X[1]
            self.X_test = X[2]
            self.y_train = y[0]
            self.y_val = y[1]
            self.y_test = y[2]
            self.validate = True

        num_classes = self.config['num_classes'] = dic_parameters['num_classes']
        max_lenght = self.config['max_lenght'] = dic_parameters['max_lenght']
        vocab_size = self.config['vocab_size'] = dic_parameters['vocab_size'][features[0]]  # ['poi']

        rnn = self.config['rnn']
        units = self.config['units']
        stack = self.config['stack']
        dropout = self.config['dropout']
        embedding_size = self.config['embedding_size']
        batch_size = self.config['batch_size']
        epochs = self.config['epochs']
        patience = self.config['patience']
        monitor = self.config['monitor']
        optimizer = self.config['optimizer']
        learning_rate = self.config['learning_rate']

        self.grid_search(rnn, units, stack, dropout, embedding_size,
                         batch_size, epochs, patience, monitor, learning_rate)
        # self.grid = list(itertools.product(rnn, units, stack, dropout, embedding_size,
        #                                    batch_size, epochs, patience, monitor, learning_rate))

        return X, y, features, num_classes, space, dic_parameters
    def create(self, config):

        max_lenght = self.config['max_lenght']
        num_classes = self.config['num_classes']
        vocab_size = self.config['vocab_size']

        rnn_units = config[1]
        stack = config[2]
        dropout = config[3]
        embedding_size = config[4]

        # Initializing Neural Network
        input_model = Input(shape=(max_lenght,), name='spatial_poi')
        embedding_layer = Embedding(input_dim=vocab_size,
                                    output_dim=embedding_size,
                                    name='embedding_poi',
                                    input_length=max_lenght)(input_model)

        rnn_cell = Bidirectional(LSTM(units=rnn_units))(embedding_layer)
        hidden_dropout = Dropout(dropout)(rnn_cell)
        output_model = Dense(num_classes, activation='softmax')(hidden_dropout)

        return Model(inputs=input_model, outputs=output_model)
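
    # Architecture produced by create() (shapes assume sequences padded to
    # max_lenght time steps over a single feature, e.g. 'poi'):
    #   Input (max_lenght,) -> Embedding (max_lenght, embedding_size)
    #   -> Bidirectional LSTM (output size 2 * rnn_units, concat merge)
    #   -> Dropout -> Dense softmax over num_classes.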
    def fit(self,
            X_train,
            y_train,
            X_val,
            y_val,
            config=None):

        if not config:
            config = self.best_config
        if not self.model:
            self.model = self.create(config)

        batch_size = config[5]
        epochs = config[6]
        learning_rate = config[9]

        ## setting parameters
        optimizer = Adam(learning_rate=learning_rate)
        loss = ['sparse_categorical_crossentropy']
        metric = ['acc']
        monitor = 'val_acc'

        self.model.compile(optimizer=optimizer, loss=loss, metrics=metric)

        early_stop = EarlyStopping(monitor='val_acc',
                                   min_delta=0,
                                   patience=50,
                                   verbose=0,  # without print
                                   mode='auto',
                                   restore_best_weights=True)

        my_callbacks = [early_stop]

        return self.model.fit(X_train, y_train,
                              epochs=epochs,
                              callbacks=my_callbacks,
                              validation_data=(X_val, y_val),
                              verbose=1,
                              shuffle=True,
                              use_multiprocessing=True,
                              batch_size=batch_size)
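
    # Note on the hyper-parameter tuple consumed by create() and fit():
    # grid_search() (inherited from THSClassifier) is expected to combine the
    # configured lists in the order
    # (rnn, units, stack, dropout, embedding_size, batch_size, epochs,
    #  patience, monitor, learning_rate), e.g. a single candidate such as
    # ('bilstm', 100, 1, 0.5, 100, 64, 1000, 20, 'val_acc', 0.001).
    # create() reads indices 1-4 (units, stack, dropout, embedding_size) and
    # fit() reads indices 5, 6 and 9 (batch_size, epochs, learning_rate).
    # The example values above are illustrative defaults, not tuned settings.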
    def clear(self):
        super().clear()
        K.clear_session()