Source code for matclassification.methods.ensemble.TEC

# -*- coding: utf-8 -*-
'''
MAT-analysis: Analysis and Classification methods for Multiple Aspect Trajectory Data Mining

This package offers a tool to support the user in the task of analysing multiple aspect trajectory data. It integrates, into a single framework, methods for multiple aspect trajectories and, more generally, for multidimensional sequence data mining.
Copyright (C) 2022, MIT license (this portion of code is subject to licensing from source project distribution)

Created on Dec, 2021
Copyright (C) 2022, License GPL Version 3 or superior (see LICENSE file)

Authors:
    - Tarlis Portela
'''
# --------------------------------------------------------------------------------
import os
import time
import pandas as pd
import numpy as np
from numpy import argmax

#import logging
#logging.disable()

from tqdm.auto import tqdm
# --------------------------------------------------------------------------------
from sklearn import preprocessing
# --------------------------------------------------------------------------------
from matclassification.methods._lib.metrics import *
from matclassification.methods.core import AbstractClassifier, MClassifier, MHSClassifier, THSClassifier

from matclassification.methods import *

def dinamic_import():
    return getattr(__import__('matclassification'), 'methods')
# --------------------------------------------------------------------------------
class TEC(MHSClassifier, THSClassifier):

    def __init__(self,
                 ensembles=['MARC', 'POIS', 'MMLP'],
                 n_jobs=-1,
                 verbose=2,
                 random_state=42,
                 filterwarnings='ignore'):

        super().__init__('TEC', n_jobs=n_jobs, verbose=verbose,
                         random_state=random_state, filterwarnings=filterwarnings)

        self.add_config(ensembles=ensembles,
                        filterwarnings=filterwarnings)

        # Tackle FutureWarning and DeprecationWarning raised by the sub-models:
        import warnings

        def warn(*args, **kwargs):
            pass
        warnings.warn = warn
        warnings.simplefilter(action="ignore", category=FutureWarning)
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # TF_CPP_MIN_LOG_LEVEL = 0: show all logs.
        # TF_CPP_MIN_LOG_LEVEL = 1: filter out INFO logs.
        # TF_CPP_MIN_LOG_LEVEL = 2: additionally filter out WARNING logs.
        # TF_CPP_MIN_LOG_LEVEL = 3: additionally filter out ERROR logs.
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ["PYTHONWARNINGS"] = 'ignore'

#        import tensorflow as tf
#        tf.keras.backend.clear_session()

    def prepare_models(self, data,
                       tid_col='tid',        # TODO: pass other params
                       class_col='label',
                       space_geohash=False,  # True: Geohash, False: IndexGrid
                       geo_precision=30,     # Geohash: precision OR IndexGrid: meters
                       validate=False):

        if validate:
            raise NotImplementedError('['+self.name+':] with validate=True is not implemented.')

        # check_input() returns (check, y_true); only the boolean flag is needed here.
        check, _ = self.check_input(data, tid_col, class_col)
        if not check:
            raise ValueError('['+self.name+':] All data input (trajectories and features) must match size and corresponding labels.')

        self.models = self.create()

        for method, model in self.models.items():
            # MARC and POIS still have different input configurations for trajectories and geo_precision:
            if isinstance(model, POIS) or isinstance(model, MARC):
                train = data['mat'][0]
                test = data['mat'][1]
                model.prepare_input(train, test,
                                    tid_col=tid_col, class_col=class_col,
                                    validate=validate)

            # Movelets as input:
            elif isinstance(model, MClassifier):
                train = data['movelets'][0]
                test = data['movelets'][1]
                model.prepare_input(train, test, validate=validate)

            # Trajectories as input:
            elif isinstance(model, HPOClassifier):
                train = data['mat'][0]
                test = data['mat'][1]
                model.prepare_input(train, test,
                                    tid_col=tid_col, class_col=class_col,
                                    space_geohash=space_geohash,  # TODO
                                    validate=validate)

            else:
                print('['+self.name+':] *WARNING* model \'{}\' has no matching data format input.'.format(method))

    def create(self):

        module = dinamic_import()

        # Create the ensemble sub-models:
        ensembles = self.config['ensembles']

        models = dict()

        pbar = tqdm(ensembles, desc='['+self.name+':] Model building')
        for method in pbar:
            pbar.set_postfix_str(' - building sub-model {}.'.format(method))

            class_ = getattr(module, method)
            instance = class_(n_jobs=self.config['n_jobs'],
                              random_state=self.config['random_state'],
                              filterwarnings=self.config['filterwarnings'],
                              verbose=0)  # verbose=-1)

            models[method] = instance

        return models

    def sub_fit_predict(self, model, do_fit=True):

        X_train = model.X_train
        y_train = model.y_train

        if model.validate:
            X_val = model.X_val
            y_val = model.y_val
        else:
            X_val = model.X_test
            y_val = model.y_test

        X_test = model.X_test
        y_test = model.y_test

        if do_fit:
#            model.fit(X_train, y_train, X_val, y_val)
            model.train()
            summ, y_pred = model.predict(X_val, y_val)
        else:
            summ, y_pred = model.predict(X_test, y_test)

        return summ, y_pred

    def final_predict(self, y_test, estimators):

        # Combine the sub-model predictions by element-wise sum:
        final_pred = estimators[0]
        for i in range(1, len(estimators)):
            final_pred = final_pred + estimators[i]

        summ = self.score(None, y_test, np.array(final_pred))
        summ['model'] = self.name

        return summ, final_pred

    def fit(self):

        if not hasattr(self, 'models'):
            raise Exception('['+self.name+':] first input the data by calling "prepare_models".')

        report = pd.DataFrame()
        estimators = []

        pbar = tqdm(self.models.items(), desc='['+self.name+':] Model Training')
        for method, model in pbar:
            pbar.set_postfix_str('- training sub-model {}.'.format(method))

            summ, y_pred = self.sub_fit_predict(model)
            summ['model'] = method

            estimators.append(y_pred)
            report = pd.concat([report, summ])

        print('['+self.name+':] \t - Combining models.')
        if model.validate:
            y_val = model.y_val
        else:
            y_val = model.y_test

        summ, final_pred = self.final_predict(y_val, estimators)

        self.report = pd.concat([report, summ])
        self.report.reset_index(drop=True, inplace=True)

        return self.report

    def predict(self):

        report = pd.DataFrame()
        estimators = []

        for method, model in self.models.items():
            print('['+self.name+':] \t - Predicting sub-model {}.'.format(method))

            summ, y_pred = self.sub_fit_predict(model, do_fit=False)
            summ['model'] = method

            estimators.append(y_pred)
            report = pd.concat([report, summ])

        print('['+self.name+':] \t - Final prediction.')
        summ, final_pred = self.final_predict(model.y_test, estimators)

        self._summary = pd.concat([report, summ])
        self._summary.reset_index(drop=True, inplace=True)

        return self._summary, final_pred

    def check_input(self, data, tid_col='tid', class_col='label'):

        y_true = {}
        for i, datasets in data.items():
            for j in range(len(datasets)):
                df = datasets[j]

                if i == 'mat':
                    df.sort_values([class_col, tid_col], inplace=True)
                    arr = list(map(lambda df_i: df_i[1][class_col].unique()[0], df.groupby([tid_col])))

                elif i == 'movelets':
                    df.sort_values([class_col, tid_col], inplace=True)
#                    df.drop(columns=[tid_col], inplace=True)
                    arr = df[class_col].values

                elif i == 'pois':
                    raise NotImplementedError('['+self.name+':] check pois data is not implemented.')

                if j not in y_true.keys():
                    y_true[j] = []
                y_true[j].append(np.array(arr))

        check = True
        for i, y in y_true.items():
            y = np.array(y)
            if len(y) > 1:
                check = check and np.all(np.equal(*y))

        return check, y_true

# --------------------------------------------------------------------------------
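# Illustrative usage sketch (not part of the original module): it assumes train/test
# splits are already available as pandas DataFrames -- raw trajectories under 'mat'
# and a movelet feature matrix under 'movelets' -- in the dict layout expected by
# prepare_models(). The CSV file names below are hypothetical placeholders.
if __name__ == '__main__':

    data = {
        'mat':      [pd.read_csv('train.csv'),          pd.read_csv('test.csv')],
        'movelets': [pd.read_csv('movelets_train.csv'), pd.read_csv('movelets_test.csv')],
    }

    tec = TEC(ensembles=['MARC', 'POIS', 'MMLP'])
    tec.prepare_models(data, tid_col='tid', class_col='label')

    report = tec.fit()               # trains each sub-model and combines their predictions
    summary, y_pred = tec.predict()  # per-model and combined test predictions
    print(summary)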