Source code for matclustering.methods.hierarchical.mattree.algorithm.dashtree

# -*- coding: utf-8 -*-
"""
MAT-Tools: Python Framework for Multiple Aspect Trajectory Data Mining

The present application offers a tool to support the user in the clustering of multiple aspect trajectory data. It integrates into a unique framework for multiple aspects trajectories and in general for multidimensional sequence data mining methods.
Copyright (C) 2022, MIT license (this portion of code is subject to licensing from source project distribution)

Created on Apr, 2024
Copyright (C) 2024, License GPL Version 3 or superior (see LICENSE file)

Authors:
    - Tarlis Portela
    - Yuri Santos
"""
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from matclustering.methods.hierarchical.mattree.algorithm.TreeNodeObject import TreeNodeObject
from matclustering.methods.hierarchical.mattree.algorithm.check_label import check_label
from matclustering.methods.hierarchical.mattree.metrics_evaluation.entropy import get_entropy
from matclustering.methods.hierarchical.mattree.metrics_evaluation.freq_matrix import generate_freq_matrix
from matclustering.methods.hierarchical.mattree.metrics_evaluation.similarity_matrix import get_similarity_matrix


def _average_similarity(points, metric):
    """Return the grand mean of the similarity matrix computed for ``points``.

    The matrix returned by ``get_similarity_matrix`` is reduced with
    ``.mean()`` and the resulting values are summed and divided by
    ``len(matrix)``.  An empty matrix has length 0, so the divisor falls back
    to 1 instead of raising ``ZeroDivisionError``; this can happen when every
    trajectory of a node lands on the same side of a candidate split.
    """
    matrix = get_similarity_matrix(points, metric)
    return sum(matrix.mean()) / (len(matrix) or 1)


def dashtree(self, df, exclude_aspects=None):
    """Recursively split a tree node into left/right branches.

    The node's frequency matrix is (re)computed, the stop criteria are
    checked and, if the node is not a leaf, every candidate column of the
    frequency matrix is evaluated as a split point.  Trajectories whose
    frequency is below the column threshold go to the left branch, all the
    others go to the right branch.  The column selected according to
    ``self.split`` becomes ``self.division`` and the function recurses into
    both children.

    Parameters
    ----------
    self : TreeNodeObject
        Node being processed.  Several of its attributes are updated in place
        (``threshold``, ``variance``, ``division``, ``left``, ``right``,
        ``df_dict``, ``df_leaves``, ``dendrogram_dict``, ...).
    df : pandas.DataFrame
        Dataset used to materialise leaves (filtered by ``tid``) and whose
        length is the denominator of the threshold in relative mode.
    exclude_aspects : optional
        Passed unchanged to ``generate_freq_matrix`` for this node and for
        every descendant, so that the whole tree is built on the same set of
        aspects.

    Returns
    -------
    int or None
        ``0`` when the node matched a stop criterion and became a leaf,
        ``None`` after an internal node has been split.
    """
    print("Computing frequency matrix...", end="")
    generate_freq_matrix(self, exclude_aspects)
    print("DONE!")

    print("Computing absolute freq matrix...", end="")
    if self.relative and self.absolute_frequency_matrix is None:
        self.absolute_frequency_matrix = pd.DataFrame(
            columns=['mean'],
            data=list(self.freqMatrix.mean()),
            index=self.freqMatrix.columns,
        )
    print("DONE")

    # Stop condition: too few trajectories or maximum depth reached.
    if (self.trajList.size <= 1
            or (self.maxTrajPerGroup > 0 and self.trajList.size < self.maxTrajPerGroup)
            or (0 < self.maxDepth <= self.depth)):
        print("Stop criteria matched.")
        self.parentName = str(self.data.tid.unique().size)
        self.done = 'Yes'
        check_label(self, 'c', self.depth)
        self.df_dict[str(self.id)] = self.freqMatrix
        self.nodeLabel[self.depth].append(self.id)

        leaf_rows = df[df.tid.isin(self.freqMatrix.index)]
        self.df_leaves[str(self.id)] = leaf_rows
        # Entropy of the leaf weighted by the number of trajectories in it.
        self.temporario += get_entropy(leaf_rows) * len(leaf_rows.tid.unique())
        self.clusters += 1
        self.dendrogram_dict[str(self.id)] = []
        return 0

    self.nodeNum += 1
    self.source.append(self.nodeNum)

    print("Computing aspects threshold...", end="")
    if self.relative:
        cut_points = self.freqMatrix.sum() / len(df)
    else:
        cut_points = self.freqMatrix.mean()
    self.threshold = pd.DataFrame(
        columns=['mean'],
        data=list(cut_points),
        index=self.freqMatrix.columns,
    )
    print("DONE!")

    self.variance = {}
    columns = self.freqMatrix.columns
    tids = self.freqMatrix.index.values
    left_dict = {}          # column -> row positions sent to the left branch
    right_dict = {}         # column -> row positions sent to the right branch
    split_value = {}        # column -> mean variance reduction ('var_red')
    similarity_score = {}   # column -> mean branch similarity ('muitas'/'msm')
    # Running best score for 'binary', 'minVariance' and the default branch.
    best_score = -1

    if self.split == 'var_red':
        # Loop-invariant: the per-column variance of the whole node.
        initial_variances = {c: self.freqMatrix[c].var() for c in columns}

    print(f"Starting trajectory split on branches (split method: {self.split}). It will do:")
    print("\tCheck trajs for left and right branches regarding the AVG.")
    print("\tCompute variance for each aspect and branch.")
    print("\tRunning...", end="")
    for col in tqdm(columns, desc=f'Processing {self.id}'):
        if col in self.skipVal:
            continue

        # Partition the rows of this column around its threshold in one pass.
        cut = self.threshold['mean'][col]
        values = self.freqMatrix[col].to_numpy()
        below = np.asarray(values < cut, dtype=bool)
        left_pos = np.flatnonzero(below)
        right_pos = np.flatnonzero(~below)
        left = values[left_pos].tolist()
        right = values[right_pos].tolist()

        self.left_group[col] = left
        self.right_group[col] = right
        left_dict[col] = left_pos
        right_dict[col] = right_pos

        self.variance[col] = {
            "initial": self.freqMatrix[col].var(),
            "left": np.var(left),
            "right": np.var(right)
        }

        if self.split == 'var_red':
            print("Delay on var() part 2:", end="")
            left_rows = self.freqMatrix.iloc[left_pos]
            right_rows = self.freqMatrix.iloc[right_pos]
            reductions = [
                initial_variances[c]
                - abs((np.var(left_rows[c]) - np.var(right_rows[c])) / 2)
                for c in columns
            ]
            split_value[col] = sum(reductions) / len(columns)
            print("DONE!")

        elif self.split in ('muitas', 'msm'):
            # The metric name expected by get_similarity_matrix is the
            # upper-case form of the split name ('MUITAS' / 'MSM').
            metric = self.split.upper()
            left_points = self.data.loc[self.data['tid'].isin(tids[left_pos])]
            right_points = self.data.loc[self.data['tid'].isin(tids[right_pos])]
            left_similarity = _average_similarity(left_points, metric)
            right_similarity = _average_similarity(right_points, metric)
            similarity_score[col] = (left_similarity + right_similarity) / 2

        elif self.split == "binary":
            # Prefer the column producing the most balanced branches.
            size_gap = np.abs(len(right) - len(left))
            if best_score == -1 or size_gap < best_score:
                best_score = size_gap
                self.division = col

        elif self.split == "minVariance":
            # Average variance between groups; smaller is better.
            mean_variance = (self.variance[col]["left"] + self.variance[col]["right"]) / 2
            if best_score == -1 or mean_variance < best_score:
                best_score = mean_variance
                self.division = col

        else:  # self.split == 'max_red'
            # Reduction of the initial variance; larger is better.
            reduction = self.variance[col]['initial'] - (
                self.variance[col]["left"] + self.variance[col]["right"]) / 2
            if reduction > best_score:
                best_score = reduction
                self.division = col

    print("ALL DONE!")

    if self.split in ('muitas', 'msm'):
        self.division = max(similarity_score, key=similarity_score.get)
    elif self.split == 'var_red':
        self.division = max(split_value, key=split_value.get)

    # Column names have the form '<aspect>~<value>'.
    asp, val = self.division.split('~')

    self.thresholdVal = self.threshold['mean'][self.division]
    check_label(self, f'{asp} {val}', self.depth)

    if self.division not in self.skipVal:
        self.skipVal.append(self.division)
    self.value[self.depth].extend(
        [len(self.left_group[self.division]), len(self.right_group[self.division])])
    self.nodeLabel[self.depth].append(self.id)

    self.df_dict[str(self.id)] = self.freqMatrix
    self.parentName = asp + "\n[" + val + "]"

    # Left child: trajectories below the threshold of the chosen column.
    traj_left = tids[left_dict[self.division]]
    self.left = TreeNodeObject(self.data.loc[self.data['tid'].isin(traj_left)], self)
    dashtree(self.left, df, exclude_aspects)
    self.leftChildName = self.left.parentName
    self.dendrogram_dict[str(self.id)].append(self.left.id)

    # Right child: trajectories at or above the threshold.
    traj_right = tids[right_dict[self.division]]
    self.right = TreeNodeObject(self.data.loc[self.data['tid'].isin(traj_right)], self)
    dashtree(self.right, df, exclude_aspects)
    self.rightChildName = self.right.parentName
    self.dendrogram_dict[str(self.id)].append(self.right.id)