Source code for matdata.dataset

# -*- coding: utf-8 -*-
"""
MAT-Tools: Python Framework for Multiple Aspect Trajectory Data Mining

The present application offers a tool to support the user in preprocessing multiple aspect trajectory data. It integrates, into a unified framework, methods for mining multiple aspect trajectories and, more generally, multidimensional sequence data.
Copyright (C) 2022, MIT license (this portion of code is subject to licensing from source project distribution)

Created on Dec, 2023
Copyright (C) 2023, GPL License Version 3 or later (see LICENSE file)

Authors:
    - Tarlis Portela
"""
import os
import pandas as pd
import numpy as np
import requests
import subprocess
import tempfile, py7zr

from tqdm.auto import tqdm

from matdata.preprocess import organizeFrame, splitTIDs, readDataset, stratify, trainTestSplit, kfold_trainTestSplit

# Repository data on GitHub
USER = "mat-analysis"
REPOSITORY = "datasets"

# These URLs are a workaround for GitHub's delay in serving raw files
REPO_URL     = 'https://github.com/{}/{}/tree/main/{}/{}/'
REPO_URL_API = 'https://api.github.com/repos/{}/{}/contents/{}/{}/'
REPO_URL_RAW = 'https://raw.githubusercontent.com/{}/{}/main/{}/{}/'

DATASET_TYPES = {
    'mat':           'Multiple Aspect Trajectories', 
    'raw':           'Raw Trajectories', 
    'sequential':    'Sequential Semantics', 
    'log':           'Event Logs',
    'mts':           'Multivariate Time Series', 
    'uts':           'Univariate Time Series',
}

SUBSET_TYPES = {
   '*.specific':                     'Multiple',
   'mat.specific':                   'Multiple Aspect',
   'raw.specific':                   'Raw',
   'sequential.*':                   'Semantic',
   'mts.specific':                   'Multivariate',
   'uts.specific':                   'Univariate',
   'log.specific':                   'Event Log',
   'log.process':                    'Event Log', #Deprecated?
   'log.*':                          'Semantic',
    
   '*.raw':      'Spatio-Temporal',
    
   '*.spatial':  'Spatial',
   '*.geo_only': 'Spatial',
   '*.generic':  'Generic',
   '*.category': 'Category',
   '*.poi':      'POI',
   '*.5dims':    '5-Dimensions',
   '*.genes':    'Genetic Sequence',
}

###############################################################################
#   LOAD DATASETs - From https://github.com/mat-analysis/datasets/
###############################################################################
def prepare_ds(df, tid_col='tid', class_col=None, sample_size=1, random_num=1, sort=True):
    """
    Prepare dataset for training or testing (helper function).

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the dataset.
    tid_col : str, optional
        The name of the column representing trajectory IDs (default 'tid').
    class_col : str or None, optional
        The name of the column representing class labels. If None, no class column is used for ordering data (default None).
    sample_size : float, optional
        The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
    random_num : int, optional
        Random seed for reproducibility (default 1).
    sort : bool, optional
        Whether to sort the stratified sample (default True).

    Returns:
    --------
    pandas.DataFrame
        The prepared dataset with optional sampling.
    """
    # Standardize the trajectory-id and class columns to 'tid' and 'label'.
    if class_col and (tid_col != 'tid' or class_col != 'label'):
        df.rename(columns={tid_col: 'tid', class_col: 'label'}, inplace=True)
        tid_col = 'tid'
        class_col = 'label'
        #df.sort_values(['label', 'tid'])
    elif tid_col != 'tid':
        df.rename(columns={tid_col: 'tid'}, inplace=True)
        tid_col = 'tid'
        #df.sort_values(['tid'])

    if sample_size < 1: # Stratify the data
        df = stratify(df, sample_size, random_num, tid_col, class_col, organize_columns=False, sort=sort)
        #df_index, _ = splitTIDs(df, sample_size, random_num, 'tid', class_col, min_elements=2)
        #df = df.set_index('tid').loc[df_index].reset_index()
        #df.loc[df['tid'].isin(df_index)]

    df, _, columns_order_csv = organizeFrame(df, None, 'tid', class_col)

    return df[columns_order_csv]
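
# Illustrative usage sketch (not part of the original module): prepare_ds is normally
# called by the loaders below, but it can also be used directly to standardize the
# id/label column names and draw a stratified sample. The file name and column names
# below are hypothetical.
#
#   df = pd.read_csv('my_trajectories.csv')
#   df = prepare_ds(df, tid_col='traj_id', class_col='user',  # renamed to 'tid'/'label'
#                   sample_size=0.5, random_num=1)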
# ------------------------------------------------------------
## TODO: For now, all datasets in the repository have tid and label columns. This may change in the future.
def load_ds(dataset='mat.FoursquareNYC', prefix='', missing=None, sample_size=1, random_num=1, sort=True):
    """
    Load a dataset for training or testing from a GitHub repository.

    Parameters:
    -----------
    dataset : str, optional
        The name of the dataset to load (default 'mat.FoursquareNYC').
    prefix : str, optional
        The prefix to be added to the dataset file name (default '').
    missing : str or None, optional
        The placeholder value used to denote missing data (default None).
    sample_size : float, optional
        The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
    random_num : int, optional
        Random seed for reproducibility (default 1).
    sort : bool, optional
        Whether to sort the sampled data (default True).

    Returns:
    --------
    pandas.DataFrame
        The loaded dataset with optional sampling.
    """
    def is_file(dsc, dsn, file):
        # Check file existence through the GitHub contents API.
        url = REPO_URL_API.format(USER, REPOSITORY, dsc, dsn) + file
        try:
            resp = requests.head(url)
#            return resp.status_code == requests.codes.found
            return resp.status_code == requests.codes.ok
        except Exception as e:
            return False

    def url_is_file(url):
        try:
            resp = requests.head(url)
            return resp.status_code == requests.codes.found
#            return resp.status_code == requests.codes.ok
        except Exception as e:
            return False

    def download(url, tmpdir):
        file = os.path.join(tmpdir, os.path.basename(url))
        subprocess.run('curl -o {} {}'.format(file, url), shell=True, check=True)
#        response = requests.get(url, stream=True)
#        with open(os.path.join(tmpdir, os.path.basename(url)), 'wb') as out:
#            out.write(response.content)
#            #content = response.json()['content']
#            #out.write(base64.b64decode(content))
#        return True
        return True #False

    def read(url):
        df = pd.read_parquet(url)
        if missing:
            df.fillna(missing, inplace=True)
        return prepare_ds(df, tid_col='tid', class_col='label',
                          sample_size=sample_size, random_num=random_num, sort=sort)

    # ------
    file = 'data.parquet'
    if prefix and prefix != '':
        file = prefix+'_data.parquet'

    dsc = dataset.split('.')[0]
    dsn = dataset.split('.')[1]

    base = REPO_URL_RAW.format(USER, REPOSITORY, dsc, dsn)

    # Try to load: 'data.parquet'
    url = base + file
    if is_file(dsc, dsn, file): # url_is_file(url):
        print("Loading dataset file: " + REPO_URL.format(USER, REPOSITORY, dsc, dsn))
#        return read(url)
        with tempfile.TemporaryDirectory() as tmpdir:
            download(url, tmpdir)
            return read(os.path.join(tmpdir, file))

    # Try to load compressed: 'data.parquet.7z'
    url = base + file +'.7z'
    if is_file(dsc, dsn, file+'.7z'): #url_is_file(url):
        print("Loading dataset compressed file: " + REPO_URL.format(USER, REPOSITORY, dsc, dsn))
        with tempfile.TemporaryDirectory() as tmpdir:
            download(url, tmpdir)
            filename = os.path.join(tmpdir, file +'.7z')
            with py7zr.SevenZipFile(filename, 'r') as archive:
                archive.extractall(path=tmpdir)
            print("Done.")
            print(" --------------------------------------------------------------------------------")
            return read(os.path.join(tmpdir, file))

    # Try to load compressed and split volumes: 'data.parquet.7z.001-N'
    if is_file(dsc, dsn, file+'.7z.001'): #url_is_file(url+'.001'):
        print("Loading dataset multi-volume files: " + REPO_URL.format(USER, REPOSITORY, dsc, dsn))
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, file +'.7z'), 'ab') as outfile: # append in binary mode
                i = 1
                # Volume i lives at <base><file>.7z.00i; download and concatenate the volumes in order.
                while is_file(dsc, dsn, file+'.7z.{:03d}'.format(i)) and download(base + file+'.7z.{:03d}'.format(i), tmpdir):
                    with open(os.path.join(tmpdir, file+'.7z.{:03d}'.format(i)), 'rb') as infile: # open in binary mode also
                        outfile.write(infile.read())
                    i += 1

            filename = os.path.join(tmpdir, file +'.7z')
            with py7zr.SevenZipFile(filename, 'r') as archive:
                archive.extractall(path=tmpdir)
            print("Done.")
            print(" --------------------------------------------------------------------------------")
            return read(os.path.join(tmpdir, file))

    raise Exception('Unable to load file, check the repository: ' + base)
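
# Illustrative usage sketch (assumes network access to the repository declared above):
#
#   df = load_ds('mat.FoursquareNYC')                                    # full dataset as one DataFrame
#   df = load_ds('mat.FoursquareNYC', sample_size=0.25, missing='-999')  # stratified 25% sample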
def load_ds_holdout(dataset='mat.FoursquareNYC', train_size=0.7, prefix='', missing='-999', sample_size=1, random_num=1, sort=True):
    """
    Load a dataset for training and testing with a holdout method from a GitHub repository.

    Parameters:
    -----------
    dataset : str, optional
        The name of the dataset file to load from the GitHub repository (default 'mat.FoursquareNYC'). Format as `category.DatasetName`.
    train_size : float, optional
        The proportion of the dataset to include in the training set (default 0.7).
    prefix : str, optional
        The prefix to be added to the dataset file name (default '').
    missing : str, optional
        The placeholder value used to denote missing data (default '-999').
    sample_size : float, optional
        The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
    random_num : int, optional
        Random seed for reproducibility (default 1).
    sort : bool, optional
        Whether to sort the split data (default True).

    Returns:
    --------
    train : pandas.DataFrame
        The training dataset.
    test : pandas.DataFrame
        The testing dataset.
    """
    df = load_ds(dataset, prefix, missing, sample_size, random_num, sort=False)

    # Class-balanced train/test split:
    train, test = trainTestSplit(df, train_size, random_num, sort=sort)

    return train, test
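
# Illustrative usage sketch (not part of the original module):
#
#   train, test = load_ds_holdout('mat.FoursquareNYC', train_size=0.7)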
def load_ds_kfold(dataset='mat.FoursquareNYC', k=5, prefix='', missing='-999', sample_size=1, random_num=1, sort=True):
    """
    Load a dataset for k-fold cross-validation from a GitHub repository.

    Parameters:
    -----------
    dataset : str, optional
        The name of the dataset file to load from the GitHub repository (default 'mat.FoursquareNYC').
    k : int, optional
        The number of folds for cross-validation (default 5).
    prefix : str, optional
        The prefix to be added to the dataset file name (default '').
    missing : str, optional
        The placeholder value used to denote missing data (default '-999').
    sample_size : float, optional
        The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
    random_num : int, optional
        Random seed for reproducibility (default 1).
    sort : bool, optional
        Whether to sort the split data (default True).

    Returns:
    --------
    ktrain : list of pandas.DataFrame
        The training datasets for each fold.
    ktest : list of pandas.DataFrame
        The testing datasets for each fold.
    """
    df = load_ds(dataset, prefix, missing, sample_size, random_num, sort=False)

    # Class-balanced k-fold train/test split:
    ktrain, ktest = kfold_trainTestSplit(df, k, random_num, sort=sort)

    return ktrain, ktest
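
# Illustrative usage sketch (not part of the original module):
#
#   ktrain, ktest = load_ds_kfold('mat.FoursquareNYC', k=5)
#   for train, test in zip(ktrain, ktest):
#       pass  # train/evaluate per fold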
# ------------------------------------------------------------
def repository_datasets():
    """
    Read the datasets available in the repository and organize them by category.

    Returns:
    --------
    dict
        A dictionary containing lists of datasets, where each category is a key.
    """
    import requests

    url = "https://api.github.com/repos/{}/{}/git/trees/main?recursive=1".format(USER, REPOSITORY)
    r = requests.get(url)
    res = r.json()

    files = list(map(lambda file: file["path"], res["tree"]))

    datasets_dict = {}
    def create_dict(file):
        # Each dataset has a '<DatasetName>.md' page; skip stats, README and TODO files.
        if file[-3:] == '.md' and '-stats.md' not in file and 'README' not in file and 'TODO' not in file:
            file = file.split('/') # GitHub API paths always use '/'
            category = file[0]
            if category not in datasets_dict.keys():
                datasets_dict[category] = []
            name = file[-1].split('.')[0]
            datasets_dict[category].append(name)
        return file

    file = list(map(lambda file: create_dict(file), files))

    return datasets_dict
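
# Illustrative usage sketch (not part of the original module); the exact categories and
# dataset names depend on the current contents of the repository:
#
#   datasets = repository_datasets()
#   print(datasets.keys())          # available categories, e.g. 'mat', 'raw', ...
#   print(datasets.get('mat', []))  # dataset names in the 'mat' category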
###############################################################################
#   READ DATASETs - From local files
###############################################################################
def read_ds(data_file, tid_col='tid', class_col=None, missing='-999', sample_size=1, random_num=1):
    """
    Read a dataset from a file.

    Parameters:
    -----------
    data_file : str
        The path to the dataset file.
    tid_col : str, optional
        The name of the column representing trajectory IDs (default 'tid').
    class_col : str or None, optional
        The name of the column representing class labels. If None, no class column is used (default None).
    missing : str, optional
        The placeholder value used to denote missing data (default '-999').
    sample_size : float, optional
        The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
    random_num : int, optional
        Random seed for reproducibility (default 1).

    Returns:
    --------
    pandas.DataFrame
        The read dataset.
    """
    df = readDataset(data_file, class_col=class_col, tid_col=tid_col, missing=missing)
    return prepare_ds(df, tid_col, class_col, sample_size, random_num)
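
# Illustrative usage sketch (not part of the original module); the file path is hypothetical:
#
#   df = read_ds('./data/mat/FoursquareNYC/train.csv', tid_col='tid', class_col='label')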
def read_ds_5fold(data_path, prefix='specific', suffix='.csv', tid_col='tid', class_col=None, missing='-999'):
    """
    Read datasets for 5-fold cross-validation from files in a directory.

    See Also
    --------
    read_ds_kfold : Read datasets for k-fold cross-validation.

    Parameters:
    -----------
    data_path : str
        The path to the directory containing the dataset files.
    prefix : str, optional
        The prefix of the dataset file names (default 'specific').
    suffix : str, optional
        The suffix of the dataset file names (default '.csv').
    tid_col : str, optional
        The name of the column representing trajectory IDs (default 'tid').
    class_col : str or None, optional
        The name of the column representing class labels. If None, no class column is used (default None).
    missing : str, optional
        The placeholder value used to denote missing data (default '-999').

    Returns:
    --------
    5_train : list of pandas.DataFrame
        The training datasets for each fold.
    5_test : list of pandas.DataFrame
        The testing datasets for each fold.
    """
    return read_ds_kfold(data_path, 5, prefix, suffix, tid_col, class_col, missing)
def read_ds_kfold(data_path, k=5, prefix='specific', suffix='.csv', tid_col='tid', class_col=None, missing='-999'):
    """
    Read datasets for k-fold cross-validation from files in a directory.

    Parameters:
    -----------
    data_path : str
        The path to the directory containing the dataset files.
    k : int, optional
        The number of folds for cross-validation (default 5).
    prefix : str, optional
        The prefix of the dataset file names (default 'specific').
    suffix : str, optional
        The suffix of the dataset file names (default '.csv').
    tid_col : str, optional
        The name of the column representing trajectory IDs (default 'tid').
    class_col : str or None, optional
        The name of the column representing class labels. If None, no class column is used (default None).
    missing : str, optional
        The placeholder value used to denote missing data (default '-999').

    Returns:
    --------
    ktrain : list of pandas.DataFrame
        The training datasets for each fold.
    ktest : list of pandas.DataFrame
        The testing datasets for each fold.
    """
    dsc = data_path.split(os.path.sep)[-2]
    dsn = data_path.split(os.path.sep)[-1]

    k_train = []
    k_test = []
    for fold in tqdm(range(1, k+1), desc='Reading '+str(k)+'-fold dataset '+ dsn + ' of ' + translateCategory(dsn, dsc)):
        df_train, df_test = read_ds_holdout(data_path, prefix, suffix, tid_col, class_col, missing, fold)
        k_train.append(df_train)
        k_test.append(df_test)

    return k_train, k_test
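
# Illustrative usage sketch (not part of the original module); per the code above, each
# fold i is read from '<data_path>/run<i>/<prefix>_train<suffix>' and
# '<data_path>/run<i>/<prefix>_test<suffix>'. The directory below is hypothetical:
#
#   ktrain, ktest = read_ds_kfold('./data/mat/FoursquareNYC', k=5,
#                                 prefix='specific', class_col='label')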
def read_ds_holdout(data_path, prefix=None, suffix='.csv', tid_col='tid', class_col=None, missing='-999', fold=None):
    """
    Read datasets for holdout validation from files in a directory.

    Parameters:
    -----------
    data_path : str
        The path to the directory containing the dataset files.
    prefix : str or None, optional
        The prefix of the dataset file names (default None).
    suffix : str, optional
        The suffix of the dataset file names (default '.csv').
    tid_col : str, optional
        The name of the column representing trajectory IDs (default 'tid').
    class_col : str or None, optional
        The name of the column representing class labels. If None, no class column is used (default None).
    missing : str, optional
        The placeholder value used to denote missing data (default '-999').
    fold : int or None, optional
        The fold number to load for holdout validation, read from the corresponding subdirectory (e.g. run1). If None, read files directly from `data_path`.

    Returns:
    --------
    train : pandas.DataFrame
        The training dataset.
    test : pandas.DataFrame
        The testing dataset.
    """
    dsc = data_path.split(os.path.sep)[-2]
    dsn = data_path.split(os.path.sep)[-1]

    if prefix and prefix != '':
        files = [prefix+'_train'+suffix, prefix+'_test'+suffix]
    else:
        files = ['train'+suffix, 'test'+suffix]

    if fold:
        files = [os.path.join('run'+str(fold), files[0]), os.path.join('run'+str(fold), files[1])]
    else:
        print('Reading dataset', dsn, 'of', translateCategory(dsn, dsc))

    dataset = []
    for file in tqdm(files, desc=dsn + ' (' + translateCategory(dsn, dsc) + \
                     ('), fold: '+str(fold) if fold else ')')):
#        url = BASE_URL + dsc+'/'+dsn+'/' + file
        url = os.path.join(data_path, file)
        df = read_ds(url, tid_col, class_col, missing)
        dataset.append(df)

    return dataset
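
# Illustrative usage sketch (not part of the original module); with prefix=None and
# fold=None this reads 'train.csv' and 'test.csv' directly from data_path, whose last
# two components are interpreted as <category>/<DatasetName>. The path is hypothetical:
#
#   train, test = read_ds_holdout('./data/mat/FoursquareNYC', prefix=None, class_col='label')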
# ------------------------------------------------------------
def translateDesc(dataset, category, descName):
    """Extract the subset name from a descriptor name (`<dataset>_<subset>.*`); return False when it does not apply."""
    # NOTE: relies on getDescName, which is currently commented out at the end of this module.
    dst, dsn = descName.split('.')[0].split('_')[0:2]
    if dsn in ['allfeat', '5dims']:
        return False

    if getDescName(category, dataset) == dst:
        return dsn
    elif dataset in dst:
        return dsn
    return False
def translateCategory(dataset, category, descName=None):
    """Return the human-readable name for a dataset category (and optional subset), based on DATASET_TYPES and SUBSET_TYPES."""
    if descName:
        if (category+'.'+descName) in SUBSET_TYPES.keys():
            return SUBSET_TYPES[category+'.'+descName]
        elif ('*.'+descName) in SUBSET_TYPES.keys():
            return SUBSET_TYPES['*.'+descName]
        elif (category+'.*') in SUBSET_TYPES.keys():
            return SUBSET_TYPES[category+'.*']
        else:
            return descName.capitalize()
    elif category in DATASET_TYPES.keys():
        return DATASET_TYPES[category]
    else:
        return category.split('_')[0].title()
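
# Illustrative examples (not part of the original module), following the lookup tables above:
#
#   translateCategory('FoursquareNYC', 'mat')                       # -> 'Multiple Aspect Trajectories'
#   translateCategory('FoursquareNYC', 'mat', descName='specific')  # -> 'Multiple Aspect'
#   translateCategory('FoursquareNYC', 'mat', descName='poi')       # -> 'POI'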
# ------------------------------------------------------------
#def getName(dic, dst=None, dsn=None):
#    dst = (dst if dst else '*')
#    dsn = (dsn if dsn else '*')
#    if dst +'.'+ dsn in dic.keys():
#        name = dic[dst +'.'+ dsn]
#    elif dst +'.*' in dic.keys():
#        name = dic[dst +'.*']
#    elif '*.*' in dic.keys():
#        name = dic['*.*']
#
#    if not name:
#        name = dsn
#    return name
#
#def getDescName(dst, dsn):
#    name = getName(DESCRIPTOR_NAMES, dst, dsn)
#    if not name:
#        name = dsn
#    return name
#
#def getFeature(dst, dsn):
#    name = getName(FEATURES_NAMES, dst, dsn)
#    if not name:
#        name = ['poi']
#    return name
#
#def getSubset(dsn, feature):
#    for key, value in FEATURES_NAMES.items():
#        if dsn in key and feature in value:
#            if '?' in key:
#                return 'generic'
#
#    return 'specific'