Source code for ptrail.features.temporal_features

"""
    | 1. The temporal_features module contains all the functions of the library
         that calculate features based on the DateTime provided in
         the data.
    | 2. It is to be noted that most of the functions in this module
         calculate the features and then add the results to an entirely new
         column with a new column header.
    | 3. It is to be also noted that a lot of these features are inspired from
         the PyMove library and we are crediting the PyMove creators with them.

    | Authors: Yaksh J Haranwala, Salman Haidri

    References
    ----------
        Arina De Jesus Amador Monteiro Sanches. “Uma Arquitetura E Implementação Do Módulo De
        Pré-processamento Para Biblioteca Pymove”. Bachelor’s thesis. Universidade Federal Do Ceará, 2019.
"""
import itertools
import multiprocessing
from typing import Optional, Text

import numpy as np
import pandas as pd

from ptrail.core.TrajectoryDF import PTRAILDataFrame
from ptrail.features.helper_functions import Helpers as helpers
from ptrail.utilities import constants as const


[docs]class TemporalFeatures:
[docs]    @staticmethod
    def create_date_column(dataframe: PTRAILDataFrame):
        """
            Add a Date column that holds only the calendar date of each point's
            DateTime value.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The PTRAILDataFrame on which the Date column is to be created.

            Returns
            -------
                PTRAILDataFrame:
                    A new dataframe containing the additional Date column.

        """
        # Move the index levels back into regular columns so that the
        # DateTime values can be read as a column.
        flat = dataframe.reset_index()

        # Keep only the date part of every timestamp.
        date_only = flat[const.DateTime].dt.date
        flat['Date'] = date_only

        # Renumber the rows and wrap the result back into a PTRAILDataFrame.
        renumbered = flat.reset_index(drop=True)
        return PTRAILDataFrame(renumbered, const.LAT, const.LONG, const.DateTime, const.TRAJECTORY_ID)

[docs]    @staticmethod
    def create_time_column(dataframe: PTRAILDataFrame):
        """
            Add a Time column that holds only the time-of-day part of each point's
            DateTime value.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The PTRAILDataFrame on which the Time column is to be created.

            Returns
            -------
                PTRAILDataFrame
                    A new dataframe containing the additional Time column.

        """
        # Move the index levels back into regular columns so that the
        # DateTime values can be read as a column.
        flat = dataframe.reset_index()

        # Keep only the clock time of every timestamp.
        clock_time = flat[const.DateTime].dt.time
        flat['Time'] = clock_time

        # Wrap the result back into a PTRAILDataFrame.
        return PTRAILDataFrame(flat, const.LAT, const.LONG, const.DateTime, const.TRAJECTORY_ID)

[docs]    @staticmethod
    def create_day_of_week_column(dataframe: PTRAILDataFrame):
        """
            Add a Day_Of_Week column that holds the name of the weekday on which
            each trajectory point was recorded, derived from its DateTime value.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The dataframe on which the operation is to be performed.

            Returns
            -------
                PTRAILDataFrame
                    A new dataframe containing the additional Day_Of_Week column.
        """
        # Move the index levels back into regular columns so that the
        # DateTime values can be read as a column.
        flat = dataframe.reset_index()

        # Translate every timestamp into the name of its weekday.
        weekday_names = flat[const.DateTime].dt.day_name()
        flat['Day_Of_Week'] = weekday_names

        # Wrap the result back into a PTRAILDataFrame.
        return PTRAILDataFrame(flat, const.LAT, const.LONG, const.DateTime, const.TRAJECTORY_ID)

[docs]    @staticmethod
    def create_weekend_indicator_column(dataframe: PTRAILDataFrame):
        """
            Create a boolean column called Weekend which is True for every point
            whose Day_Of_Week equals one of the first two entries of const.WEEKEND
            and False otherwise.

            Note
            ----
                If the dataframe does not already contain a Day_Of_Week column, it
                is created first by calling create_day_of_week_column().

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The dataframe on which the operation is to be performed.

            Returns
            -------
                PTRAILDataFrame
                    The dataframe containing the resultant Weekend column.

            References
            ----------
                Arina De Jesus Amador Monteiro Sanches. 'Uma Arquitetura E Implementação
                Do Módulo De Pré-processamento Para Biblioteca Pymove'. Bachelor’s thesis.
                Universidade Federal Do Ceará, 2019.

        """
        # The weekday names are required for the comparison below, so create
        # them only when they are missing. Both cases then share one code path.
        if 'Day_Of_Week' not in dataframe.columns:
            dataframe = TemporalFeatures.create_day_of_week_column(dataframe)
        dataframe = dataframe.reset_index()

        # The element-wise comparison already yields one boolean per row, so it
        # is stored directly as the column. (DataFrame.at is a scalar-only
        # accessor and must not be used with a whole Index of row labels.)
        day_names = dataframe['Day_Of_Week']
        dataframe['Weekend'] = (day_names == const.WEEKEND[0]) | (day_names == const.WEEKEND[1])

        # Return the dataframe by converting it into PTRAILDataFrame.
        return PTRAILDataFrame(dataframe, const.LAT, const.LONG, const.DateTime, const.TRAJECTORY_ID)

[docs]    @staticmethod
    def create_time_of_day_column(dataframe: PTRAILDataFrame):
        """
            Add a Time_Of_Day column which labels every point with the period of the
            day in which it was recorded. The day is split into six intervals of
            hours (0-4, 5-8, 9-12, 13-16, 17-20 and 21-24) and the label for each
            interval is taken, in that order, from const.TIME_OF_DAY.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The dataframe on which the calculation is to be done.

            Returns
            -------
                PTRAILDataFrame
                    The dataframe containing the resultant Time_Of_Day column.

            References
            ----------
                Arina De Jesus Amador Monteiro Sanches. 'Uma Arquitetura E Implementação
                Do Módulo De Pré-processamento Para Biblioteca Pymove'. Bachelor’s thesis.
                Universidade Federal Do Ceará, 2019.

        """
        flat = dataframe.reset_index()
        hour_of_day = flat[const.DateTime].dt.hour

        # The first interval is closed on both sides; every following interval
        # is four hours wide, open on the left and closed on the right.
        period_masks = [(hour_of_day >= 0) & (hour_of_day <= 4)]
        for upper in (8, 12, 16, 20, 24):
            period_masks.append((hour_of_day > upper - 4) & (hour_of_day <= upper))

        # Pick, for every row, the label that belongs to the first matching interval.
        # NOTE(review): rows matching no interval receive np.select's default
        # value (0) -- confirm that this is acceptable for missing timestamps.
        flat['Time_Of_Day'] = np.select(period_masks, const.TIME_OF_DAY)
        return PTRAILDataFrame(flat, const.LAT, const.LONG, const.DateTime, const.TRAJECTORY_ID)

[docs]    @staticmethod
    def get_traj_duration(dataframe: PTRAILDataFrame, traj_id: Optional[Text] = None):
        """
            Accessor method for the duration of a trajectory specified by the user.

            Note
            ----
                If no trajectory ID is given by the user, then the duration of each
                unique trajectory is calculated.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The dataframe containing the trajectory data.
                traj_id: Optional[Text]
                    The trajectory id for which the duration is required.

            Returns
            -------
                pandas.Series:
                    When traj_id is given and present in the data: a one-element
                    Series, labelled with the DateTime column name, that holds the
                    difference between the latest and the earliest timestamp of
                    the trajectory.
                str:
                    When traj_id is given but not present in the data: a message
                    saying so.
                pandas.core.dataframe.DataFrame:
                    When traj_id is None: the concatenated results of
                    helpers.traj_duration_helper for all trajectory IDs, sorted
                    by trajectory ID.
        """
        dataframe = dataframe.reset_index()

        if traj_id is not None:
            # Only the DateTime column of the requested trajectory is needed.
            small = dataframe.loc[dataframe[const.TRAJECTORY_ID] == traj_id, [const.DateTime]]
            if small.empty:
                return f"No {traj_id} exists in the given data. Please try again."
            return small.max() - small.min()

        # Collect all the trajectory IDs and split them into chunks whose size
        # is decided by helpers._get_partition_size().
        ids_ = dataframe[const.TRAJECTORY_ID].value_counts(ascending=True).keys().to_list()
        split_factor = helpers._get_partition_size(len(ids_))
        ids_ = [ids_[i: i + split_factor] for i in range(0, len(ids_), split_factor)]

        # Hand one chunk of IDs to each worker process. The pool is shut down
        # in a finally clause so that the worker processes are released even
        # when one of the helper calls raises.
        mp_pool = multiprocessing.Pool(len(ids_))
        try:
            results = mp_pool.starmap(helpers.traj_duration_helper, zip(itertools.repeat(dataframe), ids_))
        finally:
            mp_pool.close()
            mp_pool.join()

        # Concatenate all the smaller results and return the answer.
        return pd.concat(results).sort_values(const.TRAJECTORY_ID)

[docs]    @staticmethod
    def get_start_time(dataframe: PTRAILDataFrame, traj_id: Optional[Text] = None):
        """
            Get the starting time of the trajectory.

            Note
            ----
                If the trajectory ID is not specified by the user, then by default,
                the starting times of all the trajectory IDs in the data are
                returned.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The dataframe on which the operations are to be performed.
                traj_id: Optional[Text]
                    The trajectory for which the start time is required.

            Returns
            -------
                pandas.DateTime:
                    The start time (earliest DateTime value) of a single trajectory.
                pandas.core.dataframe.DataFrame
                    The concatenated results of helpers.start_time_helper for all
                    the trajectories present in the data, sorted by trajectory ID,
                    when the user hasn't asked for a particular trajectory's
                    start time.

            Raises
            ------
                IndexError:
                    If no timestamp is found for the given traj_id.
        """
        dataframe = dataframe.reset_index()

        if traj_id is not None:
            # Missing timestamps can never be the start time, so drop them
            # before looking for the minimum.
            times = dataframe.loc[dataframe[const.TRAJECTORY_ID] == traj_id, const.DateTime].dropna()
            if times.empty:
                # Same exception type that the positional lookup on an empty
                # selection used to raise, but with an explanatory message.
                raise IndexError(f"No timestamps found for trajectory {traj_id} in the given data.")
            return times.min()

        # Collect all the trajectory IDs and split them into chunks whose size
        # is decided by helpers._get_partition_size().
        ids_ = dataframe[const.TRAJECTORY_ID].value_counts(ascending=True).keys().to_list()
        split_factor = helpers._get_partition_size(len(ids_))
        ids_ = [ids_[i: i + split_factor] for i in range(0, len(ids_), split_factor)]

        # Hand one chunk of IDs to each worker process. The pool is shut down
        # in a finally clause so that the worker processes are released even
        # when one of the helper calls raises.
        mp_pool = multiprocessing.Pool(len(ids_))
        try:
            results = mp_pool.starmap(helpers.start_time_helper, zip(itertools.repeat(dataframe), ids_))
        finally:
            mp_pool.close()
            mp_pool.join()

        # Concatenate all the smaller dataframes and return the answer.
        return pd.concat(results).sort_values(const.TRAJECTORY_ID)

[docs]    @staticmethod
    def get_end_time(dataframe: PTRAILDataFrame, traj_id: Optional[Text] = None):
        """
            Get the ending time of the trajectory.

            Note
            ----
                If the trajectory ID is not specified by the user, then by default,
                the ending times of all the trajectory IDs in the data are
                returned.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The dataframe on which the operations are to be performed.
                traj_id: Optional[Text]
                    The trajectory for which the end time is required.

            Returns
            -------
                pandas.DateTime:
                    The end time (latest DateTime value) of a single trajectory.
                pandas.core.dataframe.DataFrame
                    The concatenated results of helpers.end_time_helper for all
                    the trajectories present in the data, sorted by trajectory ID,
                    when the user hasn't asked for a particular trajectory's
                    end time.

            Raises
            ------
                IndexError:
                    If no timestamp is found for the given traj_id.
        """
        dataframe = dataframe.reset_index()

        if traj_id is not None:
            # Missing timestamps can never be the end time, so drop them
            # before looking for the maximum.
            times = dataframe.loc[dataframe[const.TRAJECTORY_ID] == traj_id, const.DateTime].dropna()
            if times.empty:
                # Same exception type that the positional lookup on an empty
                # selection used to raise, but with an explanatory message.
                raise IndexError(f"No timestamps found for trajectory {traj_id} in the given data.")
            return times.max()

        # Collect all the trajectory IDs and split them into chunks whose size
        # is decided by helpers._get_partition_size().
        ids_ = dataframe[const.TRAJECTORY_ID].value_counts(ascending=True).keys().to_list()
        split_factor = helpers._get_partition_size(len(ids_))
        ids_ = [ids_[i: i + split_factor] for i in range(0, len(ids_), split_factor)]

        # Hand one chunk of IDs to each worker process. The pool is shut down
        # in a finally clause so that the worker processes are released even
        # when one of the helper calls raises.
        mp_pool = multiprocessing.Pool(len(ids_))
        try:
            results = mp_pool.starmap(helpers.end_time_helper, zip(itertools.repeat(dataframe), ids_))
        finally:
            mp_pool.close()
            mp_pool.join()

        # Concatenate all the smaller dataframes and return the answer.
        return pd.concat(results).sort_values(const.TRAJECTORY_ID)

[docs]    @staticmethod
    def generate_temporal_features(dataframe: PTRAILDataFrame):
        """
            Generate all the column-based temporal features with a single call.
            The Date, Time, Day_Of_Week, Weekend and Time_Of_Day columns are
            created one after the other, in that order.

            Parameters
            ----------
                dataframe: PTRAILDataFrame
                    The dataframe on which the features are to be generated.

            Returns
            -------
                PTRAILDataFrame:
                    The dataframe enriched with Temporal Features.
        """
        # Every step takes the output of the previous one as its input.
        feature_steps = (
            TemporalFeatures.create_date_column,
            TemporalFeatures.create_time_column,
            TemporalFeatures.create_day_of_week_column,
            TemporalFeatures.create_weekend_indicator_column,
            TemporalFeatures.create_time_of_day_column,
        )

        enriched = dataframe
        for add_feature in feature_steps:
            enriched = add_feature(enriched)

        return enriched