"""
The TrajectoryDF module is the main module containing the PTRAILDataFrame Dataframe
for storing the Trajectory Data with PTRAIL Library. The Dataframe has
certain restrictions on what type of data is mandatory in order to be stored as a
PTRAILDataFrame which is mentioned in the documentation of the constructor.
| Authors: Yaksh J Haranwala, Salman Haidri
"""
from typing import Dict, List, Union, Optional, Text
import numpy as np
import pandas as pd
import pandas.core.dtypes.common
from pandas import DataFrame
from pandas._libs import lib
from ptrail.utilities import constants as const
from ptrail.utilities.exceptions import *
class PTRAILDataFrame(DataFrame):
    """
    A pandas DataFrame subclass for storing trajectory data. The four
    mandatory columns (latitude, longitude, DateTime, trajectory ID) are
    renamed to the library default headers and turned into a sorted
    (traj_id, DateTime) MultiIndex.
    """

    def __init__(self, data_set: Union[DataFrame, List, Dict], latitude: Text, longitude: Text, datetime: Text,
                 traj_id: Text, rest_of_columns: Optional[List[Text]] = None):
        """
        Construct a trajectory dataframe to store and represent the Trajectory Data.

        Note
        ----
        | The mandatory columns in the dataset are:
        |    1. DateTime
        |    2. Trajectory ID
        |    3. Latitude
        |    4. Longitude

        | ``rest_of_columns`` makes sure that if the data_set is a list, it has appropriate headers
          that the user wants instead of the default numerical values.

        Parameters
        ----------
        data_set: List, Dictionary or pandas DF.
            The data provided by the user that needs to be represented and stored.
        datetime: str
            The header of the datetime column.
        traj_id: str
            The header of the Trajectory ID column.
        latitude: str
            The header of the latitude column.
        longitude: str
            The header of the longitude column.
        rest_of_columns: Optional[list[Text]]
            A list containing headers of the columns other than the mandatory ones.

        Raises
        ------
        MissingColumnsException
            One or more mandatory columns are absent after renaming.
        """
        rest_of_columns = [] if rest_of_columns is None else rest_of_columns
        column_list = [latitude, longitude, datetime, traj_id] + rest_of_columns

        # Case-1: The data is a dictionary: convert it to a pandas DataFrame.
        # The header-based rename below maps the user-given headers to the
        # library defaults, so no positional renaming is needed. (The old
        # positional rename silently assumed the dict keys were ordered
        # lat, lon, datetime, traj_id and ignored the user-given headers.)
        if isinstance(data_set, dict):
            data_set = DataFrame.from_dict(data_set)

        # Case-2: The data is a list or ndarray: build a DataFrame using the
        # user-given headers (plus any extra headers) as the column names.
        elif isinstance(data_set, (list, np.ndarray)):
            data_set = DataFrame(data_set, columns=column_list)

        # Case-3: The data is already a pandas DF: rename any column whose
        # header contains one of the user-given headers to the library default.
        elif isinstance(data_set, DataFrame):
            data_set = self._rename_df_col_headers(data_set, latitude, longitude, datetime, traj_id)

        # Now, renaming the user-given column names to library default names.
        column_names = self._get_default_column_names(DateTime=datetime, traj_id=traj_id,
                                                      latitude=latitude, longitude=longitude)
        data_set.rename(columns=column_names, inplace=True)

        # Check that all the mandatory columns are present (raises if not),
        # coerce the column dtypes, then index and sort the data by
        # (traj_id, DateTime) and hand it to the pandas constructor.
        if self._validate_columns(data_set):
            self._validate_data_types(data_set)
            data_set.set_index([const.TRAJECTORY_ID, const.DateTime], inplace=True)
            data_set.sort_values([const.TRAJECTORY_ID, const.DateTime], inplace=True)
            super(PTRAILDataFrame, self).__init__(data_set)

    # ------------------------------ General (Private) Utilities ----------------------------- #
    def _rename_df_col_headers(self, data: DataFrame, lat: Text, lon: Text,
                               datetime: Text, traj_id: Text) -> DataFrame:
        """
        Change the column headers of the columns when the user given data is in the
        form of a pandas DF while creating the PTRAILDataFrame. This method is mainly
        used when the user reads in data from a csv because the CSV file might
        contain different names for the columns.

        Parameters
        ----------
        data: DataFrame
            The dataframe whose column names are to be changed.
        lat: Text
            The header of the Latitude column.
        lon: Text
            The header of the Longitude column.
        datetime: Text
            The header of the DateTime column.
        traj_id: Text
            The header of the Trajectory ID column.

        Returns
        -------
        pandas.DataFrame
            The pandas dataframe containing the library default column headers.
        """
        cols = data.columns.to_list()
        # NOTE: substring containment (not equality) is used deliberately so
        # that e.g. a CSV header like "latitude_deg" still matches "latitude".
        # A header that contains several keywords is renamed by the last match.
        for i in range(len(cols)):
            if lat in cols[i]:
                cols[i] = const.LAT
            if lon in cols[i]:
                cols[i] = const.LONG
            if datetime in cols[i]:
                cols[i] = const.DateTime
            if traj_id in cols[i]:
                cols[i] = const.TRAJECTORY_ID
        # Change all the column names to the modified values.
        data = data.set_axis(cols, axis=1)
        return data

    def _validate_data_types(self, data: DataFrame):
        """
        Check whether all the data given by the user is of valid type and if it isn't,
        convert the columns to the specified data types:
            1. Trajectory_ID: Any -> str
            2. LAT: Any -> float64
            3. LONG: Any -> float64
            4. DateTime: Any -> datetime64[ns]

        Parameters
        ----------
        data: pd.DataFrame
            The dataframe that contains the data passed in by the user.

        Raises
        ------
        KeyError
            Dataframe has one of the mandatory columns missing.
        ValueError
            One of the data-types cannot be converted.
        DateTimeFormatInvalid
            The DateTime format provided is invalid and cannot be parsed as pandas DateTime.
        """
        try:
            if data.dtypes[const.LAT] != 'float64':
                data[const.LAT] = data[const.LAT].astype('float64')
            if data.dtypes[const.LONG] != 'float64':
                data[const.LONG] = data[const.LONG].astype('float64')
            if data.dtypes[const.DateTime] != 'datetime64[ns]':
                data[const.DateTime] = data[const.DateTime].astype('datetime64[ns]')
            # No pandas dtype ever compares equal to the string 'str', so the
            # old guard was always true; convert unconditionally instead.
            data[const.TRAJECTORY_ID] = data[const.TRAJECTORY_ID].astype('str')
        except KeyError:
            raise KeyError('dataframe missing one of lat, lon, datetime columns.')
        except ValueError:
            raise ValueError('dtypes cannot be converted.')
        except Exception:
            # Narrowed from a bare `except:` which would also swallow
            # KeyboardInterrupt and SystemExit.
            raise DateTimeFormatInvalid("The DateTime format provided is invalid and cannot be parsed as pandas DateTime.")

    def _validate_columns(self, data: DataFrame) -> bool:
        """
        Check whether all the mandatory columns are present in the DataFrame or not.

        Parameters
        ----------
        data: DataFrame
            The DataFrame containing all the data passed in by the user.

        Returns
        -------
        bool
            True when all the mandatory columns are present in the data.

        Raises
        ------
        MissingColumnsException
            One or more of the mandatory columns (Latitude, Longitude, DateTime, Traj_ID)
            are missing in the data.
        """
        # np.isin never raises a KeyError, so the old try/except never fired
        # and a missing column silently returned None. Check the result
        # explicitly and raise the documented exception instead.
        if np.isin(const.LAT, data.columns) and \
                np.isin(const.LONG, data.columns) and \
                np.isin(const.TRAJECTORY_ID, data.columns) and \
                np.isin(const.DateTime, data.columns):
            return True
        raise MissingColumnsException("One of the columns are missing. Please check your data and try again.")

    def _get_default_column_names(self, DateTime, traj_id, latitude, longitude) -> dict:
        """
        Get a dictionary containing the key, value pairs of the library default
        column names for the following columns:
            1. Latitude
            2. Longitude
            3. DateTime
            4. Trajectory ID

        Parameters
        ----------
        DateTime: Text
            The datetime header of the column already given by the user.
        traj_id: Text
            The traj_id header of the column already given by the user.
        latitude: Text
            The latitude header of the column already given by the user.
        longitude: Text
            The longitude header of the column already given by the user.

        Returns
        -------
        dict
            A dictionary of mappings from the given header by the user to the
            library default headers.
        """
        return {
            DateTime: const.DateTime,
            traj_id: const.TRAJECTORY_ID,
            latitude: const.LAT,
            longitude: const.LONG
        }

    def set_default_index(self):
        """
        Set the Index of the dataframe back to traj_id and DateTime.

        Raises
        ------
        MissingColumnsException
            DateTime/traj_id column is missing from the dataset.
        """
        try:
            # Use the same (traj_id, DateTime) level order as the constructor;
            # the previous (DateTime, traj_id) order contradicted both the
            # constructor and this method's docstring.
            self.set_index([const.TRAJECTORY_ID, const.DateTime], inplace=True)
        except KeyError:
            raise MissingColumnsException(f"Either of {const.DateTime} or {const.TRAJECTORY_ID} columns are missing.")

    # ------------------------------- Properties ---------------------------------- #
    @property
    def latitude(self):
        """
        Accessor method for the latitude column of the PTRAILDataFrame DataFrame.

        Returns
        -------
        pandas.core.series.Series
            The Series containing all the latitude values from the DataFrame.

        Raises
        ------
        MissingColumnsException
            Latitude column is missing from the data.
        """
        try:
            return self[const.LAT]
        except (KeyError, IndexError):
            # `except KeyError or IndexError` only caught KeyError; a tuple
            # is required to catch both exception types.
            raise MissingColumnsException("The Latitude column is not present in the DataFrame, please verify again.")

    @property
    def longitude(self):
        """
        Accessor method for the longitude column of the PTRAILDataFrame DataFrame.

        Returns
        -------
        pandas.core.series.Series
            The Series containing all the longitude values from the DataFrame.

        Raises
        ------
        MissingColumnsException
            Longitude column is missing from the data.
        """
        try:
            return self[const.LONG]
        except (KeyError, IndexError):
            raise MissingColumnsException("The Longitude column is not present in the DataFrame, please verify again.")

    @property
    def datetime(self):
        """
        Accessor method for the DateTime column of the PTRAILDataFrame DataFrame.

        Returns
        -------
        pandas.core.series.Series
            The Series containing all the DateTime values from the DataFrame.

        Raises
        ------
        MissingColumnsException
            DateTime column is missing from the data.
        """
        try:
            # DateTime lives in the MultiIndex (see __init__), not the columns.
            return self.index.get_level_values(const.DateTime).to_series()
        except (KeyError, IndexError):
            raise MissingColumnsException("The DateTime column is not present in the DataFrame, please verify again.")

    @property
    def traj_id(self):
        """
        Accessor method for the Trajectory_ID column of the DaskTrajectoryDF.

        Returns
        -------
        pandas.core.series.Series
            The Series containing all the Trajectory_ID values from the DataFrame.

        Raises
        ------
        MissingColumnsException
            traj_id column is missing from the data.
        """
        try:
            # Trajectory ID lives in the MultiIndex (see __init__).
            return self.index.get_level_values(const.TRAJECTORY_ID).to_series()
        except (KeyError, IndexError):
            raise MissingColumnsException("The Trajectory_ID column is not present in the DataFrame, please verify "
                                          "again.")

    def __str__(self):
        return f"------------------------ Dataset Facts ------------------------------\n\n" \
               f"Number of unique Trajectories in the data: {self.traj_id.nunique()}\n" \
               f"Number of points in the data: {len(self)}\n" \
               f"Dataset time range: {self.datetime.max() - self.datetime.min()}\n" \
               f"Datatype of the DataFrame: {type(self)}\n" \
               f"Dataset Bounding Box:" \
               f" {(self.latitude.min(), self.longitude.min(), self.latitude.max(), self.longitude.max())}\n\n" \
               f"---------------------------------------------------------------------"

    # ------------------------------- File and DF Operations ----------------------------- #
    def to_numpy(self, dtype=None, copy: bool = False, na_value=lib.no_default) -> np.ndarray:
        """
        Convert the DataFrame to a NumPy array. By default, the dtype of the returned array
        will be the common dtype of all types in the DataFrame. For example, if the dtypes
        are float16 and float32, the results dtype will be float32. This may require copying
        data and coercing values, which may be expensive.

        Parameters
        ----------
        dtype:
            The dtype to pass to :meth:`numpy.asarray`.
        copy:
            Whether to ensure that the returned value is not a view on another array.
            Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy.
            Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary.
        na_value:
            The value to use for missing values. The default value depends on `dtype` and the
            dtypes of the DataFrame columns.
        """
        # Forward the parameters to pandas; the previous implementation
        # accepted but silently ignored all three of them.
        return self.reset_index(drop=False).to_numpy(dtype=dtype, copy=copy, na_value=na_value)

    def sort_by_traj_id_and_datetime(self, ascending=True):
        """
        Sort the trajectory in Ascending or descending order based on the following 2
        columns in order:
            1. Trajectory ID
            2. DateTime

        Parameters
        ----------
        ascending: bool
            Whether to sort the values in ascending order or descending order.

        Returns
        -------
        PTRAILDataFrame
            The sorted dataframe.
        """
        # sort_values accepts MultiIndex level names as `by` keys.
        return self.sort_values([const.TRAJECTORY_ID, const.DateTime], ascending=ascending)