Source code for ptrail.core.TrajectoryDF

"""
    The TrajectoryDF module is the main module containing the PTRAILDataFrame Dataframe
    for storing the Trajectory Data with PTRAIL Library. The Dataframe has
    certain restrictions on what type of data is mandatory in order to be stored as a
    PTRAILDataFrame which is mentioned in the documentation of the constructor.

    | Authors: Yaksh J Haranwala, Salman Haidri
"""

from typing import Dict, List, Union, Optional, Text

import numpy as np
import pandas as pd
import pandas.core.dtypes.common

from pandas import DataFrame

from pandas._libs import lib

from ptrail.utilities import constants as const
from ptrail.utilities.exceptions import *


[docs]class PTRAILDataFrame(DataFrame):
[docs] def __init__(self, data_set: Union[DataFrame, List, Dict], latitude: Text, longitude: Text, datetime: Text, traj_id: Text, rest_of_columns: Optional[List[Text]] = None): """ Construct a trajectory dataframe to store and represent the Trajectory Data. Note ---- | The mandatory columns in the dataset are: | 1. DateTime | 2. Trajectory ID | 3. Latitude | 4. Longitude | ``rest_of_columns`` makes sure that if the data_set is a list, it has appropriate headers that the user wants instead of the default numerical values. Parameters ---------- data_set: List, Dictionary or pandas DF. The data provided by the user that needs to be represented and stored. datetime: str The header of the datetime column. traj_id: str The header of the Trajectory ID column. latitude: str The header of the latitude column. longitude: str The header of the longitude column. rest_of_columns: Optional[list[Text]] A list containing headers of the columns other than the mandatory ones. """ # Case-1: The data is from a dictionary. # Here, first check whether the data is in dictionary form and if it is so, then convert into # pandas DataFrame first. rest_of_columns = [] if rest_of_columns is None else rest_of_columns column_list = [latitude, longitude, datetime, traj_id] + rest_of_columns if isinstance(data_set, dict): data_set = DataFrame.from_dict(data_set) data_set = data_set.rename(columns=dict(zip(data_set.columns, [const.LAT, const.LONG, const.DateTime, const.TRAJECTORY_ID]))) # Case-2: The data is from a list. # Here, first check whether the data is in list form and if it is so, then convert into # pandas DataFrame first. elif isinstance(data_set, list) or isinstance(data_set, np.ndarray): data_set = DataFrame(data_set, columns=column_list) # Case-3: The data is from a pandas DF. # Here, all we have to do is to rename the column names from the data to default names. elif isinstance(data_set, DataFrame): data_set = self._rename_df_col_headers(data_set, latitude, longitude, datetime, traj_id) # Now, renaming the default column names to library default column names. column_names = self._get_default_column_names(DateTime=datetime, traj_id=traj_id, latitude=latitude, longitude=longitude) data_set.rename(columns=column_names, inplace=True) # Now checking whether all the columns are present in the data and then verifying the data types # of all the columns abd then calling the super() to create and return the dataframe. if self._validate_columns(data_set): self._validate_data_types(data_set) data_set.set_index([const.TRAJECTORY_ID, const.DateTime], inplace=True) data_set.sort_values([const.TRAJECTORY_ID, const.DateTime], inplace=True) super(PTRAILDataFrame, self).__init__(data_set)
# ------------------------------ General (Private) Utilities ----------------------------- # def _rename_df_col_headers(self, data: DataFrame, lat: Text, lon: Text, datetime: Text, traj_id: Text): """ Change the column headers of the columns when the user given data is in the form of a pandas DF while creating the PTRAILDataFrame. This method is mainly used when the user reads in data from a csv because the CSV file might contain different names for the columns. Parameters ---------- data: DataFrame The dataframe whose column names are to be changed. lat: Text The header of the Latitude column. lon: Text The header of the Longitude column. datetime: Text The header of the DateTime column. traj_id: Text The header of the Trajectory ID column. Returns ------- pandas.DataFrame The pandas dataframe containing the library default column headers. """ cols = data.columns.to_list() # List of all column names # Iterate over the list of column names and check for keywords in the header # to help renaming with appropriate terms. for i in range(len(cols)): if lat in cols[i]: cols[i] = const.LAT if lon in cols[i]: cols[i] = const.LONG if datetime in cols[i]: cols[i] = const.DateTime if traj_id in cols[i]: cols[i] = const.TRAJECTORY_ID data = data.set_axis(cols, axis=1) # Change all the column names to modified values. return data def _validate_data_types(self, data: DataFrame): """ Check whether all the data given by the user is of valid type and if it isn't, it converts them to the specified data types. 1. Trajectory_ID: Any -> str 2. LAT: Any -> float64 3. LONG: Any -> float64 4. DateTime: Any -> dask.datetime64[ns] Parameters ---------- data: pd.DataFrame This is the dataframe that contains the data that was passed in by the user. The data is converted to pandas DF eventually in the import_data function anyway. Raises ------ KeyError: Dataframe has one of the mandatory columns missing. ValueError: One of the data-types cannot be converted. DateTimeFormatInvalid: The DateTime format provided is invalid and cannot be parsed as pandas DateTime. """ try: if data.dtypes[const.LAT] != 'float64': data[const.LAT] = data[const.LAT].astype('float64') if data.dtypes[const.LONG] != 'float64': data[const.LONG] = data[const.LONG].astype('float64') if data.dtypes[const.DateTime] != 'datetime64[ns]': data[const.DateTime] = data[const.DateTime].astype('datetime64[ns]') if data.dtypes[const.TRAJECTORY_ID] != 'str': data[const.TRAJECTORY_ID] = data[const.TRAJECTORY_ID].astype('str') except KeyError: raise KeyError('dataframe missing one of lat, lon, datetime columns.') except ValueError: raise ValueError('dtypes cannot be converted.') except: raise DateTimeFormatInvalid("The DateTime format provided is invalid and cannot be parsed as pandas DateTime.") def _validate_columns(self, data: DataFrame) -> bool: """ Check whether all the mandatory columns are present in the DataFrame or not. Parameters ---------- data The DataFrame containing all the data passed in by the user. Returns ------- bool Indicate whether or not all the columns that are mandatory are present in the Data given by the user. Raises ------ MissingColumnsException One or more of the mandatory columns (Latitude, Longitude, DateTime, Traj_ID) are missing in the data. """ try: if np.isin(const.LAT, data.columns) and \ np.isin(const.LONG, data.columns) and \ np.isin(const.TRAJECTORY_ID, data.columns) and \ np.isin(const.DateTime, data.columns): return True except KeyError: raise MissingColumnsException("One of the columns are missing. Please check your data and try again.") def _get_default_column_names(self, DateTime, traj_id, latitude, longitude) -> dict: """ Get a dictionary containing the key, value pairs of the library default column names for the following columns: 1. Latitude 2. Longitude 3. DateTime 4. Trajectory ID Parameters ---------- DateTime: Text The datetime header of the column already given by the user. traj_id: Text The traj_id header of the column already given by the user. latitude: The latitude header of the column already given by the user. longitude: The longitude header of the column already given by the user. Returns ------- dict A dictionary of mappings from the given header by the user to the library default headers. """ return { DateTime: const.DateTime, traj_id: const.TRAJECTORY_ID, latitude: const.LAT, longitude: const.LONG }
[docs] def set_default_index(self): """ Set the Index of the dataframe back to traj_id and DateTime. Raises ------ MissingColumnsException DateTime/traj_id column is missing from the dataset. """ try: self.set_index([const.DateTime, const.TRAJECTORY_ID], inplace=True) except KeyError: raise MissingColumnsException(f"Either of {const.DateTime} or {const.TRAJECTORY_ID} columns are missing.")
# ------------------------------- Properties ---------------------------------- # @property def latitude(self): """ Accessor method for the latitude column of the PTRAILDataFrame DataFrame. Returns ------- pandas.core.series.Series The Series containing all the latitude values from the DataFrame. Raises ------ MissingColumnsException Latitude column is missing from the data. """ try: return self[const.LAT] except KeyError or IndexError: raise MissingColumnsException("The Latitude column is not present in the DataFrame, please verify again.") @property def longitude(self): """ Accessor method for the longitude column of the PTRAILDataFrame DataFrame. Returns ------- pandas.core.series.Series The Series containing all the longitude values from the DataFrame. Raises ------ MissingColumnsException Longitude column is missing from the data """ try: return self[const.LONG] except KeyError or IndexError: raise MissingColumnsException("The Longitude column is not present in the DataFrame, please verify again.") @property def datetime(self): """ Accessor method for the DateTime column of the PTRAILDataFrame DataFrame. Returns ------- pandas.core.series.Series The Series containing all the DateTime values from the DataFrame. Raises ------ MissingColumnsException DateTime column is missing from the data. """ try: return self.index.get_level_values(const.DateTime).to_series() except KeyError or IndexError: raise MissingColumnsException("The DateTime column is not present in the DataFrame, please verify again.") @property def traj_id(self): """ Accessor method for the Trajectory_ID column of the DaskTrajectoryDF. Returns ------- pandas.core.series.Series The Series containing all the Trajectory_ID values from the DataFrame. Raises ------ MissingColumnsException traj_id column is missing from the data. """ try: return self.index.get_level_values(const.TRAJECTORY_ID).to_series() except KeyError or IndexError: raise MissingColumnsException("The Trajectory_ID column is not present in the DataFrame, please verify " "again.") def __str__(self): return f"------------------------ Dataset Facts ------------------------------\n\n" \ f"Number of unique Trajectories in the data: {self.traj_id.nunique()}\n" \ f"Number of points in the data: {len(self)}\n" \ f"Dataset time range: {self.datetime.max() - self.datetime.min()}\n" \ f"Datatype of the DataFrame: {type(self)}\n" \ f"Dataset Bounding Box:" \ f" {(self.latitude.min(), self.longitude.min(), self.latitude.max(), self.longitude.max())}\n\n" \ f"---------------------------------------------------------------------" # ------------------------------- File and DF Operations ----------------------------- #
[docs] def to_numpy(self, dtype=None, copy: bool = False, na_value=lib.no_default) -> np.ndarray: """ Convert the DataFrame to a NumPy array.By default, the dtype of the returned array will be the common dtype of all types in the DataFrame. For example, if the dtypes are float16 and float32, the results dtype will be float32. This may require copying data and coercing values, which may be expensive Parameters ---------- dtype: The dtype to pass to :meth:`numpy.asarray`. copy: Whether to ensure that the returned value is not a view on another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. na_value: The value to use for missing values. The default value depends on `dtype` and the dtypes of the DataFrame columns. """ return self.reset_index(drop=False).to_numpy()
[docs] def sort_by_traj_id_and_datetime(self, ascending=True): """ Sort the trajectory in Ascending or descending order based on the following 2 columns in order: 1. Trajectory ID 2. DateTime Parameters ---------- ascending: bool Whether to sort the values in ascending order or descending order. Returns ------- PTRAILDataFrame The sorted dataframe. """ return self.sort_values([const.TRAJECTORY_ID, const.DateTime], ascending=ascending)