Source code for routee.transit.depot_deadhead

import geopandas as gpd
import pandas as pd
from geopy.distance import geodesic
from gtfsblocks import Feed
from shapely.geometry import Point

from routee.transit.ntd import NTDAgencyMatch, load_ntd_facilities, match_agency_to_ntd

# Re-export for backward compatibility
__all__ = [
    "load_ntd_facilities",
    "match_agency_to_ntd",
    "NTDAgencyMatch",
    "create_depot_deadhead_trips",
    "infer_depot_trip_endpoints",
    "create_depot_deadhead_stops",
]



[docs]
def create_depot_deadhead_trips(
    trips_df: pd.DataFrame, stop_times_df: pd.DataFrame
) -> pd.DataFrame:
    """Create deadhead trips from and to depots for each block.

    This function essentially creates rows for the trips.txt DataFrame.
    It does not generate shape traces for them (that is handled by other
    functions in this module).

    Parameters
    ----------
    trips_df : pd.DataFrame
        trips_df of selected date route (e.g. result from read_in_gtfs).
    stop_times_df: pd.DataFrame
        stop_times df in feed resulted from read_in_gtfs.

    Returns
    -------
    pd.DataFrame: DataFrame with created deadhead trips.
    """

    block_ids = trips_df["block_id"].dropna().unique().tolist()

    # Get earliest start time for each trip and merge then in to trips DF
    trip_start_times = (
        stop_times_df.groupby("trip_id")["arrival_time"].min().reset_index()
    )
    trips_with_times = trips_df.merge(trip_start_times, on="trip_id", how="left")

    # For each block id, create two deadhead trips: one from depot to first stop,
    # and one from last stop to depot.
    depot_trips = list()

    for block_id in block_ids:
        block_trips = trips_with_times[trips_with_times["block_id"] == block_id]
        # Exclude any between-trip deadhead trips that may have been added
        if "from_trip" in block_trips.columns:
            block_trips = block_trips.loc[block_trips["from_trip"].isna()]
        # Ensure trips have been sorted in chronological order
        block_trips = block_trips.sort_values(by="arrival_time")
        first_trip = block_trips.iloc[0]
        last_trip = block_trips.iloc[-1]
        # Create trip from depot to first stop
        from_depot_trip_id = f"depot_to_{first_trip['trip_id']}"
        from_depot_route = f"from_depot_{block_id}"
        from_depot_trip = {
            "trip_id": from_depot_trip_id,
            "trip_type": "pull-out",
            "route_id": from_depot_route,
            "service_id": first_trip["service_id"],
            "block_id": block_id,
            "shape_id": from_depot_route,
            "route_short_name": from_depot_route,
            "route_type": 3,  # 3 means bus
            "route_desc": f"Deadhead from depot to {first_trip['trip_id']}",
            "agency_id": first_trip.get("agency_id", None),
        }
        depot_trips.append(from_depot_trip)
        # Create trip from last stop to depot
        to_depot_trip_id = f"{last_trip['trip_id']}_to_depot"
        to_depot_route = f"to_depot_{block_id}"
        to_depot_trip = {
            "trip_id": to_depot_trip_id,
            "trip_type": "pull-in",
            "route_id": to_depot_route,
            "service_id": last_trip["service_id"],
            "block_id": block_id,
            "shape_id": to_depot_route,
            "route_short_name": to_depot_route,
            "route_type": 3,  # 3 means bus
            "route_desc": f"Deadhead from {last_trip['trip_id']} to depot",
            "agency_id": last_trip.get("agency_id", None),
        }
        depot_trips.append(to_depot_trip)

    deadhead_trips_df = pd.DataFrame(depot_trips)
    return deadhead_trips_df




[docs]
def infer_depot_trip_endpoints(
    trips_df: pd.DataFrame,
    feed: Feed,
    depots_gdf: gpd.GeoDataFrame,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """Add origin/destination depot geometry for each block.

    Parameters
    ----------
    trips_df: pd.DataFrame
        trips_df of selected date and route (result from read_in_gtfs).
    feed : Feed
        GTFS feed object (e.g. result from read_in_gtfs).
    depots_gdf : gpd.GeoDataFrame
        Point GeoDataFrame of candidate depot locations in EPSG:4326.  Typically
        the result of :func:`load_ntd_facilities`.  If a ``depot_priority``
        column is present (0 = best), candidate depots are first restricted to
        the highest-priority type available before distance minimisation; if no
        higher-priority depot is reachable the next tier is tried.

    Returns
    -------
    tuple[GeoDataFrame, GeoDataFrame, GeoDataFrame]
        (first_stops_gdf, last_stops_gdf, depots_gdf). The first two contain
        stop geometry and matched depot geometry.  ``depots_gdf`` is the full
        depot GeoDataFrame (EPSG:4326) so callers can look up metadata by row
        index.
    """

    # Process trips and stops dataframes in feed to get first and last stops of each block id
    trips_df = trips_df.copy()
    stop_times_df = feed.stop_times
    stops_df = feed.stops
    blocks_trips_stops = stop_times_df.merge(
        trips_df[["trip_id", "block_id"]], on="trip_id", how="right"
    )
    blocks_trips_stops = blocks_trips_stops.merge(stops_df, on="stop_id", how="left")

    blocks_trips_stops = blocks_trips_stops.sort_values(by=["block_id", "arrival_time"])
    first_stops = blocks_trips_stops.groupby("block_id").first().reset_index()
    last_stops = blocks_trips_stops.groupby("block_id").last().reset_index()

    first_stops = first_stops[
        ["block_id", "stop_id", "arrival_time", "stop_lat", "stop_lon"]
    ]
    last_stops = last_stops[
        ["block_id", "stop_id", "arrival_time", "stop_lat", "stop_lon"]
    ]

    first_stops["geometry"] = first_stops.apply(
        lambda row: Point(row["stop_lon"], row["stop_lat"]), axis=1
    )
    last_stops["geometry"] = last_stops.apply(
        lambda row: Point(row["stop_lon"], row["stop_lat"]), axis=1
    )
    first_stops_gdf = gpd.GeoDataFrame(
        first_stops, geometry="geometry", crs="EPSG:4326"
    )
    last_stops_gdf = gpd.GeoDataFrame(last_stops, geometry="geometry", crs="EPSG:4326")

    # Ensure depot geometries are in WGS84
    if depots_gdf.crs is None:
        depots_gdf = depots_gdf.set_crs(epsg=4326)
    else:
        depots_gdf = depots_gdf.to_crs(epsg=4326)

    has_priority = "depot_priority" in depots_gdf.columns
    priority_levels: list[int] = (
        sorted(depots_gdf["depot_priority"].dropna().unique().tolist())
        if has_priority
        else []
    )

    # Create a simple mapping from depot index to geometry for fast lookup
    depots_geom_map = depots_gdf["geometry"].to_dict()

    # Project to Web Mercator (EPSG:3857) for distance computations
    proj_crs = "EPSG:3857"
    first_proj = first_stops_gdf.to_crs(proj_crs).reset_index(drop=True)
    last_proj = last_stops_gdf.to_crs(proj_crs).reset_index(drop=True)
    depots_proj = depots_gdf.to_crs(proj_crs).copy()

    best_depot_idx: dict[object, int] = {}
    for block_id, first_row in first_proj.groupby("block_id"):
        first_geom = first_row.iloc[0].geometry
        last_geom = last_proj.loc[last_proj["block_id"] == block_id, "geometry"].values[
            0
        ]

        # Compute pull-out + pull-in distance for every depot candidate
        working = depots_proj.copy()
        working["pullout"] = working.geometry.distance(first_geom)
        working["pullin"] = working.geometry.distance(last_geom)
        working["total"] = working["pullout"] + working["pullin"]

        if has_priority:
            # Pick nearest depot from the highest-priority tier that is
            # non-empty; fall through to subsequent tiers if needed.
            best_idx: int = working["total"].idxmin()
            for level in priority_levels:
                tier = working[working["depot_priority"] == level]
                if not tier.empty:
                    best_idx = int(tier["total"].idxmin())
                    break
        else:
            best_idx = int(working["total"].idxmin())

        best_depot_idx[block_id] = best_idx

    first_stops_gdf["nearest_depot_idx"] = first_stops_gdf["block_id"].map(
        best_depot_idx
    )
    last_stops_gdf["nearest_depot_idx"] = last_stops_gdf["block_id"].map(best_depot_idx)

    first_stops_gdf["geometry_origin"] = first_stops_gdf["nearest_depot_idx"].map(
        depots_geom_map
    )
    first_stops_gdf["geometry_destination"] = first_stops_gdf.geometry
    last_stops_gdf["geometry_destination"] = last_stops_gdf["nearest_depot_idx"].map(
        depots_geom_map
    )
    last_stops_gdf["geometry_origin"] = last_stops_gdf.geometry

    # Attach NTD metadata (NTD ID, agency name, facility name/type) to both
    # stop GDFs so downstream callers and outputs can identify which depot was
    # matched without having to rejoin on nearest_depot_idx themselves.
    _ntd_meta_cols: dict[str, str] = {
        "NTD ID": "depot_ntd_id",
        "Agency Name": "depot_agency_name",
        "Facility Name": "depot_facility_name",
        "Facility Type": "depot_facility_type",
    }
    for src_col, dst_col in _ntd_meta_cols.items():
        if src_col in depots_gdf.columns:
            col_map = depots_gdf[src_col].to_dict()
            first_stops_gdf[dst_col] = first_stops_gdf["nearest_depot_idx"].map(col_map)
            last_stops_gdf[dst_col] = last_stops_gdf["nearest_depot_idx"].map(col_map)

    # Set the arrival time as departure time for deadhead trip to depot for the last_stop_gdf
    last_stops_gdf["departure_time"] = last_stops_gdf["arrival_time"]
    # Drop the arrival_time column for the last_stop_gdf
    last_stops_gdf = last_stops_gdf.drop(columns=["arrival_time"])

    # Keep only relevant columns and set stop_geometry as the active geometry
    first_stops_gdf = first_stops_gdf.drop(columns=["geometry"])
    first_stops_gdf = gpd.GeoDataFrame(
        first_stops_gdf, geometry="geometry_destination", crs="EPSG:4326"
    )

    last_stops_gdf = last_stops_gdf.drop(columns=["geometry"])
    last_stops_gdf = gpd.GeoDataFrame(
        last_stops_gdf, geometry="geometry_origin", crs="EPSG:4326"
    )

    return first_stops_gdf, last_stops_gdf, depots_gdf




[docs]
def create_depot_deadhead_stops(
    first_stops_gdf: gpd.GeoDataFrame,
    last_stops_gdf: gpd.GeoDataFrame,
    deadhead_trips: pd.DataFrame,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Create stop_times and stops for deadhead trips from and to depots.

    Parameters
    ----------
    first_stops_gdf: gpd.GeoDataFrame
        GeoDataFrame of first stops for each block, with ``geometry_origin``
        (depot) and ``geometry_destination`` (first stop) columns.
        Result from :func:`infer_depot_trip_endpoints`.
    last_stops_gdf: gpd.GeoDataFrame
        GeoDataFrame of last stops for each block, with ``geometry_origin``
        (last stop) and ``geometry_destination`` (depot) columns.
        Result from :func:`infer_depot_trip_endpoints`.
    deadhead_trips: pd.DataFrame
        Deadhead trip records from :func:`create_depot_deadhead_trips`.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        A ``(stop_times_df, stops_df)`` tuple for the depot deadhead trips.
    """

    from_depot = first_stops_gdf.copy()
    to_depot = last_stops_gdf.copy()

    # Calculate distance from depot to first stop
    from_depot["distance_m"] = from_depot.apply(
        lambda row: (
            geodesic(
                (row.geometry_origin.y, row.geometry_origin.x),
                (row.geometry_destination.y, row.geometry_destination.x),
            ).meters
        ),
        axis=1,
    )
    # Calculate distance from last stop to depot
    to_depot["distance_m"] = to_depot.apply(
        lambda row: (
            geodesic(
                (row.geometry_origin.y, row.geometry_origin.x),
                (row.geometry_destination.y, row.geometry_destination.x),
            ).meters
        ),
        axis=1,
    )
    # Assume average speed of 30 km/h (to be consistant with the number adopted in gtfs_feature_processing.py)
    # to estimate travel time
    from_depot["travel_time_sec"] = (from_depot["distance_m"] / 30000) * 3600
    to_depot["travel_time_sec"] = (to_depot["distance_m"] / 30000) * 3600
    # Calculate departure time from depot for deadhead trip to first stop
    from_depot["departure_time"] = from_depot["arrival_time"] - pd.to_timedelta(
        from_depot["travel_time_sec"], unit="s"
    )
    # Calculate arrival time at depot for deadhead trip from last stop
    to_depot["arrival_time"] = to_depot["departure_time"] + pd.to_timedelta(
        to_depot["travel_time_sec"], unit="s"
    )

    # Create stop_times df for deadhead trips
    deadhead_trips_df = deadhead_trips.copy()
    deadhead_trips_df_from_depot = deadhead_trips_df[
        deadhead_trips_df.trip_type == "pull-out"
    ].copy()
    deadhead_trips_df_from_depot = deadhead_trips_df_from_depot.merge(
        from_depot[
            [
                "block_id",
                "stop_id",
                "nearest_depot_idx",
                "departure_time",
                "arrival_time",
            ]
        ],
        on="block_id",
    )

    deadhead_trips_df_to_depot = deadhead_trips_df[
        deadhead_trips_df.trip_type == "pull-in"
    ].copy()
    deadhead_trips_df_to_depot = deadhead_trips_df_to_depot.merge(
        to_depot[
            [
                "block_id",
                "stop_id",
                "nearest_depot_idx",
                "departure_time",
                "arrival_time",
            ]
        ],
        on="block_id",
    )
    deadhead_trips_df = pd.concat(
        [deadhead_trips_df_from_depot, deadhead_trips_df_to_depot], ignore_index=True
    )
    stop_times_df = pd.DataFrame(
        columns=[
            "trip_id",
            "stop_sequence",
            "arrival_time",
            "stop_id",
            "departure_time",
            "shape_dist_traveled",
        ]
    )
    stop_times_df["trip_id"] = deadhead_trips_df["trip_id"].repeat(2).values

    stop_times_df["stop_sequence"] = [1, 2] * len(deadhead_trips_df)
    stop_times_df["arrival_time"] = [
        x
        for pair in zip(
            deadhead_trips_df["departure_time"].to_list(),
            deadhead_trips_df["arrival_time"].to_list(),
        )
        for x in pair
    ]
    # For pull-out trips: stop_sequence 1 = depot stop (new), stop_sequence 2 = first
    # revenue stop (existing GTFS stop).  For pull-in trips the order is reversed.
    # Depot stops are keyed as "depot_{nearest_depot_idx}" where nearest_depot_idx is
    # the row index in the FTA shapefile.  This means all blocks that share the same
    # physical depot get the same stop_id.
    from_depot_stop_ids = [
        x
        for pair in zip(
            (
                "depot_" + deadhead_trips_df_from_depot["nearest_depot_idx"].astype(str)
            ).tolist(),
            deadhead_trips_df_from_depot["stop_id"].tolist(),
        )
        for x in pair
    ]
    to_depot_stop_ids = [
        x
        for pair in zip(
            deadhead_trips_df_to_depot["stop_id"].tolist(),
            (
                "depot_" + deadhead_trips_df_to_depot["nearest_depot_idx"].astype(str)
            ).tolist(),
        )
        for x in pair
    ]
    stop_times_df["stop_id"] = from_depot_stop_ids + to_depot_stop_ids
    stop_times_df["departure_time"] = stop_times_df["arrival_time"]
    stop_times_df["shape_dist_traveled"] = 0.0

    # Create stops df — one row per unique physical depot (keyed by nearest_depot_idx).
    # Revenue stop endpoints are already in the GTFS feed and must not be duplicated.
    # Use depot_facility_name as stop_name when available so stops_supplement.txt
    # carries a human-readable depot identifier.
    from_depot_stop_name = (
        from_depot["depot_facility_name"]
        if "depot_facility_name" in from_depot.columns
        else pd.Series([""] * len(from_depot), index=from_depot.index)
    )
    to_depot_stop_name = (
        to_depot["depot_facility_name"]
        if "depot_facility_name" in to_depot.columns
        else pd.Series([""] * len(to_depot), index=to_depot.index)
    )
    from_depot_stops = pd.DataFrame(
        {
            "stop_id": "depot_" + from_depot["nearest_depot_idx"].astype(str),
            "stop_name": from_depot_stop_name.values,
            "stop_lat": from_depot.geometry_origin.apply(lambda p: p.y).values,
            "stop_lon": from_depot.geometry_origin.apply(lambda p: p.x).values,
        }
    )
    to_depot_stops = pd.DataFrame(
        {
            "stop_id": "depot_" + to_depot["nearest_depot_idx"].astype(str),
            "stop_name": to_depot_stop_name.values,
            "stop_lat": to_depot.geometry_destination.apply(lambda p: p.y).values,
            "stop_lon": to_depot.geometry_destination.apply(lambda p: p.x).values,
        }
    )
    stops_df = (
        pd.concat([from_depot_stops, to_depot_stops])
        .drop_duplicates(subset="stop_id")
        .reset_index(drop=True)
    )

    return stop_times_df, stops_df