meubility-workbench/app/pipeline/gtfs.py

from __future__ import annotations

import csv
import io
import json
import sqlite3
import zipfile
from collections import defaultdict
from collections.abc import Callable
from pathlib import Path
from typing import Any, Iterator, Optional

from shapely.geometry import LineString
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session

from app.config import settings
from app.gtfs_storage import GTFS_STORAGE_MAIN, GTFS_STORAGE_METADATA_KEY, GTFS_STORAGE_SIDECAR_STOP_TIMES, effective_gtfs_timetable_storage
from app.models import (
    Dataset,
    GtfsAgency,
    GtfsCalendar,
    GtfsCalendarDate,
    GtfsRoute,
    GtfsShape,
    GtfsStop,
    GtfsStopTime,
    GtfsTrip,
    Source,
)
from app.osm_classification import infer_osm_route_scope
from app.performance import measure_pipeline_phase
from app.pipeline.download import materialize_source
from app.pipeline.utils import first_nonempty, geometry_json_and_bbox, norm_ref, norm_text, sha256_file
from app.spatial import analyze_postgresql_tables, refresh_postgis_geometries


# Mode label for each basic integer code. Presumably keyed by the GTFS
# routes.txt `route_type` value (the lookup code is not visible in this part
# of the file — confirm).
GTFS_MODE = {
    0: "tram",
    1: "subway",
    2: "train",
    3: "bus",
    4: "ferry",
    5: "cable_tram",
    6: "aerialway",
    7: "funicular",
    11: "trolleybus",
    12: "monorail",
}

# (low, high, mode) triples for extended integer code ranges.
# NOTE(review): the bounds look inclusive (e.g. 100..199) — confirm against
# the helper that consumes this table.
GTFS_EXTENDED_MODE_RANGES = [
    (100, 199, "train"),
    (400, 499, "subway"),
    (700, 799, "bus"),
    (900, 999, "tram"),
    (1000, 1099, "ferry"),
    (1100, 1199, "aerialway"),
    (1200, 1299, "funicular"),
    (1300, 1399, "aerialway"),
    (1400, 1499, "monorail"),
    (1500, 1599, "trolleybus"),
]

# Importer revision tag. run_gtfs_source only reuses an already imported
# dataset when its recorded importer version equals this value, and the
# staging summary stores it under the "importer" key.
GTFS_IMPORTER_VERSION = "gtfs_import_v6_sidecar_stop_times"

# Archive members whose absence _stage_gtfs_zip reports under
# "missing_required_files" in its summary (it does not itself raise for them).
REQUIRED_FILES = {"agency.txt", "stops.txt", "routes.txt", "trips.txt", "stop_times.txt"}
# Rows per batch when inserting staged trips and when copying staged tables
# into the main database.
GTFS_STAGE_BATCH_SIZE = 50_000
# Signature of the optional progress hook. Call sites pass, in order, an event
# name, a message, two optional integers and an optional detail dict —
# presumably (event, message, current, total, payload); confirm with
# _emit_progress.
ProgressCallback = Callable[[str, str, int | None, int | None, dict[str, Any] | None], None]


def run_gtfs_source(session: Session, source: Source, progress_callback: ProgressCallback | None = None) -> Dataset:
    """Materialize a GTFS source and import it unless an equivalent import exists.

    The source file is materialized locally and hashed. If the newest active,
    successfully imported GTFS dataset of this source has the same hash and was
    produced by the current importer version, that dataset is returned as-is;
    otherwise the zip is imported again via ``import_gtfs_zip``.

    Args:
        session: Database session used for the lookup and the import.
        source: Source whose GTFS archive should be imported.
        progress_callback: Optional progress hook forwarded to the importer.

    Returns:
        The reused or freshly imported dataset.
    """
    zip_path = materialize_source(source)
    digest = sha256_file(zip_path)

    reuse_query = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "gtfs",
            Dataset.sha256 == digest,
            Dataset.is_active.is_(True),
            Dataset.status == "imported",
        )
        .order_by(Dataset.id.desc())
    )
    candidate = session.scalar(reuse_query)

    # A hash match alone is not enough: datasets written by another importer
    # revision are re-imported.
    if candidate is None or _dataset_importer_version(candidate) != GTFS_IMPORTER_VERSION:
        return import_gtfs_zip(
            session=session,
            source=source,
            zip_path=zip_path,
            source_hash=digest,
            progress_callback=progress_callback,
        )
    return candidate


def import_gtfs_zip(
    session: Session,
    source: Source,
    zip_path: Path,
    source_hash: str | None = None,
    progress_callback: ProgressCallback | None = None,
) -> Dataset:
    """Stage a GTFS zip into SQLite and activate it as a new dataset.

    Args:
        session: Database session; this function commits the initial dataset
            row and, on failure, the "failed" status. The final success state
            is only flushed here, not committed.
        source: Source the archive belongs to.
        zip_path: Local path of the GTFS zip archive.
        source_hash: Precomputed hash of the archive; computed with
            ``sha256_file`` when omitted.
        progress_callback: Optional progress hook.

    Returns:
        The newly created dataset after activation.

    Raises:
        ValueError: If ``zip_path`` is not a zip file.
        BaseException: Anything raised during staging or activation is
            re-raised after the dataset has been marked as failed.
    """
    if not zipfile.is_zipfile(zip_path):
        raise ValueError(f"GTFS source is not a zip file: {zip_path}")

    # The dataset row is created inactive with status "staging" and committed
    # right away, before any staging work starts.
    dataset = Dataset(
        source_id=source.id,
        kind="gtfs",
        local_path=str(zip_path),
        sha256=source_hash or sha256_file(zip_path),
        is_active=False,
        status="staging",
    )
    session.add(dataset)
    session.flush()
    session.commit()

    stage_path = _gtfs_stage_path(source, dataset, zip_path)
    _emit_progress(progress_callback, "gtfs_staging_started", f"Staging GTFS zip {zip_path.name}.", 0, None, {"stage_path": str(stage_path)})
    try:
        with measure_pipeline_phase("gtfs_staging", source_id=source.id, dataset_id=dataset.id, metadata={"zip_path": str(zip_path), "stage_path": str(stage_path)}) as metric:
            stage_summary = _stage_gtfs_zip(zip_path, stage_path, progress_callback=progress_callback)
            metric.update(stage_summary)
        # May move the stage file to its sidecar location; the returned path is
        # the one activation reads from. The summary is updated in place.
        activation_path = _prepare_gtfs_activation_path(source, dataset, stage_path, stage_summary)
        _emit_progress(progress_callback, "gtfs_activation_started", "Activating staged GTFS dataset.", None, None, {"stage_path": str(activation_path)})
        with measure_pipeline_phase("gtfs_activation", source_id=source.id, dataset_id=dataset.id, metadata={"stage_path": str(activation_path)}) as metric:
            _activate_staged_gtfs(session, source, dataset, activation_path, stage_summary, progress_callback=progress_callback)
            metric.update(stage_summary)
    except BaseException:
        # BaseException (not Exception) so the dataset is also marked failed on
        # KeyboardInterrupt/SystemExit. Pending work is rolled back first, then
        # the failure state is committed on the re-fetched row.
        session.rollback()
        failed = session.get(Dataset, dataset.id)
        if failed is not None:
            failed.status = "failed"
            failed.is_active = False
            session.commit()
        raise

    source.status = "ok"
    source.last_error = None
    # Flush only; no commit is issued on the success path in this function.
    session.flush()
    _emit_progress(progress_callback, "gtfs_activation_completed", f"Activated GTFS dataset #{dataset.id}.", None, None, {"dataset_id": dataset.id})
    return dataset


def backfill_gtfs_shapes(session: Session, dataset_id: int | None = None) -> dict:
    """Import shapes.txt for GTFS datasets that have no stored shapes yet.

    Args:
        session: Database session used for queries and inserts (flushed per
            dataset, not committed here).
        dataset_id: Restrict the backfill to this dataset; when omitted, all
            active GTFS datasets are processed.

    Returns:
        ``{"datasets": [...]}`` with one status entry per visited dataset:
        "skipped" (shapes already present), "missing_zip", "no_shapes_txt" or
        "imported".
    """
    query = select(Dataset).where(Dataset.kind == "gtfs")
    if dataset_id is None:
        query = query.where(Dataset.is_active.is_(True))
    else:
        query = query.where(Dataset.id == dataset_id)

    outcomes = []
    for dataset in session.scalars(query.order_by(Dataset.id)).all():
        shape_count_query = select(func.count()).select_from(GtfsShape).where(GtfsShape.dataset_id == dataset.id)
        already_stored = session.scalar(shape_count_query) or 0
        if already_stored:
            outcomes.append({"dataset_id": dataset.id, "status": "skipped", "shapes": already_stored})
            continue

        archive_path = Path(dataset.local_path)
        if not (archive_path.exists() and zipfile.is_zipfile(archive_path)):
            outcomes.append({"dataset_id": dataset.id, "status": "missing_zip", "path": str(archive_path)})
            continue

        with zipfile.ZipFile(archive_path) as archive:
            # Map bare file names to archive member paths, ignoring directories.
            members = {Path(member).name: member for member in archive.namelist() if not member.endswith("/")}
            if "shapes.txt" not in members:
                outcomes.append({"dataset_id": dataset.id, "status": "no_shapes_txt", "shapes": 0})
                continue
            shapes_by_id = _read_shapes(archive, members)

        imported_count = _import_shapes(session, dataset.id, shapes_by_id)
        _record_importer_metadata(dataset, shapes_count=imported_count)
        session.flush()
        outcomes.append({"dataset_id": dataset.id, "status": "imported", "shapes": imported_count})

    return {"datasets": outcomes}


def _gtfs_stage_path(source: Source, dataset: Dataset, zip_path: Path) -> Path:
    """Return the staging SQLite path for a dataset under ``<data_dir>/staging``.

    The file name embeds the dataset id and the first 12 characters of the
    dataset hash; the hash is computed from ``zip_path`` when the dataset has
    none recorded.
    """
    digest = dataset.sha256 or sha256_file(zip_path)
    stage_dir = settings.data_dir / "staging" / f"source_{source.id}"
    return stage_dir / f"gtfs_dataset_{dataset.id}_{digest[:12]}.sqlite"


def _gtfs_sidecar_path(source: Source, dataset: Dataset) -> Path:
    """Return the sidecar SQLite path for a dataset under ``<data_dir>/sidecars``.

    The file name embeds the dataset id and the first 12 characters of the
    dataset hash, or of the literal ``"unknown"`` when no hash is recorded.
    """
    digest = dataset.sha256 or "unknown"
    sidecar_dir = settings.data_dir / "sidecars" / f"source_{source.id}"
    return sidecar_dir / f"gtfs_dataset_{dataset.id}_{digest[:12]}.sqlite"


def _gtfs_timetable_storage_mode() -> str:
    """Return the storage mode reported by ``effective_gtfs_timetable_storage``."""
    storage_mode = effective_gtfs_timetable_storage()
    return storage_mode


def _prepare_gtfs_activation_path(source: Source, dataset: Dataset, stage_path: Path, summary: dict[str, Any]) -> Path:
    """Decide where activation reads from and record the storage layout.

    In sidecar mode the stage file is moved to the dataset's sidecar path
    (replacing any existing file there) and ``summary`` is updated with the new
    "stage_path", a "staging" marker and the storage metadata. In any other
    mode only the storage metadata is recorded and the stage file stays put.

    Args:
        source: Source owning the dataset.
        dataset: Dataset being activated.
        stage_path: Path of the staged SQLite database.
        summary: Staging summary; mutated in place.

    Returns:
        The path activation should read from (sidecar path or ``stage_path``).
    """
    # Every table except gtfs_stop_times always lives in the main database.
    always_main = (
        "gtfs_agencies",
        "gtfs_stops",
        "gtfs_routes",
        "gtfs_trips",
        "gtfs_calendars",
        "gtfs_calendar_dates",
        "gtfs_shapes",
    )

    def table_layout(stop_times_location: str) -> dict[str, str]:
        return {"gtfs_stop_times": stop_times_location, **dict.fromkeys(always_main, "main")}

    if _gtfs_timetable_storage_mode() != GTFS_STORAGE_SIDECAR_STOP_TIMES:
        summary[GTFS_STORAGE_METADATA_KEY] = {
            "mode": GTFS_STORAGE_MAIN,
            "tables": table_layout("main"),
        }
        return stage_path

    sidecar_path = _gtfs_sidecar_path(source, dataset)
    sidecar_path.parent.mkdir(parents=True, exist_ok=True)
    if sidecar_path.exists():
        sidecar_path.unlink()
    stage_path.replace(sidecar_path)

    summary["stage_path"] = str(sidecar_path)
    summary["staging"] = "sqlite_promoted_to_sidecar"
    summary[GTFS_STORAGE_METADATA_KEY] = {
        "mode": GTFS_STORAGE_SIDECAR_STOP_TIMES,
        "sidecar_path": str(sidecar_path),
        "tables": table_layout("sidecar"),
    }
    return sidecar_path


def _stage_gtfs_zip(zip_path: Path, stage_path: Path, progress_callback: ProgressCallback | None = None) -> dict[str, Any]:
    """Load a GTFS zip into a fresh SQLite staging database.

    Any existing file at ``stage_path`` is removed first. Files are staged in a
    fixed order (agencies, calendars, calendar dates, stops, trips, shapes,
    stop times, routes) because route staging consumes the lookups built by the
    earlier steps. Indexes are created last, then the work is committed.

    Args:
        zip_path: GTFS archive to read.
        stage_path: Destination SQLite file.
        progress_callback: Optional progress hook.

    Returns:
        A summary dict with row counts, the importer version, the stage path
        and the list of required files missing from the archive.
    """
    if stage_path.exists():
        stage_path.unlink()
    stage_path.parent.mkdir(parents=True, exist_ok=True)

    stage_db = sqlite3.connect(stage_path)
    try:
        _configure_stage_connection(stage_db)
        _create_gtfs_stage_schema(stage_db)
        with zipfile.ZipFile(zip_path) as archive:
            # Bare file name -> archive member path; directory entries are skipped.
            members = {Path(member).name: member for member in archive.namelist() if not member.endswith("/")}
            missing_files = sorted(REQUIRED_FILES - set(members.keys()))

            agency_names = _stage_agencies(stage_db, archive, members, progress_callback)
            calendar_total = _stage_calendars(stage_db, archive, members, progress_callback)
            calendar_date_total = _stage_calendar_dates(stage_db, archive, members, progress_callback)
            stops_by_id, stop_total = _stage_stops(stage_db, archive, members, progress_callback)
            trips_by_route, first_shape_by_route, first_trip_by_route, trip_total = _stage_trips(
                stage_db, archive, members, progress_callback
            )
            shapes_by_id = _read_shapes_with_progress(archive, members, progress_callback)
            shape_total = _stage_shapes(stage_db, shapes_by_id, progress_callback)
            stopseq_by_trip, stop_times_seen, stop_times_imported = _stage_stop_times(
                stage_db,
                archive,
                members,
                first_trip_ids=set(first_trip_by_route.values()),
                progress_callback=progress_callback,
            )
            raw_routes = list(_read_gtfs_csv(archive, members, "routes.txt"))
            route_total = _stage_routes(
                connection=stage_db,
                routes_raw=raw_routes,
                agency_names=agency_names,
                stops_by_id=stops_by_id,
                trips_by_route=trips_by_route,
                first_shape_by_route=first_shape_by_route,
                first_trip_by_route=first_trip_by_route,
                shapes_by_id=shapes_by_id,
                stopseq_by_trip=stopseq_by_trip,
                progress_callback=progress_callback,
            )
            _create_gtfs_stage_indexes(stage_db, progress_callback)
        stage_db.commit()

        summary: dict[str, Any] = {}
        summary["importer"] = GTFS_IMPORTER_VERSION
        summary["stage_path"] = str(stage_path)
        summary["missing_required_files"] = missing_files
        summary["agencies"] = len(agency_names)
        summary["stops"] = stop_total
        summary["routes"] = route_total
        summary["trips"] = trip_total
        summary["calendars"] = calendar_total
        summary["calendar_dates"] = calendar_date_total
        summary["shapes"] = shape_total
        summary["stop_times_seen"] = stop_times_seen
        summary["stop_times_imported"] = stop_times_imported
        summary["stop_times_import_limit"] = settings.gtfs_stop_times_import_limit
        summary["staging"] = "sqlite"

        _emit_progress(progress_callback, "gtfs_staging_completed", "GTFS staging completed.", None, None, summary)
        return summary
    finally:
        stage_db.close()


def _configure_stage_connection(connection: sqlite3.Connection) -> None:
    connection.execute("PRAGMA journal_mode=OFF")
    connection.execute("PRAGMA synchronous=OFF")
    connection.execute("PRAGMA temp_store=MEMORY")
    connection.execute("PRAGMA locking_mode=EXCLUSIVE")


def _create_gtfs_stage_schema(connection: sqlite3.Connection) -> None:
    """Create the eight GTFS staging tables on ``connection``.

    Tables: gtfs_agencies, gtfs_stops, gtfs_routes, gtfs_trips, gtfs_calendars,
    gtfs_calendar_dates, gtfs_shapes and gtfs_stop_times. The statements are
    plain ``CREATE TABLE`` (no ``IF NOT EXISTS``) and declare no primary keys
    or indexes; lookup indexes are added separately by
    ``_create_gtfs_stage_indexes``.
    """
    connection.executescript(
        """
        CREATE TABLE gtfs_agencies (
            agency_id TEXT NOT NULL,
            name TEXT NOT NULL,
            url TEXT,
            timezone TEXT
        );
        CREATE TABLE gtfs_stops (
            stop_id TEXT NOT NULL,
            name TEXT,
            lat REAL,
            lon REAL,
            parent_station TEXT
        );
        CREATE TABLE gtfs_routes (
            route_id TEXT NOT NULL,
            agency_id TEXT,
            short_name TEXT,
            long_name TEXT,
            route_type INTEGER,
            mode TEXT,
            route_scope TEXT,
            operator_name TEXT,
            geometry_geojson TEXT,
            min_lon REAL,
            min_lat REAL,
            max_lon REAL,
            max_lat REAL,
            route_key TEXT,
            operator_key TEXT
        );
        CREATE TABLE gtfs_trips (
            route_id TEXT NOT NULL,
            trip_id TEXT NOT NULL,
            service_id TEXT,
            shape_id TEXT
        );
        CREATE TABLE gtfs_calendars (
            service_id TEXT NOT NULL,
            monday INTEGER NOT NULL,
            tuesday INTEGER NOT NULL,
            wednesday INTEGER NOT NULL,
            thursday INTEGER NOT NULL,
            friday INTEGER NOT NULL,
            saturday INTEGER NOT NULL,
            sunday INTEGER NOT NULL,
            start_date INTEGER NOT NULL,
            end_date INTEGER NOT NULL
        );
        CREATE TABLE gtfs_calendar_dates (
            service_id TEXT NOT NULL,
            date INTEGER NOT NULL,
            exception_type INTEGER NOT NULL
        );
        CREATE TABLE gtfs_shapes (
            shape_id TEXT NOT NULL,
            geometry_geojson TEXT NOT NULL,
            min_lon REAL,
            min_lat REAL,
            max_lon REAL,
            max_lat REAL
        );
        CREATE TABLE gtfs_stop_times (
            trip_id TEXT NOT NULL,
            stop_id TEXT NOT NULL,
            stop_sequence INTEGER NOT NULL,
            arrival_time TEXT,
            departure_time TEXT,
            arrival_seconds INTEGER,
            departure_seconds INTEGER
        );
        """
    )


def _create_gtfs_stage_indexes(connection: sqlite3.Connection, progress_callback: ProgressCallback | None = None) -> None:
    """Create the lookup indexes on the staged stop_times and trips tables.

    Emits a "started" progress event before and a "completed" event after the
    index statements. Each statement uses ``IF NOT EXISTS``.
    """
    # (index name, table, indexed columns)
    index_specs = (
        ("ix_stage_gtfs_stop_times_stop_depart_trip", "gtfs_stop_times", "stop_id, departure_seconds, trip_id, stop_sequence"),
        ("ix_stage_gtfs_stop_times_stop_arrive_trip", "gtfs_stop_times", "stop_id, arrival_seconds, trip_id, stop_sequence"),
        ("ix_stage_gtfs_stop_times_trip_seq", "gtfs_stop_times", "trip_id, stop_sequence"),
        ("ix_stage_gtfs_stop_times_trip_stop_seq", "gtfs_stop_times", "trip_id, stop_id, stop_sequence"),
        ("ix_stage_gtfs_trips_trip", "gtfs_trips", "trip_id"),
        ("ix_stage_gtfs_trips_service_trip", "gtfs_trips", "service_id, trip_id"),
        ("ix_stage_gtfs_trips_route_service", "gtfs_trips", "route_id, service_id"),
    )
    _emit_progress(progress_callback, "gtfs_stage_indexes_started", "Building GTFS stage indexes.", None, None, None)
    for index_name, table, indexed_columns in index_specs:
        connection.execute(f"CREATE INDEX IF NOT EXISTS {index_name} ON {table} ({indexed_columns})")
    _emit_progress(progress_callback, "gtfs_stage_indexes_completed", "Built GTFS stage indexes.", None, None, None)


def _activate_staged_gtfs(
    session: Session,
    source: Source,
    dataset: Dataset,
    stage_path: Path,
    summary: dict[str, Any],
    progress_callback: ProgressCallback | None = None,
) -> None:
    """Copy a staged GTFS SQLite database into the main database and activate it.

    Steps, in order: deactivate the source's other datasets, optionally drop
    the heavy lookup indexes, prune previously imported GTFS datasets of the
    same source, copy the staged tables (stop times only when the summary says
    they belong in the main database), rebuild dropped indexes, mark the
    dataset imported/active, refresh geometries and table statistics, and
    finally delete the stage file when stop times were copied and the stage is
    not configured to be kept.

    Args:
        session: Database session; changes are flushed, not committed, here.
        source: Source owning the dataset (re-fetched from the session).
        dataset: Dataset being activated (re-fetched from the session).
        stage_path: Staged SQLite database (or promoted sidecar) to read from.
        summary: Staging summary; serialized into ``dataset.metadata_json``.
        progress_callback: Optional progress hook.

    Raises:
        FileNotFoundError: If ``stage_path`` does not exist.
    """
    # Local import keeps this block self-contained; the module does not import
    # contextlib at file level.
    from contextlib import closing

    if not stage_path.exists():
        raise FileNotFoundError(f"GTFS staging database is missing: {stage_path}")
    dataset = session.get(Dataset, dataset.id) or dataset
    source = session.get(Source, source.id) or source
    replaced_datasets = [existing for existing in list(source.datasets) if existing.id != dataset.id and existing.kind == "gtfs"]
    for existing in source.datasets:
        if existing.id != dataset.id:
            existing.is_active = False
    copy_stop_times = _copy_stop_times_to_main(summary)
    heavy_index_drop = copy_stop_times and _should_drop_indexes_for_activation(stage_path)
    if heavy_index_drop:
        _emit_progress(progress_callback, "gtfs_activation_indexes_dropped", "Dropping heavy GTFS lookup indexes before bulk activation.", None, None, None)
        _drop_gtfs_bulk_indexes(session.connection())

    # Tables copied before stop times, in copy order, with their staged columns.
    tables_before_stop_times: list[tuple[str, list[str]]] = [
        ("gtfs_agencies", ["agency_id", "name", "url", "timezone"]),
        ("gtfs_stops", ["stop_id", "name", "lat", "lon", "parent_station"]),
        (
            "gtfs_calendars",
            ["service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date"],
        ),
        ("gtfs_calendar_dates", ["service_id", "date", "exception_type"]),
        ("gtfs_trips", ["route_id", "trip_id", "service_id", "shape_id"]),
        ("gtfs_shapes", ["shape_id", "geometry_geojson", "min_lon", "min_lat", "max_lon", "max_lat"]),
    ]
    stop_time_columns = ["trip_id", "stop_id", "stop_sequence", "arrival_time", "departure_time", "arrival_seconds", "departure_seconds"]
    route_columns = [
        "route_id",
        "agency_id",
        "short_name",
        "long_name",
        "route_type",
        "mode",
        "route_scope",
        "operator_name",
        "geometry_geojson",
        "min_lon",
        "min_lat",
        "max_lon",
        "max_lat",
        "route_key",
        "operator_key",
    ]

    try:
        if replaced_datasets:
            _emit_progress(
                progress_callback,
                "gtfs_activation_pruning_replaced",
                f"Pruning {len(replaced_datasets)} replaced GTFS dataset(s) before activation.",
                None,
                None,
                {"dataset_ids": [replaced.id for replaced in replaced_datasets]},
            )
            from app.data_management import _delete_dataset_files, _delete_dataset_rows, _detach_update_checks_for_dataset

            for old_dataset in replaced_datasets:
                _detach_update_checks_for_dataset(session, old_dataset.id)
                _delete_dataset_rows(session, old_dataset)
                _delete_dataset_files(old_dataset)
                session.delete(old_dataset)
        # sqlite3's own connection context manager only commits or rolls back;
        # it never closes the connection. closing() releases the file handle
        # before the stage file is unlinked below. The stage is only read here,
        # so dropping the implicit commit changes nothing.
        with closing(sqlite3.connect(stage_path)) as stage_connection:
            for table, columns in tables_before_stop_times:
                _copy_stage_table(session, stage_connection, dataset.id, table, columns, progress_callback)
            if copy_stop_times:
                _copy_stage_table(session, stage_connection, dataset.id, "gtfs_stop_times", stop_time_columns, progress_callback)
            else:
                _emit_progress(
                    progress_callback,
                    "gtfs_activation_sidecar_stop_times",
                    "Kept gtfs_stop_times in sidecar storage.",
                    None,
                    None,
                    {"table": "gtfs_stop_times", "sidecar_path": str(stage_path)},
                )
            _copy_stage_table(session, stage_connection, dataset.id, "gtfs_routes", route_columns, progress_callback)
    finally:
        # Rebuild even when the copy failed, so the lookup indexes are not left
        # dropped.
        if heavy_index_drop:
            _emit_progress(progress_callback, "gtfs_activation_indexes_rebuilding", "Rebuilding GTFS lookup indexes after bulk activation.", None, None, None)
            _create_gtfs_bulk_indexes(session.connection())
    dataset.status = "imported"
    dataset.is_active = True
    dataset.metadata_json = json.dumps(summary, indent=2)
    source.status = "ok"
    source.last_error = None
    session.flush()
    refresh_postgis_geometries(session, dataset_id=dataset.id, tables=["gtfs_stops", "gtfs_routes", "gtfs_shapes"])
    analyze_postgresql_tables(session, ["gtfs_stops", "gtfs_routes", "gtfs_shapes", "gtfs_trips", "gtfs_stop_times"])
    # In sidecar mode the file holds the only copy of stop times, so it is kept.
    if copy_stop_times and not settings.gtfs_keep_activation_stage:
        try:
            stage_path.unlink()
        except FileNotFoundError:
            pass


def _copy_stop_times_to_main(summary: dict[str, Any]) -> bool:
    """Tell whether staged stop times must be copied into the main database.

    Returns True when the summary has no storage metadata dict. Otherwise the
    per-table map decides (anything but "sidecar" for gtfs_stop_times means
    copy); without a table map the storage mode decides.
    """
    storage = summary.get(GTFS_STORAGE_METADATA_KEY)
    if isinstance(storage, dict):
        table_locations = storage.get("tables")
        if isinstance(table_locations, dict):
            return table_locations.get("gtfs_stop_times") != "sidecar"
        return storage.get("mode") != GTFS_STORAGE_SIDECAR_STOP_TIMES
    return True


def _copy_stage_table(
    session: Session,
    stage_connection: sqlite3.Connection,
    dataset_id: int,
    table: str,
    columns: list[str],
    progress_callback: ProgressCallback | None,
) -> None:
    """Copy one staged table into the same-named main table in batches.

    Rows are read ``GTFS_STAGE_BATCH_SIZE`` at a time from the stage connection
    and inserted through the session with ``dataset_id`` prepended. A progress
    event carrying the running row total is emitted after every batch.
    """
    column_sql = ", ".join(columns)
    placeholders = ", ".join(f":{name}" for name in ("dataset_id", *columns))
    insert_sql = f"INSERT INTO {table} (dataset_id, {column_sql}) VALUES ({placeholders})"
    cursor = stage_connection.execute(f"SELECT {column_sql} FROM {table}")

    total_copied = 0
    while batch := cursor.fetchmany(GTFS_STAGE_BATCH_SIZE):
        # Selected values come back in ``columns`` order, so zip pairs them up.
        params = [{"dataset_id": dataset_id, **dict(zip(columns, record))} for record in batch]
        session.execute(text(insert_sql), params)
        total_copied += len(batch)
        _emit_progress(
            progress_callback,
            "gtfs_activation_chunk",
            f"Activated {table} chunk.",
            total_copied,
            None,
            {"table": table, "rows": total_copied},
        )


def _should_drop_indexes_for_activation(stage_path: Path) -> bool:
    """Decide whether lookup indexes should be dropped before bulk activation.

    Never drops on a PostgreSQL main database. Otherwise the staged row counts
    are inspected: at least 250,000 stop times or 100,000 trips triggers the
    drop. Any SQLite error while reading the stage is treated as "do not drop".

    Args:
        stage_path: Staged SQLite database to inspect.

    Returns:
        True when the indexes should be dropped and rebuilt around the copy.
    """
    # Local import keeps this block self-contained; the module does not import
    # contextlib at file level.
    from contextlib import closing

    if settings.is_postgresql_database:
        return False
    try:
        # sqlite3's connection context manager does not close the connection,
        # so closing() is used to release the stage file handle promptly.
        with closing(sqlite3.connect(stage_path)) as connection:
            stop_times = connection.execute("SELECT COUNT(*) FROM gtfs_stop_times").fetchone()[0]
            trips = connection.execute("SELECT COUNT(*) FROM gtfs_trips").fetchone()[0]
    except sqlite3.Error:
        return False
    return int(stop_times or 0) >= 250_000 or int(trips or 0) >= 100_000


def _drop_gtfs_bulk_indexes(connection) -> None:
    for index_name in [
        "ix_gtfs_stop_times_stop",
        "ix_gtfs_stop_times_stop_depart_trip",
        "ix_gtfs_stop_times_stop_arrival",
        "ix_gtfs_stop_times_stop_arrive_trip",
        "ix_gtfs_stop_times_trip_seq",
        "ix_gtfs_stop_times_trip_stop_seq",
        "ix_gtfs_trips_dataset_trip",
        "ix_gtfs_trips_dataset_route",
        "ix_gtfs_trips_dataset_service",
        "ix_gtfs_trips_dataset_route_service",
        "ix_gtfs_routes_dataset_route",
        "ix_gtfs_shapes_dataset_shape",
        "ix_gtfs_calendars_dataset_service_dates",
        "ix_gtfs_calendar_dates_dataset_date",
    ]:
        connection.exec_driver_sql(f"DROP INDEX IF EXISTS {index_name}")


def _create_gtfs_bulk_indexes(connection) -> None:
    for statement in [
        "CREATE INDEX IF NOT EXISTS ix_gtfs_stop_times_stop ON gtfs_stop_times (dataset_id, stop_id, departure_seconds, trip_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_stop_times_stop_depart_trip ON gtfs_stop_times (dataset_id, stop_id, departure_seconds, trip_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_stop_times_stop_arrival ON gtfs_stop_times (dataset_id, stop_id, arrival_seconds, trip_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_stop_times_stop_arrive_trip ON gtfs_stop_times (dataset_id, stop_id, arrival_seconds, trip_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_stop_times_trip_seq ON gtfs_stop_times (dataset_id, trip_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_stop_times_trip_stop_seq ON gtfs_stop_times (dataset_id, trip_id, stop_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_trips_dataset_trip ON gtfs_trips (dataset_id, trip_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_trips_dataset_route ON gtfs_trips (dataset_id, route_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_trips_dataset_service ON gtfs_trips (dataset_id, service_id, trip_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_trips_dataset_route_service ON gtfs_trips (dataset_id, route_id, service_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_routes_dataset_route ON gtfs_routes (dataset_id, route_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_shapes_dataset_shape ON gtfs_shapes (dataset_id, shape_id)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_calendars_dataset_service_dates ON gtfs_calendars (dataset_id, service_id, start_date, end_date)",
        "CREATE INDEX IF NOT EXISTS ix_gtfs_calendar_dates_dataset_date ON gtfs_calendar_dates (dataset_id, date, service_id, exception_type)",
    ]:
        connection.exec_driver_sql(statement)


def _stage_agencies(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> dict[str, str]:
    """Stage agency.txt and return a mapping of agency id to agency name.

    A missing agency id falls back to ``agency_<row index>`` and a missing name
    falls back to the agency id. Empty URL/timezone values are stored as NULL.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading agency.txt.", None, None, {"file": "agency.txt"})

    staged_rows = []
    name_by_agency_id: dict[str, str] = {}
    for position, record in enumerate(_read_gtfs_csv(zf, names, "agency.txt")):
        agency_id = first_nonempty(record.get("agency_id"), f"agency_{position}")
        display_name = first_nonempty(record.get("agency_name"), agency_id)
        name_by_agency_id[agency_id] = display_name
        url = record.get("agency_url") or None
        timezone = record.get("agency_timezone") or None
        staged_rows.append((agency_id, display_name, url, timezone))

    connection.executemany("INSERT INTO gtfs_agencies (agency_id, name, url, timezone) VALUES (?, ?, ?, ?)", staged_rows)
    staged_total = len(staged_rows)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported agency.txt.", staged_total, None, {"file": "agency.txt", "rows": staged_total})
    return name_by_agency_id


def _stage_calendars(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage calendar.txt and return the number of rows inserted.

    Rows without a service id or with an unparsable start/end date are skipped.
    Weekday columns are stored as 0/1 integers.
    """
    weekdays = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
    _emit_progress(progress_callback, "gtfs_file_started", "Reading calendar.txt.", None, None, {"file": "calendar.txt"})

    staged_rows = []
    for record in _read_gtfs_csv(zf, names, "calendar.txt"):
        service_id = record.get("service_id") or ""
        start_date = _int_or_none(record.get("start_date"))
        end_date = _int_or_none(record.get("end_date"))
        if service_id and start_date is not None and end_date is not None:
            day_flags = [int(_bool_flag(record.get(day))) for day in weekdays]
            staged_rows.append((service_id, *day_flags, start_date, end_date))

    connection.executemany(
        """
        INSERT INTO gtfs_calendars
        (service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        staged_rows,
    )
    staged_total = len(staged_rows)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported calendar.txt.", staged_total, None, {"file": "calendar.txt", "rows": staged_total})
    return staged_total


def _stage_calendar_dates(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage calendar_dates.txt through ``_stage_chunked_rows``.

    Each CSV row becomes ``(service_id, date, exception_type)``; the validator
    accepts a tuple only when the service id is non-empty and both integers
    parsed. Returns whatever ``_stage_chunked_rows`` returns.
    """

    def to_staged_row(row):
        service_id = row.get("service_id") or ""
        return (service_id, _int_or_none(row.get("date")), _int_or_none(row.get("exception_type")))

    def is_complete(staged):
        return not (not staged[0] or staged[1] is None or staged[2] is None)

    return _stage_chunked_rows(
        connection=connection,
        zf=zf,
        names=names,
        basename="calendar_dates.txt",
        insert_sql="INSERT INTO gtfs_calendar_dates (service_id, date, exception_type) VALUES (?, ?, ?)",
        row_factory=to_staged_row,
        validator=is_complete,
        progress_callback=progress_callback,
    )


def _stage_stops(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, tuple[float, float, str]], int]:
    """Stage stops.txt and build a coordinate lookup.

    Rows without a stop id are skipped. Every remaining row is staged, but only
    stops with both coordinates enter the lookup.

    Returns:
        ``(stops_by_id, staged_count)`` where ``stops_by_id`` maps stop id to
        ``(lon, lat, label)`` and the label is the stop name or, failing that,
        the stop id.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading stops.txt.", None, None, {"file": "stops.txt"})

    located_stops: dict[str, tuple[float, float, str]] = {}
    staged_rows = []
    for record in _read_gtfs_csv(zf, names, "stops.txt"):
        stop_id = record.get("stop_id", "")
        if stop_id:
            lat = _float_or_none(record.get("stop_lat"))
            lon = _float_or_none(record.get("stop_lon"))
            stop_name = record.get("stop_name") or None
            if not (lat is None or lon is None):
                located_stops[stop_id] = (lon, lat, stop_name or stop_id)
            staged_rows.append((stop_id, stop_name, lat, lon, record.get("parent_station") or None))

    connection.executemany("INSERT INTO gtfs_stops (stop_id, name, lat, lon, parent_station) VALUES (?, ?, ?, ?, ?)", staged_rows)
    staged_total = len(staged_rows)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported stops.txt.", staged_total, None, {"file": "stops.txt", "rows": staged_total})
    return located_stops, staged_total


def _stage_trips(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, list[str]], dict[str, str], dict[str, str], int]:
    """Stage trips.txt in batches and collect per-route lookups.

    Rows lacking a route id or trip id are skipped. Inserts are flushed every
    ``GTFS_STAGE_BATCH_SIZE`` rows, each flush followed by a chunk progress
    event.

    Returns:
        ``(trips_by_route, first_shape_by_route, first_trip_by_route, count)``:
        all trip ids per route, the first non-empty shape id seen per route,
        the first trip id seen per route, and the number of staged trips.
    """
    insert_sql = "INSERT INTO gtfs_trips (route_id, trip_id, service_id, shape_id) VALUES (?, ?, ?, ?)"
    _emit_progress(progress_callback, "gtfs_file_started", "Reading trips.txt.", None, None, {"file": "trips.txt"})

    route_trips: dict[str, list[str]] = defaultdict(list)
    route_first_shape: dict[str, str] = {}
    route_first_trip: dict[str, str] = {}
    pending = []
    staged_total = 0
    for record in _read_gtfs_csv(zf, names, "trips.txt"):
        route_id = record.get("route_id", "")
        trip_id = record.get("trip_id", "")
        if not (route_id and trip_id):
            continue
        route_trips[route_id].append(trip_id)
        route_first_trip.setdefault(route_id, trip_id)
        shape_id = record.get("shape_id") or ""
        if shape_id:
            route_first_shape.setdefault(route_id, shape_id)
        pending.append((route_id, trip_id, record.get("service_id") or None, shape_id or None))
        staged_total += 1
        if len(pending) < GTFS_STAGE_BATCH_SIZE:
            continue
        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", "Imported trips.txt chunk.", staged_total, None, {"file": "trips.txt", "rows": staged_total})

    # Flush the final partial batch.
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported trips.txt.", staged_total, None, {"file": "trips.txt", "rows": staged_total})
    return dict(route_trips), route_first_shape, route_first_trip, staged_total


def _read_shapes_with_progress(
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> dict[str, list[tuple[float, float]]]:
    """Read ``shapes.txt`` via ``_read_shapes`` and report start/completion events.

    Returns:
        The mapping produced by ``_read_shapes`` (shape id to coordinate list).
    """
    _emit_progress(
        progress_callback,
        "gtfs_file_started",
        "Reading shapes.txt.",
        progress_current=None,
        progress_total=None,
        metadata={"file": "shapes.txt"},
    )
    parsed_shapes = _read_shapes(zf, names)
    shape_total = len(parsed_shapes)
    _emit_progress(
        progress_callback,
        "gtfs_file_completed",
        "Read shapes.txt.",
        progress_current=shape_total,
        progress_total=None,
        metadata={"file": "shapes.txt", "shapes": shape_total},
    )
    return parsed_shapes


def _stage_shapes(
    connection: sqlite3.Connection,
    shapes_by_id: dict[str, list[tuple[float, float]]],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage shape geometries into the ``gtfs_shapes`` table.

    Shapes with fewer than two coordinates, or for which
    ``geometry_json_and_bbox`` yields no geometry text, are skipped. Inserts
    are flushed every 5000 rows, each flush followed by a progress event.

    Returns:
        The number of shapes staged.
    """
    insert_sql = "INSERT INTO gtfs_shapes (shape_id, geometry_geojson, min_lon, min_lat, max_lon, max_lat) VALUES (?, ?, ?, ?, ?, ?)"
    flush_threshold = 5000
    pending = []
    staged = 0

    for shape_id, coords in shapes_by_id.items():
        # A line needs at least two points.
        if len(coords) < 2:
            continue
        geometry_text, bbox = geometry_json_and_bbox(LineString(coords))
        if geometry_text is None:
            continue

        min_lon, min_lat, max_lon, max_lat = bbox[0], bbox[1], bbox[2], bbox[3]
        pending.append((shape_id, geometry_text, min_lon, min_lat, max_lon, max_lat))
        staged += 1
        if len(pending) < flush_threshold:
            continue

        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", "Imported shapes chunk.", staged, None, {"file": "shapes.txt", "rows": staged})

    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported shapes.", staged, None, {"file": "shapes.txt", "rows": staged})
    return staged


def _stage_stop_times(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    first_trip_ids: set[str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, list[str]], int, int]:
    """Stage ``stop_times.txt`` into ``gtfs_stop_times`` and collect stop orders.

    Every CSV row is counted. Rows lacking a ``trip_id``, ``stop_id`` or a
    parseable ``stop_sequence`` are skipped. For trips listed in
    ``first_trip_ids`` the ``(sequence, stop_id)`` pairs are remembered even
    when the row is no longer staged because
    ``settings.gtfs_stop_times_import_limit`` (a positive value caps the number
    of staged rows; zero or negative disables the cap) has been reached.

    Returns:
        A 3-tuple of: stop ids ordered by sequence for each remembered trip,
        the number of CSV rows seen, and the number of rows staged.
    """
    source_file = "stop_times.txt"
    _emit_progress(progress_callback, "gtfs_file_started", "Reading stop_times.txt.", None, None, {"file": source_file})

    sequences: dict[str, list[tuple[int, str]]] = defaultdict(list)
    pending = []
    seen = 0
    staged = 0
    cap = settings.gtfs_stop_times_import_limit

    for record in _read_gtfs_csv(zf, names, source_file):
        seen += 1
        trip_id = record.get("trip_id", "")
        stop_id = record.get("stop_id", "")
        stop_sequence = _int_or_none(record.get("stop_sequence"))
        if stop_sequence is None or not (trip_id and stop_id):
            continue

        # Stop order is tracked independently of the staging cap below.
        if trip_id in first_trip_ids:
            sequences[trip_id].append((stop_sequence, stop_id))

        if cap <= 0 or staged < cap:
            arrival = record.get("arrival_time") or None
            departure = record.get("departure_time") or None
            pending.append((trip_id, stop_id, stop_sequence, arrival, departure, _time_seconds(arrival), _time_seconds(departure)))
            staged += 1
            if len(pending) >= GTFS_STAGE_BATCH_SIZE:
                connection.executemany(
                    """
                    INSERT INTO gtfs_stop_times
                    (trip_id, stop_id, stop_sequence, arrival_time, departure_time, arrival_seconds, departure_seconds)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    """,
                    pending,
                )
                pending.clear()
                _emit_progress(progress_callback, "gtfs_file_chunk", "Imported stop_times.txt chunk.", staged, None, {"file": source_file, "rows": staged, "seen": seen})

    # Flush the final partial batch.
    if pending:
        connection.executemany(
            """
            INSERT INTO gtfs_stop_times
            (trip_id, stop_id, stop_sequence, arrival_time, departure_time, arrival_seconds, departure_seconds)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            pending,
        )
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported stop_times.txt.", staged, None, {"file": source_file, "rows": staged, "seen": seen})

    ordered_stops: dict[str, list[str]] = {}
    for trip_id, pairs in sequences.items():
        pairs.sort()
        ordered_stops[trip_id] = [stop_id for _, stop_id in pairs]
    return ordered_stops, seen, staged


def _stage_routes(
    connection: sqlite3.Connection,
    routes_raw: list[dict[str, str]],
    agency_names: dict[str, str],
    stops_by_id: dict[str, tuple[float, float, str]],
    trips_by_route: dict[str, list[str]],
    first_shape_by_route: dict[str, str],
    first_trip_by_route: dict[str, str],
    shapes_by_id: dict[str, list[tuple[float, float]]],
    stopseq_by_trip: dict[str, list[str]],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage route rows, with derived mode, scope, operator and geometry, into ``gtfs_routes``.

    Rows without a ``route_id`` are skipped. The operator name is looked up in
    ``agency_names`` by the route's ``agency_id``; an unknown id falls back to
    the id itself. When a route carries no ``agency_id`` and the feed defines
    exactly one agency, that agency's name is used, because GTFS allows
    single-agency feeds to omit ``routes.agency_id``.

    ``trips_by_route`` is accepted for signature compatibility but is not read
    by this function.

    Returns:
        The number of routes staged.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading routes.txt.", None, None, {"file": "routes.txt"})
    # Operator for routes without an agency_id when the feed has a single agency.
    sole_agency_name = next(iter(agency_names.values())) if len(agency_names) == 1 else None
    rows = []
    for row in routes_raw:
        route_id = row.get("route_id", "")
        if not route_id:
            continue
        route_type = _int_or_none(row.get("route_type"))
        mode = _gtfs_mode(route_type)
        agency_id = row.get("agency_id") or None
        if agency_id is None and sole_agency_name is not None:
            operator = sole_agency_name
        else:
            operator = agency_names.get(agency_id or "", agency_id or "")
        short_name = row.get("route_short_name") or None
        long_name = row.get("route_long_name") or None
        route_scope = infer_osm_route_scope(mode=mode, ref=short_name, name=long_name, network=operator)
        geometry = _route_geometry(route_id, first_shape_by_route, first_trip_by_route, shapes_by_id, stopseq_by_trip, stops_by_id)
        # Routes without usable geometry are still staged, with NULL geometry and bbox.
        geometry_text, bbox = geometry_json_and_bbox(geometry) if geometry is not None else (None, (None, None, None, None))
        rows.append(
            (
                route_id,
                agency_id,
                short_name,
                long_name,
                route_type,
                mode,
                route_scope,
                operator or None,
                geometry_text,
                bbox[0],
                bbox[1],
                bbox[2],
                bbox[3],
                # Route key preference: short name, then long name, then the raw id.
                norm_ref(short_name) or norm_text(long_name) or norm_ref(route_id),
                norm_text(operator),
            )
        )
    connection.executemany(
        """
        INSERT INTO gtfs_routes
        (route_id, agency_id, short_name, long_name, route_type, mode, route_scope, operator_name, geometry_geojson, min_lon, min_lat, max_lon, max_lat, route_key, operator_key)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported routes.txt.", len(rows), None, {"file": "routes.txt", "rows": len(rows)})
    return len(rows)


def _stage_chunked_rows(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    basename: str,
    insert_sql: str,
    row_factory,
    validator,
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage one GTFS CSV file through a caller-supplied row builder and filter.

    Each CSV record is converted with ``row_factory``; converted rows that
    ``validator`` rejects are dropped. Accepted rows are inserted with
    ``insert_sql`` in batches of ``GTFS_STAGE_BATCH_SIZE``, and a progress
    event is emitted after every full batch.

    Returns:
        The number of rows staged.
    """
    _emit_progress(progress_callback, "gtfs_file_started", f"Reading {basename}.", None, None, {"file": basename})

    pending = []
    accepted = 0
    for record in _read_gtfs_csv(zf, names, basename):
        candidate = row_factory(record)
        if validator(candidate):
            pending.append(candidate)
            accepted += 1
            if len(pending) >= GTFS_STAGE_BATCH_SIZE:
                connection.executemany(insert_sql, pending)
                pending.clear()
                _emit_progress(progress_callback, "gtfs_file_chunk", f"Imported {basename} chunk.", accepted, None, {"file": basename, "rows": accepted})

    # Remaining rows that did not fill a whole batch.
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", f"Imported {basename}.", accepted, None, {"file": basename, "rows": accepted})
    return accepted


def _emit_progress(
    progress_callback: ProgressCallback | None,
    event_type: str,
    message: str,
    progress_current: int | None = None,
    progress_total: int | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    if progress_callback is not None:
        progress_callback(event_type, message, progress_current, progress_total, metadata)


def _read_gtfs_csv(zf: zipfile.ZipFile, names: dict[str, str], basename: str) -> Iterator[dict[str, str]]:
    if basename not in names:
        return iter(())

    def _iter() -> Iterator[dict[str, str]]:
        with zf.open(names[basename], "r") as raw:
            text = io.TextIOWrapper(raw, encoding="utf-8-sig", newline="")
            reader = csv.DictReader(text)
            for row in reader:
                yield {str(k).strip(): (v or "").strip() for k, v in row.items() if k is not None}

    return _iter()


def _record_importer_metadata(dataset: Dataset, shapes_count: int | None = None) -> None:
    """Stamp the importer version (and optionally the shape count) into the dataset metadata.

    Existing metadata is kept when ``dataset.metadata_json`` holds a JSON
    object. Undecodable JSON, or JSON that decodes to something other than an
    object (a list, string or number), is replaced by a fresh mapping instead
    of raising on the key assignment below.

    Args:
        dataset: Dataset whose ``metadata_json`` is rewritten in place.
        shapes_count: When given, stored under the ``"shapes"`` key.
    """
    metadata: dict[str, Any] = {}
    if dataset.metadata_json:
        try:
            loaded = json.loads(dataset.metadata_json)
        except json.JSONDecodeError:
            loaded = None
        # Only a JSON object can take the keys written below.
        if isinstance(loaded, dict):
            metadata = loaded
    metadata["importer"] = GTFS_IMPORTER_VERSION
    if shapes_count is not None:
        metadata["shapes"] = shapes_count
    dataset.metadata_json = json.dumps(metadata, indent=2)


def _import_agencies(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> dict[str, str]:
    """Save agency rows for a dataset and return an agency-id to name mapping.

    The id is picked with ``first_nonempty`` from the row's ``agency_id`` and a
    positional ``agency_<index>`` placeholder; the name is picked the same way
    from ``agency_name`` and the chosen id. All rows are saved in a single
    ``bulk_save_objects`` call, which is skipped when there are no rows.
    """
    names_by_id: dict[str, str] = {}
    agencies: list[GtfsAgency] = []
    for position, record in enumerate(rows):
        agency_id = first_nonempty(record.get("agency_id"), f"agency_{position}")
        display_name = first_nonempty(record.get("agency_name"), agency_id)
        names_by_id[agency_id] = display_name
        agency = GtfsAgency(
            dataset_id=dataset_id,
            agency_id=agency_id,
            name=display_name,
            url=record.get("agency_url") or None,
            timezone=record.get("agency_timezone") or None,
        )
        agencies.append(agency)
    if agencies:
        session.bulk_save_objects(agencies)
    return names_by_id


def _import_calendars(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> int:
    """Save ``calendar.txt`` rows for a dataset and return how many were kept.

    Rows without a ``service_id`` or with an unparseable ``start_date`` /
    ``end_date`` are skipped. Weekday columns are converted with
    ``_bool_flag``.
    """
    weekdays = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
    calendars: list[GtfsCalendar] = []
    for record in rows:
        service_id = record.get("service_id") or ""
        first_day = _int_or_none(record.get("start_date"))
        last_day = _int_or_none(record.get("end_date"))
        if first_day is None or last_day is None or not service_id:
            continue
        day_flags = {day: _bool_flag(record.get(day)) for day in weekdays}
        calendars.append(
            GtfsCalendar(
                dataset_id=dataset_id,
                service_id=service_id,
                start_date=first_day,
                end_date=last_day,
                **day_flags,
            )
        )
    if calendars:
        session.bulk_save_objects(calendars)
    return len(calendars)


def _import_calendar_dates(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> int:
    """Save ``calendar_dates.txt`` rows for a dataset and return how many were kept.

    Rows without a ``service_id`` or with an unparseable ``date`` /
    ``exception_type`` are skipped. Objects are saved in slices of 5000.
    """
    slice_size = 5000
    exceptions: list[GtfsCalendarDate] = []
    for record in rows:
        service_id = record.get("service_id") or ""
        service_date = _int_or_none(record.get("date"))
        exception_type = _int_or_none(record.get("exception_type"))
        if service_date is None or exception_type is None or not service_id:
            continue
        exceptions.append(
            GtfsCalendarDate(
                dataset_id=dataset_id,
                service_id=service_id,
                date=service_date,
                exception_type=exception_type,
            )
        )

    total = len(exceptions)
    offset = 0
    while offset < total:
        session.bulk_save_objects(exceptions[offset : offset + slice_size])
        offset += slice_size
    return total


def _import_stops(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> dict[str, tuple[float, float, str]]:
    """Save stop rows for a dataset and return coordinates for the located stops.

    Rows without a ``stop_id`` are skipped. Every remaining stop is saved, but
    only stops whose latitude and longitude both parse are included in the
    returned mapping of stop id to ``(lon, lat, label)``, where the label is
    the stop name or, failing that, the stop id.
    """
    located: dict[str, tuple[float, float, str]] = {}
    stops: list[GtfsStop] = []
    for record in rows:
        stop_id = record.get("stop_id", "")
        if not stop_id:
            continue
        latitude = _float_or_none(record.get("stop_lat"))
        longitude = _float_or_none(record.get("stop_lon"))
        stop_name = record.get("stop_name") or None

        has_position = latitude is not None and longitude is not None
        if has_position:
            located[stop_id] = (longitude, latitude, stop_name or stop_id)

        stop = GtfsStop(
            dataset_id=dataset_id,
            stop_id=stop_id,
            name=stop_name,
            lat=latitude,
            lon=longitude,
            parent_station=record.get("parent_station") or None,
        )
        stops.append(stop)
    if stops:
        session.bulk_save_objects(stops)
    return located


def _import_trips(
    session: Session, dataset_id: int, rows: list[dict[str, str]]
) -> tuple[dict[str, list[str]], dict[str, str], dict[str, str]]:
    """Save trip rows for a dataset and index them by route.

    Rows without a ``route_id`` or ``trip_id`` are skipped. Objects are saved
    in slices of 5000.

    Returns:
        A 3-tuple of: trip ids grouped by route id, the first non-empty shape
        id seen per route, and the first trip id seen per route.
    """
    slice_size = 5000
    route_trips: dict[str, list[str]] = defaultdict(list)
    route_first_shape: dict[str, str] = {}
    route_first_trip: dict[str, str] = {}
    trips: list[GtfsTrip] = []

    for record in rows:
        route_id = record.get("route_id", "")
        trip_id = record.get("trip_id", "")
        if not (route_id and trip_id):
            continue
        route_trips[route_id].append(trip_id)
        route_first_trip.setdefault(route_id, trip_id)

        # Empty shape ids are stored as NULL and never become a route's shape.
        shape_id = record.get("shape_id") or None
        if shape_id is not None:
            route_first_shape.setdefault(route_id, shape_id)

        trips.append(
            GtfsTrip(
                dataset_id=dataset_id,
                route_id=route_id,
                trip_id=trip_id,
                service_id=record.get("service_id") or None,
                shape_id=shape_id,
            )
        )

    total = len(trips)
    offset = 0
    while offset < total:
        session.bulk_save_objects(trips[offset : offset + slice_size])
        offset += slice_size
    return dict(route_trips), route_first_shape, route_first_trip


def _read_shapes(zf: zipfile.ZipFile, names: dict[str, str]) -> dict[str, list[tuple[float, float]]]:
    """Read ``shapes.txt`` into ``(lon, lat)`` lists ordered by ``shape_pt_sequence``.

    Rows without a ``shape_id`` or with an unparseable latitude/longitude are
    skipped. A missing or unparseable sequence number is treated as ``0``.

    Points are ordered by the sequence number only. The sort is stable, so
    points sharing a sequence number (including all points of a shape that has
    no usable sequence column) keep their file order rather than being
    re-ordered by longitude and latitude, which would scramble the line.

    Returns:
        Mapping of shape id to its ordered coordinate list.
    """
    by_shape: dict[str, list[tuple[int, float, float]]] = defaultdict(list)
    for row in _read_gtfs_csv(zf, names, "shapes.txt"):
        shape_id = row.get("shape_id", "")
        lat = _float_or_none(row.get("shape_pt_lat"))
        lon = _float_or_none(row.get("shape_pt_lon"))
        seq = _int_or_none(row.get("shape_pt_sequence"))
        if shape_id and lat is not None and lon is not None:
            by_shape[shape_id].append((seq if seq is not None else 0, lon, lat))
    return {
        shape_id: [(lon, lat) for _, lon, lat in sorted(points, key=lambda point: point[0])]
        for shape_id, points in by_shape.items()
    }


def _import_shapes(session: Session, dataset_id: int, shapes_by_id: dict[str, list[tuple[float, float]]]) -> int:
    """Save shape geometries for a dataset and return how many were saved.

    Shapes with fewer than two coordinates, or for which
    ``geometry_json_and_bbox`` yields no geometry text, are skipped. Objects
    are flushed to the session every 1000 shapes.
    """
    flush_threshold = 1000
    pending: list[GtfsShape] = []
    saved = 0
    for shape_id, coords in shapes_by_id.items():
        # A line needs at least two points.
        if len(coords) < 2:
            continue
        geometry_text, bbox = geometry_json_and_bbox(LineString(coords))
        if geometry_text is None:
            continue

        min_lon, min_lat, max_lon, max_lat = bbox[0], bbox[1], bbox[2], bbox[3]
        shape = GtfsShape(
            dataset_id=dataset_id,
            shape_id=shape_id,
            geometry_geojson=geometry_text,
            min_lon=min_lon,
            min_lat=min_lat,
            max_lon=max_lon,
            max_lat=max_lat,
        )
        pending.append(shape)
        saved += 1
        if len(pending) < flush_threshold:
            continue
        session.bulk_save_objects(pending)
        pending.clear()

    if pending:
        session.bulk_save_objects(pending)
    return saved


def _import_stop_times(
    session: Session,
    dataset_id: int,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    first_trip_ids: set[str],
) -> tuple[dict[str, list[str]], int, int]:
    """Save ``stop_times.txt`` rows for a dataset and collect stop orders.

    Every CSV row is counted. Rows lacking a ``trip_id``, ``stop_id`` or a
    parseable ``stop_sequence`` are skipped. For trips listed in
    ``first_trip_ids`` the ``(sequence, stop_id)`` pairs are remembered even
    once ``settings.gtfs_stop_times_import_limit`` (a positive value caps the
    number of saved rows; zero or negative disables the cap) has been reached.
    Objects are flushed to the session every 5000 rows.

    Returns:
        A 3-tuple of: stop ids ordered by sequence for each remembered trip,
        the number of CSV rows seen, and the number of rows saved.
    """
    flush_threshold = 5000
    cap = settings.gtfs_stop_times_import_limit
    sequences: dict[str, list[tuple[int, str]]] = defaultdict(list)
    pending: list[GtfsStopTime] = []
    seen = 0
    saved = 0

    for record in _read_gtfs_csv(zf, names, "stop_times.txt"):
        seen += 1
        trip_id = record.get("trip_id", "")
        stop_id = record.get("stop_id", "")
        stop_sequence = _int_or_none(record.get("stop_sequence"))
        if stop_sequence is None or not (trip_id and stop_id):
            continue

        # Stop order is tracked independently of the import cap below.
        if trip_id in first_trip_ids:
            sequences[trip_id].append((stop_sequence, stop_id))

        under_cap = cap <= 0 or saved < cap
        if not under_cap:
            continue

        arrival = record.get("arrival_time") or None
        departure = record.get("departure_time") or None
        stop_time = GtfsStopTime(
            dataset_id=dataset_id,
            trip_id=trip_id,
            stop_id=stop_id,
            stop_sequence=stop_sequence,
            arrival_time=arrival,
            departure_time=departure,
            arrival_seconds=_time_seconds(arrival),
            departure_seconds=_time_seconds(departure),
        )
        pending.append(stop_time)
        saved += 1
        if len(pending) >= flush_threshold:
            session.bulk_save_objects(pending)
            pending.clear()

    if pending:
        session.bulk_save_objects(pending)

    ordered_stops: dict[str, list[str]] = {}
    for trip_id, pairs in sequences.items():
        ordered_stops[trip_id] = [stop_id for _, stop_id in sorted(pairs)]
    return ordered_stops, seen, saved


def _import_routes(
    session: Session,
    dataset_id: int,
    routes_raw: list[dict[str, str]],
    agency_names: dict[str, str],
    stops_by_id: dict[str, tuple[float, float, str]],
    trips_by_route: dict[str, list[str]],
    first_shape_by_route: dict[str, str],
    first_trip_by_route: dict[str, str],
    shapes_by_id: dict[str, list[tuple[float, float]]],
    stopseq_by_trip: dict[str, list[str]],
) -> int:
    """Save route rows, with derived mode, scope, operator and geometry, for a dataset.

    Rows without a ``route_id`` are skipped. The operator name is looked up in
    ``agency_names`` by the route's ``agency_id``; an unknown id falls back to
    the id itself. When a route carries no ``agency_id`` and the feed defines
    exactly one agency, that agency's name is used, because GTFS allows
    single-agency feeds to omit ``routes.agency_id``.

    ``trips_by_route`` is accepted for signature compatibility but is not read
    by this function.

    Returns:
        The number of routes saved.
    """
    # Operator for routes without an agency_id when the feed has a single agency.
    sole_agency_name = next(iter(agency_names.values())) if len(agency_names) == 1 else None
    objects: list[GtfsRoute] = []
    for row in routes_raw:
        route_id = row.get("route_id", "")
        if not route_id:
            continue
        route_type = _int_or_none(row.get("route_type"))
        mode = _gtfs_mode(route_type)
        agency_id = row.get("agency_id") or None
        if agency_id is None and sole_agency_name is not None:
            operator = sole_agency_name
        else:
            operator = agency_names.get(agency_id or "", agency_id or "")
        short_name = row.get("route_short_name") or None
        long_name = row.get("route_long_name") or None
        route_scope = infer_osm_route_scope(mode=mode, ref=short_name, name=long_name, network=operator)
        geometry = _route_geometry(route_id, first_shape_by_route, first_trip_by_route, shapes_by_id, stopseq_by_trip, stops_by_id)
        # Routes without usable geometry are still saved, with empty geometry and bbox.
        geometry_text, bbox = geometry_json_and_bbox(geometry) if geometry is not None else (None, (None, None, None, None))
        # Route key preference: short name, then long name, then the raw id.
        route_key = norm_ref(short_name) or norm_text(long_name) or norm_ref(route_id)
        objects.append(
            GtfsRoute(
                dataset_id=dataset_id,
                route_id=route_id,
                agency_id=agency_id,
                short_name=short_name,
                long_name=long_name,
                route_type=route_type,
                mode=mode,
                route_scope=route_scope,
                operator_name=operator or None,
                geometry_geojson=geometry_text,
                min_lon=bbox[0],
                min_lat=bbox[1],
                max_lon=bbox[2],
                max_lat=bbox[3],
                route_key=route_key,
                operator_key=norm_text(operator),
            )
        )
    if objects:
        session.bulk_save_objects(objects)
    return len(objects)


def _route_geometry(
    route_id: str,
    first_shape_by_route: dict[str, str],
    first_trip_by_route: dict[str, str],
    shapes_by_id: dict[str, list[tuple[float, float]]],
    stopseq_by_trip: dict[str, list[str]],
    stops_by_id: dict[str, tuple[float, float, str]],
) -> Optional[LineString]:
    shape_id = first_shape_by_route.get(route_id)
    coords = shapes_by_id.get(shape_id or "", [])
    if len(coords) >= 2:
        return LineString(coords)

    trip_id = first_trip_by_route.get(route_id)
    stop_ids = stopseq_by_trip.get(trip_id or "", [])
    fallback = [(stops_by_id[sid][0], stops_by_id[sid][1]) for sid in stop_ids if sid in stops_by_id]
    if len(fallback) >= 2:
        return LineString(fallback)
    return None


def _float_or_none(value: object) -> Optional[float]:
    try:
        if value is None or str(value).strip() == "":
            return None
        return float(str(value))
    except ValueError:
        return None


def _int_or_none(value: object) -> Optional[int]:
    try:
        if value is None or str(value).strip() == "":
            return None
        return int(float(str(value)))
    except ValueError:
        return None


def _bool_flag(value: object) -> bool:
    return str(value or "").strip() in {"1", "true", "True", "TRUE", "yes"}


def _time_seconds(value: str | None) -> Optional[int]:
    if not value:
        return None
    parts = value.strip().split(":")
    if len(parts) == 2:
        parts.append("0")
    if len(parts) != 3:
        return None
    try:
        hours, minutes, seconds = [int(part) for part in parts]
    except ValueError:
        return None
    if hours < 0 or minutes < 0 or minutes > 59 or seconds < 0 or seconds > 59:
        return None
    return hours * 3600 + minutes * 60 + seconds


def _gtfs_mode(route_type: Optional[int]) -> str:
    """Map a GTFS ``route_type`` code to a mode name, or ``"unknown"``.

    Exact codes in ``GTFS_MODE`` take precedence; otherwise the first
    inclusive range in ``GTFS_EXTENDED_MODE_RANGES`` containing the code
    decides.
    """
    if route_type is None:
        return "unknown"

    basic_mode = GTFS_MODE.get(route_type)
    if basic_mode is not None:
        return basic_mode

    return next(
        (mode for low, high, mode in GTFS_EXTENDED_MODE_RANGES if low <= route_type <= high),
        "unknown",
    )


def _dataset_importer_version(dataset: Dataset) -> str:
    try:
        return str(json.loads(dataset.metadata_json or "{}").get("importer") or "")
    except json.JSONDecodeError:
        return ""