Files
meubility-workbench/app/pipeline/gtfs.py
2026-07-01 23:29:51 +02:00

1328 lines
53 KiB
Python

from __future__ import annotations
import csv
import io
import json
import sqlite3
import zipfile
from collections import defaultdict
from collections.abc import Callable
from pathlib import Path
from typing import Any, Iterator, Optional
from shapely.geometry import LineString
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.gtfs_storage import GTFS_STORAGE_MAIN, GTFS_STORAGE_METADATA_KEY, GTFS_STORAGE_SIDECAR_STOP_TIMES, effective_gtfs_timetable_storage
from app.models import (
Dataset,
GtfsAgency,
GtfsCalendar,
GtfsCalendarDate,
GtfsRoute,
GtfsShape,
GtfsStop,
GtfsStopTime,
GtfsTrip,
Source,
)
from app.osm_classification import infer_osm_route_scope
from app.performance import measure_pipeline_phase
from app.pipeline.download import materialize_source
from app.pipeline.utils import first_nonempty, geometry_json_and_bbox, norm_ref, norm_text, sha256_file
from app.spatial import analyze_postgresql_tables, refresh_postgis_geometries
# Maps basic integer GTFS ``route_type`` codes to the mode labels used by this
# importer.
# NOTE(review): presumably consumed by ``_gtfs_mode`` (defined outside this
# view) - confirm.
GTFS_MODE = {
    0: "tram",
    1: "subway",
    2: "train",
    3: "bus",
    4: "ferry",
    5: "cable_tram",
    6: "aerialway",
    7: "funicular",
    11: "trolleybus",
    12: "monorail",
}
# Extended GTFS ``route_type`` ranges as inclusive ``(first, last, mode)``
# triples, aligned with the published extended route type table:
#   100-199 railway, 400-499 urban railway, 700-799 bus, 800-899 trolleybus,
#   900-999 tram, 1000-1099 water transport, 1200-1299 ferry,
#   1300-1399 aerial lift, 1400-1499 funicular.
# 1100-1199 (air service) and 1500-1599 (taxi) are deliberately left unmapped:
# they are not aerialway/trolleybus services and have no matching mode label
# in ``GTFS_MODE``.
# NOTE(review): 200-299 (coach) is not mapped because no coach label is visible
# in this file - confirm the desired mode before adding it.
# NOTE(review): datasets imported before this correction keep their old mode
# values until re-imported; consider bumping ``GTFS_IMPORTER_VERSION``.
GTFS_EXTENDED_MODE_RANGES = [
    (100, 199, "train"),
    (400, 499, "subway"),
    (700, 799, "bus"),
    (800, 899, "trolleybus"),
    (900, 999, "tram"),
    (1000, 1099, "ferry"),
    (1200, 1299, "ferry"),
    (1300, 1399, "aerialway"),
    (1400, 1499, "funicular"),
]
# Importer identifier: stored in the dataset metadata and compared in
# ``run_gtfs_source`` to decide whether an existing import can be reused.
GTFS_IMPORTER_VERSION = "gtfs_import_v6_sidecar_stop_times"
# Archive basenames checked during staging; absent ones are reported under
# ``missing_required_files`` in the staging summary.
REQUIRED_FILES = {"agency.txt", "stops.txt", "routes.txt", "trips.txt", "stop_times.txt"}
# Number of rows buffered before each bulk insert while staging/activating.
GTFS_STAGE_BATCH_SIZE = 50_000
# Called as (event_type, message, progress_current, progress_total, metadata).
ProgressCallback = Callable[[str, str, int | None, int | None, dict[str, Any] | None], None]
def run_gtfs_source(session: Session, source: Source, progress_callback: ProgressCallback | None = None) -> Dataset:
    """Return an up-to-date GTFS dataset for ``source``, importing it if needed.

    The path returned by ``materialize_source`` is hashed; when an active,
    imported GTFS dataset with the same hash and the current importer version
    already exists it is returned unchanged, otherwise a fresh import runs.
    """
    zip_path = materialize_source(source)
    digest = sha256_file(zip_path)
    lookup = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "gtfs",
            Dataset.sha256 == digest,
            Dataset.is_active.is_(True),
            Dataset.status == "imported",
        )
        .order_by(Dataset.id.desc())
    )
    current = session.scalar(lookup)
    reusable = current is not None and _dataset_importer_version(current) == GTFS_IMPORTER_VERSION
    if reusable:
        return current
    return import_gtfs_zip(
        session=session,
        source=source,
        zip_path=zip_path,
        source_hash=digest,
        progress_callback=progress_callback,
    )
def import_gtfs_zip(
    session: Session,
    source: Source,
    zip_path: Path,
    source_hash: str | None = None,
    progress_callback: ProgressCallback | None = None,
) -> Dataset:
    """Stage a GTFS zip into SQLite and activate it as a new dataset.

    A ``Dataset`` row in ``staging`` state is committed first so that a later
    failure can be recorded against it. Staging and activation each run inside
    a measured pipeline phase.

    Raises:
        ValueError: if ``zip_path`` is not a zip file.
        BaseException: anything raised while staging/activating is re-raised
            after the dataset has been marked ``failed``.
    """
    if not zipfile.is_zipfile(zip_path):
        raise ValueError(f"GTFS source is not a zip file: {zip_path}")

    digest = source_hash or sha256_file(zip_path)
    dataset = Dataset(
        source_id=source.id,
        kind="gtfs",
        local_path=str(zip_path),
        sha256=digest,
        is_active=False,
        status="staging",
    )
    session.add(dataset)
    session.flush()
    session.commit()

    stage_path = _gtfs_stage_path(source, dataset, zip_path)
    _emit_progress(
        progress_callback,
        "gtfs_staging_started",
        f"Staging GTFS zip {zip_path.name}.",
        0,
        None,
        {"stage_path": str(stage_path)},
    )
    try:
        staging_info = {"zip_path": str(zip_path), "stage_path": str(stage_path)}
        with measure_pipeline_phase("gtfs_staging", source_id=source.id, dataset_id=dataset.id, metadata=staging_info) as staging_metric:
            stage_summary = _stage_gtfs_zip(zip_path, stage_path, progress_callback=progress_callback)
            staging_metric.update(stage_summary)

        activation_path = _prepare_gtfs_activation_path(source, dataset, stage_path, stage_summary)
        _emit_progress(
            progress_callback,
            "gtfs_activation_started",
            "Activating staged GTFS dataset.",
            None,
            None,
            {"stage_path": str(activation_path)},
        )
        activation_info = {"stage_path": str(activation_path)}
        with measure_pipeline_phase("gtfs_activation", source_id=source.id, dataset_id=dataset.id, metadata=activation_info) as activation_metric:
            _activate_staged_gtfs(session, source, dataset, activation_path, stage_summary, progress_callback=progress_callback)
            activation_metric.update(stage_summary)
    except BaseException:
        # Discard the partial work, then persist the failure on the dataset row.
        session.rollback()
        failed = session.get(Dataset, dataset.id)
        if failed is not None:
            failed.status = "failed"
            failed.is_active = False
            session.commit()
        raise

    source.status = "ok"
    source.last_error = None
    session.flush()
    _emit_progress(
        progress_callback,
        "gtfs_activation_completed",
        f"Activated GTFS dataset #{dataset.id}.",
        None,
        None,
        {"dataset_id": dataset.id},
    )
    return dataset
def backfill_gtfs_shapes(session: Session, dataset_id: int | None = None) -> dict:
    """Import ``shapes.txt`` for GTFS datasets that have no stored shapes yet.

    With ``dataset_id`` only that dataset is considered; otherwise every
    active GTFS dataset is. Returns ``{"datasets": [...]}`` with one status
    entry per dataset (``skipped``, ``missing_zip``, ``no_shapes_txt`` or
    ``imported``).
    """
    query = select(Dataset).where(Dataset.kind == "gtfs")
    if dataset_id is None:
        query = query.where(Dataset.is_active.is_(True))
    else:
        query = query.where(Dataset.id == dataset_id)

    outcomes = []
    for dataset in session.scalars(query.order_by(Dataset.id)).all():
        stored = session.scalar(select(func.count()).select_from(GtfsShape).where(GtfsShape.dataset_id == dataset.id)) or 0
        if stored:
            outcomes.append({"dataset_id": dataset.id, "status": "skipped", "shapes": stored})
            continue

        archive = Path(dataset.local_path)
        if not (archive.exists() and zipfile.is_zipfile(archive)):
            outcomes.append({"dataset_id": dataset.id, "status": "missing_zip", "path": str(archive)})
            continue

        with zipfile.ZipFile(archive) as zf:
            # Index members by basename so feeds nested in a folder still resolve.
            members = {Path(member).name: member for member in zf.namelist() if not member.endswith("/")}
            if "shapes.txt" not in members:
                outcomes.append({"dataset_id": dataset.id, "status": "no_shapes_txt", "shapes": 0})
                continue
            shapes_by_id = _read_shapes(zf, members)

        imported = _import_shapes(session, dataset.id, shapes_by_id)
        _record_importer_metadata(dataset, shapes_count=imported)
        session.flush()
        outcomes.append({"dataset_id": dataset.id, "status": "imported", "shapes": imported})
    return {"datasets": outcomes}
def _gtfs_stage_path(source: Source, dataset: Dataset, zip_path: Path) -> Path:
    """Build the staging SQLite path for ``dataset`` under ``settings.data_dir``.

    The file name embeds the dataset id and the first 12 characters of the
    dataset hash (computed from ``zip_path`` when the dataset has none).
    """
    digest = dataset.sha256 or sha256_file(zip_path)
    directory = settings.data_dir / "staging" / f"source_{source.id}"
    filename = f"gtfs_dataset_{dataset.id}_{digest[:12]}.sqlite"
    return directory / filename
def _gtfs_sidecar_path(source: Source, dataset: Dataset) -> Path:
    """Build the sidecar SQLite path for ``dataset`` under ``settings.data_dir``.

    Falls back to the literal ``"unknown"`` when the dataset carries no hash.
    """
    digest = dataset.sha256 or "unknown"
    directory = settings.data_dir / "sidecars" / f"source_{source.id}"
    filename = f"gtfs_dataset_{dataset.id}_{digest[:12]}.sqlite"
    return directory / filename
def _gtfs_timetable_storage_mode() -> str:
    """Return the value of ``effective_gtfs_timetable_storage()``."""
    storage_mode = effective_gtfs_timetable_storage()
    return storage_mode
def _prepare_gtfs_activation_path(source: Source, dataset: Dataset, stage_path: Path, summary: dict[str, Any]) -> Path:
    """Record the storage layout in ``summary`` and return the database to activate from.

    In sidecar mode the staged file is moved to its sidecar location (replacing
    any file already there) and that new path is returned; otherwise
    ``stage_path`` is returned untouched. ``summary`` is mutated in place.
    """
    use_sidecar = _gtfs_timetable_storage_mode() == GTFS_STORAGE_SIDECAR_STOP_TIMES

    # Only the stop_times location differs between the two layouts.
    tables = {"gtfs_stop_times": "sidecar" if use_sidecar else "main"}
    for table_name in (
        "gtfs_agencies",
        "gtfs_stops",
        "gtfs_routes",
        "gtfs_trips",
        "gtfs_calendars",
        "gtfs_calendar_dates",
        "gtfs_shapes",
    ):
        tables[table_name] = "main"

    if not use_sidecar:
        summary[GTFS_STORAGE_METADATA_KEY] = {"mode": GTFS_STORAGE_MAIN, "tables": tables}
        return stage_path

    sidecar_path = _gtfs_sidecar_path(source, dataset)
    sidecar_path.parent.mkdir(parents=True, exist_ok=True)
    if sidecar_path.exists():
        sidecar_path.unlink()
    stage_path.replace(sidecar_path)

    summary["stage_path"] = str(sidecar_path)
    summary["staging"] = "sqlite_promoted_to_sidecar"
    summary[GTFS_STORAGE_METADATA_KEY] = {
        "mode": GTFS_STORAGE_SIDECAR_STOP_TIMES,
        "sidecar_path": str(sidecar_path),
        "tables": tables,
    }
    return sidecar_path
def _stage_gtfs_zip(zip_path: Path, stage_path: Path, progress_callback: ProgressCallback | None = None) -> dict[str, Any]:
    """Load a GTFS zip into a fresh SQLite staging database.

    Any existing file at ``stage_path`` is removed first. Files are staged in
    a fixed order (agencies, calendars, calendar dates, stops, trips, shapes,
    stop times, routes) because routes depend on data gathered from the
    earlier files. Returns a summary dict with row counts and the names of any
    required files missing from the archive.
    """
    if stage_path.exists():
        stage_path.unlink()
    stage_path.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(stage_path)
    try:
        _configure_stage_connection(connection)
        _create_gtfs_stage_schema(connection)
        with zipfile.ZipFile(zip_path) as zf:
            # Index members by basename so feeds nested in a folder still resolve.
            names = {Path(member).name: member for member in zf.namelist() if not member.endswith("/")}
            missing = sorted(REQUIRED_FILES.difference(names))

            agency_names = _stage_agencies(connection, zf, names, progress_callback)
            calendars_count = _stage_calendars(connection, zf, names, progress_callback)
            calendar_dates_count = _stage_calendar_dates(connection, zf, names, progress_callback)
            stops_by_id, stops_count = _stage_stops(connection, zf, names, progress_callback)
            trips_by_route, first_shape_by_route, first_trip_by_route, trips_count = _stage_trips(
                connection, zf, names, progress_callback
            )
            shapes_by_id = _read_shapes_with_progress(zf, names, progress_callback)
            shapes_count = _stage_shapes(connection, shapes_by_id, progress_callback)
            stopseq_by_trip, stop_times_seen, stop_times_imported = _stage_stop_times(
                connection,
                zf,
                names,
                first_trip_ids=set(first_trip_by_route.values()),
                progress_callback=progress_callback,
            )
            routes_raw = list(_read_gtfs_csv(zf, names, "routes.txt"))
            routes_count = _stage_routes(
                connection=connection,
                routes_raw=routes_raw,
                agency_names=agency_names,
                stops_by_id=stops_by_id,
                trips_by_route=trips_by_route,
                first_shape_by_route=first_shape_by_route,
                first_trip_by_route=first_trip_by_route,
                shapes_by_id=shapes_by_id,
                stopseq_by_trip=stopseq_by_trip,
                progress_callback=progress_callback,
            )
        _create_gtfs_stage_indexes(connection, progress_callback)
        connection.commit()

        summary = {
            "importer": GTFS_IMPORTER_VERSION,
            "stage_path": str(stage_path),
            "missing_required_files": missing,
            "agencies": len(agency_names),
            "stops": stops_count,
            "routes": routes_count,
            "trips": trips_count,
            "calendars": calendars_count,
            "calendar_dates": calendar_dates_count,
            "shapes": shapes_count,
            "stop_times_seen": stop_times_seen,
            "stop_times_imported": stop_times_imported,
            "stop_times_import_limit": settings.gtfs_stop_times_import_limit,
            "staging": "sqlite",
        }
        _emit_progress(progress_callback, "gtfs_staging_completed", "GTFS staging completed.", None, None, summary)
        return summary
    finally:
        connection.close()
def _configure_stage_connection(connection: sqlite3.Connection) -> None:
    """Apply the bulk-load PRAGMAs used for the staging database.

    Journaling and synchronous writes are switched off, temporary storage is
    kept in memory and the locking mode is set to exclusive.
    """
    for pragma in (
        "journal_mode=OFF",
        "synchronous=OFF",
        "temp_store=MEMORY",
        "locking_mode=EXCLUSIVE",
    ):
        connection.execute(f"PRAGMA {pragma}")
def _create_gtfs_stage_schema(connection: sqlite3.Connection) -> None:
    """Create the eight GTFS staging tables on ``connection``.

    The tables mirror the column lists copied during activation but carry no
    ``dataset_id`` column and no indexes; indexes are added separately after
    the bulk load. Plain ``CREATE TABLE`` is used, so the database is expected
    to be empty.
    """
    connection.executescript(
        """
CREATE TABLE gtfs_agencies (
agency_id TEXT NOT NULL,
name TEXT NOT NULL,
url TEXT,
timezone TEXT
);
CREATE TABLE gtfs_stops (
stop_id TEXT NOT NULL,
name TEXT,
lat REAL,
lon REAL,
parent_station TEXT
);
CREATE TABLE gtfs_routes (
route_id TEXT NOT NULL,
agency_id TEXT,
short_name TEXT,
long_name TEXT,
route_type INTEGER,
mode TEXT,
route_scope TEXT,
operator_name TEXT,
geometry_geojson TEXT,
min_lon REAL,
min_lat REAL,
max_lon REAL,
max_lat REAL,
route_key TEXT,
operator_key TEXT
);
CREATE TABLE gtfs_trips (
route_id TEXT NOT NULL,
trip_id TEXT NOT NULL,
service_id TEXT,
shape_id TEXT
);
CREATE TABLE gtfs_calendars (
service_id TEXT NOT NULL,
monday INTEGER NOT NULL,
tuesday INTEGER NOT NULL,
wednesday INTEGER NOT NULL,
thursday INTEGER NOT NULL,
friday INTEGER NOT NULL,
saturday INTEGER NOT NULL,
sunday INTEGER NOT NULL,
start_date INTEGER NOT NULL,
end_date INTEGER NOT NULL
);
CREATE TABLE gtfs_calendar_dates (
service_id TEXT NOT NULL,
date INTEGER NOT NULL,
exception_type INTEGER NOT NULL
);
CREATE TABLE gtfs_shapes (
shape_id TEXT NOT NULL,
geometry_geojson TEXT NOT NULL,
min_lon REAL,
min_lat REAL,
max_lon REAL,
max_lat REAL
);
CREATE TABLE gtfs_stop_times (
trip_id TEXT NOT NULL,
stop_id TEXT NOT NULL,
stop_sequence INTEGER NOT NULL,
arrival_time TEXT,
departure_time TEXT,
arrival_seconds INTEGER,
departure_seconds INTEGER
);
"""
    )
def _create_gtfs_stage_indexes(connection: sqlite3.Connection, progress_callback: ProgressCallback | None = None) -> None:
    """Create the lookup indexes on the staged stop_times and trips tables.

    Emits a progress event before and after the index build.
    """
    index_statements = (
        "CREATE INDEX IF NOT EXISTS ix_stage_gtfs_stop_times_stop_depart_trip ON gtfs_stop_times (stop_id, departure_seconds, trip_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_stage_gtfs_stop_times_stop_arrive_trip ON gtfs_stop_times (stop_id, arrival_seconds, trip_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_stage_gtfs_stop_times_trip_seq ON gtfs_stop_times (trip_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_stage_gtfs_stop_times_trip_stop_seq ON gtfs_stop_times (trip_id, stop_id, stop_sequence)",
        "CREATE INDEX IF NOT EXISTS ix_stage_gtfs_trips_trip ON gtfs_trips (trip_id)",
        "CREATE INDEX IF NOT EXISTS ix_stage_gtfs_trips_service_trip ON gtfs_trips (service_id, trip_id)",
        "CREATE INDEX IF NOT EXISTS ix_stage_gtfs_trips_route_service ON gtfs_trips (route_id, service_id)",
    )
    _emit_progress(progress_callback, "gtfs_stage_indexes_started", "Building GTFS stage indexes.", None, None, None)
    for ddl in index_statements:
        connection.execute(ddl)
    _emit_progress(progress_callback, "gtfs_stage_indexes_completed", "Built GTFS stage indexes.", None, None, None)
def _activate_staged_gtfs(
    session: Session,
    source: Source,
    dataset: Dataset,
    stage_path: Path,
    summary: dict[str, Any],
    progress_callback: ProgressCallback | None = None,
) -> None:
    """Copy a staged GTFS database into the main database and activate it.

    Steps, in order:
      1. deactivate every other dataset of ``source``;
      2. optionally drop heavy lookup indexes before the bulk copy;
      3. prune replaced GTFS datasets (rows, files and the dataset itself);
      4. copy the staged tables, skipping ``gtfs_stop_times`` when the summary
         says it lives in the sidecar;
      5. rebuild dropped indexes (always, via ``finally``);
      6. mark the dataset imported/active, store ``summary`` as metadata,
         refresh geometries and table statistics;
      7. delete the stage file when stop times were copied and the stage is
         not configured to be kept.

    Raises:
        FileNotFoundError: if ``stage_path`` does not exist.
    """
    if not stage_path.exists():
        raise FileNotFoundError(f"GTFS staging database is missing: {stage_path}")

    # Re-fetch both rows through the session; fall back to the passed objects.
    dataset = session.get(Dataset, dataset.id) or dataset
    source = session.get(Source, source.id) or source

    replaced_datasets = [existing for existing in list(source.datasets) if existing.id != dataset.id and existing.kind == "gtfs"]
    for existing in source.datasets:
        if existing.id != dataset.id:
            existing.is_active = False

    copy_stop_times = _copy_stop_times_to_main(summary)
    heavy_index_drop = copy_stop_times and _should_drop_indexes_for_activation(stage_path)
    if heavy_index_drop:
        _emit_progress(progress_callback, "gtfs_activation_indexes_dropped", "Dropping heavy GTFS lookup indexes before bulk activation.", None, None, None)
        _drop_gtfs_bulk_indexes(session.connection())

    # Tables copied ahead of gtfs_stop_times, in their original order.
    tables_before_stop_times = [
        ("gtfs_agencies", ["agency_id", "name", "url", "timezone"]),
        ("gtfs_stops", ["stop_id", "name", "lat", "lon", "parent_station"]),
        (
            "gtfs_calendars",
            ["service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date"],
        ),
        ("gtfs_calendar_dates", ["service_id", "date", "exception_type"]),
        ("gtfs_trips", ["route_id", "trip_id", "service_id", "shape_id"]),
        ("gtfs_shapes", ["shape_id", "geometry_geojson", "min_lon", "min_lat", "max_lon", "max_lat"]),
    ]
    stop_times_columns = ["trip_id", "stop_id", "stop_sequence", "arrival_time", "departure_time", "arrival_seconds", "departure_seconds"]
    routes_columns = [
        "route_id",
        "agency_id",
        "short_name",
        "long_name",
        "route_type",
        "mode",
        "route_scope",
        "operator_name",
        "geometry_geojson",
        "min_lon",
        "min_lat",
        "max_lon",
        "max_lat",
        "route_key",
        "operator_key",
    ]

    try:
        if replaced_datasets:
            _emit_progress(
                progress_callback,
                "gtfs_activation_pruning_replaced",
                f"Pruning {len(replaced_datasets)} replaced GTFS dataset(s) before activation.",
                None,
                None,
                {"dataset_ids": [old_dataset.id for old_dataset in replaced_datasets]},
            )
            from app.data_management import _delete_dataset_files, _delete_dataset_rows, _detach_update_checks_for_dataset

            for old_dataset in replaced_datasets:
                _detach_update_checks_for_dataset(session, old_dataset.id)
                _delete_dataset_rows(session, old_dataset)
                _delete_dataset_files(old_dataset)
                session.delete(old_dataset)

        # ``with sqlite3.connect(...)`` only manages the transaction and leaves
        # the connection open, so close it explicitly. Otherwise the handle
        # leaks and the stage file cannot be unlinked on Windows.
        stage_connection = sqlite3.connect(stage_path)
        try:
            for table, columns in tables_before_stop_times:
                _copy_stage_table(session, stage_connection, dataset.id, table, columns, progress_callback)
            if copy_stop_times:
                _copy_stage_table(session, stage_connection, dataset.id, "gtfs_stop_times", stop_times_columns, progress_callback)
            else:
                _emit_progress(
                    progress_callback,
                    "gtfs_activation_sidecar_stop_times",
                    "Kept gtfs_stop_times in sidecar storage.",
                    None,
                    None,
                    {"table": "gtfs_stop_times", "sidecar_path": str(stage_path)},
                )
            _copy_stage_table(session, stage_connection, dataset.id, "gtfs_routes", routes_columns, progress_callback)
        finally:
            stage_connection.close()
    finally:
        # Rebuild the dropped indexes even when the copy failed.
        if heavy_index_drop:
            _emit_progress(progress_callback, "gtfs_activation_indexes_rebuilding", "Rebuilding GTFS lookup indexes after bulk activation.", None, None, None)
            _create_gtfs_bulk_indexes(session.connection())

    dataset.status = "imported"
    dataset.is_active = True
    dataset.metadata_json = json.dumps(summary, indent=2)
    source.status = "ok"
    source.last_error = None
    session.flush()
    refresh_postgis_geometries(session, dataset_id=dataset.id, tables=["gtfs_stops", "gtfs_routes", "gtfs_shapes"])
    analyze_postgresql_tables(session, ["gtfs_stops", "gtfs_routes", "gtfs_shapes", "gtfs_trips", "gtfs_stop_times"])

    # The stage file is only disposable once stop times live in the main DB.
    if copy_stop_times and not settings.gtfs_keep_activation_stage:
        try:
            stage_path.unlink()
        except FileNotFoundError:
            pass
def _copy_stop_times_to_main(summary: dict[str, Any]) -> bool:
    """Tell whether ``gtfs_stop_times`` must be copied into the main database.

    The per-table map in the storage metadata wins when present; otherwise the
    storage mode decides. Missing or malformed storage metadata means "copy".
    """
    storage = summary.get(GTFS_STORAGE_METADATA_KEY)
    if isinstance(storage, dict):
        tables = storage.get("tables")
        if isinstance(tables, dict):
            return tables.get("gtfs_stop_times") != "sidecar"
        return storage.get("mode") != GTFS_STORAGE_SIDECAR_STOP_TIMES
    return True
def _copy_stage_table(
    session: Session,
    stage_connection: sqlite3.Connection,
    dataset_id: int,
    table: str,
    columns: list[str],
    progress_callback: ProgressCallback | None,
) -> None:
    """Copy one staged table into the same-named main table in batches.

    Each copied row is prefixed with ``dataset_id``. A progress event with the
    running row count is emitted after every batch.
    """
    select_list = ", ".join(columns)
    bind_names = ["dataset_id", *columns]
    placeholders = ", ".join(f":{bind_name}" for bind_name in bind_names)
    insert_sql = f"INSERT INTO {table} (dataset_id, {select_list}) VALUES ({placeholders})"

    cursor = stage_connection.execute(f"SELECT {select_list} FROM {table}")
    copied = 0
    while batch := cursor.fetchmany(GTFS_STAGE_BATCH_SIZE):
        payload = [{"dataset_id": dataset_id, **dict(zip(columns, staged_row))} for staged_row in batch]
        session.execute(text(insert_sql), payload)
        copied += len(batch)
        _emit_progress(
            progress_callback,
            "gtfs_activation_chunk",
            f"Activated {table} chunk.",
            copied,
            None,
            {"table": table, "rows": copied},
        )
def _should_drop_indexes_for_activation(stage_path: Path) -> bool:
    """Decide whether lookup indexes should be dropped before the bulk copy.

    Never on PostgreSQL. Otherwise the staged row counts are inspected and the
    answer is True for at least 250,000 stop times or 100,000 trips. Any
    SQLite error while counting is treated as "do not drop".
    """
    if settings.is_postgresql_database:
        return False
    try:
        # ``with sqlite3.connect(...)`` would leave the connection open; close
        # it explicitly so the stage file is not held after this check.
        connection = sqlite3.connect(stage_path)
        try:
            stop_times = connection.execute("SELECT COUNT(*) FROM gtfs_stop_times").fetchone()[0]
            trips = connection.execute("SELECT COUNT(*) FROM gtfs_trips").fetchone()[0]
        finally:
            connection.close()
    except sqlite3.Error:
        return False
    return int(stop_times or 0) >= 250_000 or int(trips or 0) >= 100_000
def _drop_gtfs_bulk_indexes(connection) -> None:
    """Issue ``DROP INDEX IF EXISTS`` for each GTFS lookup index.

    ``connection`` must provide ``exec_driver_sql``.
    """
    index_names = (
        "ix_gtfs_stop_times_stop",
        "ix_gtfs_stop_times_stop_depart_trip",
        "ix_gtfs_stop_times_stop_arrival",
        "ix_gtfs_stop_times_stop_arrive_trip",
        "ix_gtfs_stop_times_trip_seq",
        "ix_gtfs_stop_times_trip_stop_seq",
        "ix_gtfs_trips_dataset_trip",
        "ix_gtfs_trips_dataset_route",
        "ix_gtfs_trips_dataset_service",
        "ix_gtfs_trips_dataset_route_service",
        "ix_gtfs_routes_dataset_route",
        "ix_gtfs_shapes_dataset_shape",
        "ix_gtfs_calendars_dataset_service_dates",
        "ix_gtfs_calendar_dates_dataset_date",
    )
    drop_statements = [f"DROP INDEX IF EXISTS {index_name}" for index_name in index_names]
    for ddl in drop_statements:
        connection.exec_driver_sql(ddl)
def _create_gtfs_bulk_indexes(connection) -> None:
    """Issue ``CREATE INDEX IF NOT EXISTS`` for each GTFS lookup index.

    ``connection`` must provide ``exec_driver_sql``. The index names match the
    ones removed by ``_drop_gtfs_bulk_indexes``.
    """
    # (index name, table, indexed column list)
    index_specs = (
        ("ix_gtfs_stop_times_stop", "gtfs_stop_times", "dataset_id, stop_id, departure_seconds, trip_id, stop_sequence"),
        ("ix_gtfs_stop_times_stop_depart_trip", "gtfs_stop_times", "dataset_id, stop_id, departure_seconds, trip_id"),
        ("ix_gtfs_stop_times_stop_arrival", "gtfs_stop_times", "dataset_id, stop_id, arrival_seconds, trip_id, stop_sequence"),
        ("ix_gtfs_stop_times_stop_arrive_trip", "gtfs_stop_times", "dataset_id, stop_id, arrival_seconds, trip_id"),
        ("ix_gtfs_stop_times_trip_seq", "gtfs_stop_times", "dataset_id, trip_id, stop_sequence"),
        ("ix_gtfs_stop_times_trip_stop_seq", "gtfs_stop_times", "dataset_id, trip_id, stop_id, stop_sequence"),
        ("ix_gtfs_trips_dataset_trip", "gtfs_trips", "dataset_id, trip_id"),
        ("ix_gtfs_trips_dataset_route", "gtfs_trips", "dataset_id, route_id"),
        ("ix_gtfs_trips_dataset_service", "gtfs_trips", "dataset_id, service_id, trip_id"),
        ("ix_gtfs_trips_dataset_route_service", "gtfs_trips", "dataset_id, route_id, service_id"),
        ("ix_gtfs_routes_dataset_route", "gtfs_routes", "dataset_id, route_id"),
        ("ix_gtfs_shapes_dataset_shape", "gtfs_shapes", "dataset_id, shape_id"),
        ("ix_gtfs_calendars_dataset_service_dates", "gtfs_calendars", "dataset_id, service_id, start_date, end_date"),
        ("ix_gtfs_calendar_dates_dataset_date", "gtfs_calendar_dates", "dataset_id, date, service_id, exception_type"),
    )
    for index_name, table_name, column_list in index_specs:
        connection.exec_driver_sql(f"CREATE INDEX IF NOT EXISTS {index_name} ON {table_name} ({column_list})")
def _stage_agencies(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> dict[str, str]:
    """Stage ``agency.txt`` and return a mapping of agency id to agency name.

    Rows without an ``agency_id`` get the synthetic id ``agency_<row index>``;
    rows without a name fall back to their id.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading agency.txt.", None, None, {"file": "agency.txt"})
    agency_names: dict[str, str] = {}
    staged = []
    for position, record in enumerate(_read_gtfs_csv(zf, names, "agency.txt")):
        agency_id = first_nonempty(record.get("agency_id"), f"agency_{position}")
        display_name = first_nonempty(record.get("agency_name"), agency_id)
        agency_names[agency_id] = display_name
        url = record.get("agency_url") or None
        timezone = record.get("agency_timezone") or None
        staged.append((agency_id, display_name, url, timezone))
    connection.executemany("INSERT INTO gtfs_agencies (agency_id, name, url, timezone) VALUES (?, ?, ?, ?)", staged)
    total = len(staged)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported agency.txt.", total, None, {"file": "agency.txt", "rows": total})
    return agency_names
def _stage_calendars(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage ``calendar.txt`` and return the number of rows inserted.

    Rows lacking a service id or a parseable start/end date are skipped.
    """
    weekdays = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
    _emit_progress(progress_callback, "gtfs_file_started", "Reading calendar.txt.", None, None, {"file": "calendar.txt"})
    staged = []
    for record in _read_gtfs_csv(zf, names, "calendar.txt"):
        service_id = record.get("service_id") or ""
        start_date = _int_or_none(record.get("start_date"))
        end_date = _int_or_none(record.get("end_date"))
        if not service_id or start_date is None or end_date is None:
            continue
        day_flags = tuple(int(_bool_flag(record.get(day))) for day in weekdays)
        staged.append((service_id, *day_flags, start_date, end_date))
    connection.executemany(
        """
INSERT INTO gtfs_calendars
(service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
        staged,
    )
    total = len(staged)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported calendar.txt.", total, None, {"file": "calendar.txt", "rows": total})
    return total
def _stage_calendar_dates(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage ``calendar_dates.txt`` in chunks and return the rows inserted.

    Rows with an empty service id or an unparseable date/exception type are
    dropped.
    """

    def to_row(record):
        return (
            record.get("service_id") or "",
            _int_or_none(record.get("date")),
            _int_or_none(record.get("exception_type")),
        )

    def is_complete(candidate):
        service_id, service_date, exception_type = candidate
        return bool(service_id) and service_date is not None and exception_type is not None

    return _stage_chunked_rows(
        connection=connection,
        zf=zf,
        names=names,
        basename="calendar_dates.txt",
        insert_sql="INSERT INTO gtfs_calendar_dates (service_id, date, exception_type) VALUES (?, ?, ?)",
        row_factory=to_row,
        validator=is_complete,
        progress_callback=progress_callback,
    )
def _stage_stops(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, tuple[float, float, str]], int]:
    """Stage ``stops.txt``.

    Returns a lookup of ``stop_id -> (lon, lat, label)`` restricted to stops
    with both coordinates, plus the number of rows inserted. Stops without
    coordinates are still inserted; rows without a ``stop_id`` are skipped.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading stops.txt.", None, None, {"file": "stops.txt"})
    stops_by_id: dict[str, tuple[float, float, str]] = {}
    staged = []
    for record in _read_gtfs_csv(zf, names, "stops.txt"):
        stop_id = record.get("stop_id", "")
        if not stop_id:
            continue
        lat = _float_or_none(record.get("stop_lat"))
        lon = _float_or_none(record.get("stop_lon"))
        stop_name = record.get("stop_name") or None
        has_position = lat is not None and lon is not None
        if has_position:
            stops_by_id[stop_id] = (lon, lat, stop_name or stop_id)
        staged.append((stop_id, stop_name, lat, lon, record.get("parent_station") or None))
    connection.executemany("INSERT INTO gtfs_stops (stop_id, name, lat, lon, parent_station) VALUES (?, ?, ?, ?, ?)", staged)
    total = len(staged)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported stops.txt.", total, None, {"file": "stops.txt", "rows": total})
    return stops_by_id, total
def _stage_trips(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, list[str]], dict[str, str], dict[str, str], int]:
    """Stage ``trips.txt`` in batches.

    Returns ``(trips_by_route, first_shape_by_route, first_trip_by_route,
    imported)``. The "first" maps keep the first trip, and the first non-empty
    shape id, seen for each route. Rows lacking a route or trip id are skipped.
    """
    insert_sql = "INSERT INTO gtfs_trips (route_id, trip_id, service_id, shape_id) VALUES (?, ?, ?, ?)"
    _emit_progress(progress_callback, "gtfs_file_started", "Reading trips.txt.", None, None, {"file": "trips.txt"})
    trips_by_route: dict[str, list[str]] = defaultdict(list)
    first_shape_by_route: dict[str, str] = {}
    first_trip_by_route: dict[str, str] = {}
    pending = []
    imported = 0
    for record in _read_gtfs_csv(zf, names, "trips.txt"):
        route_id = record.get("route_id", "")
        trip_id = record.get("trip_id", "")
        if not (route_id and trip_id):
            continue
        trips_by_route[route_id].append(trip_id)
        first_trip_by_route.setdefault(route_id, trip_id)
        shape_id = record.get("shape_id") or ""
        if shape_id:
            first_shape_by_route.setdefault(route_id, shape_id)
        pending.append((route_id, trip_id, record.get("service_id") or None, shape_id or None))
        imported += 1
        if len(pending) < GTFS_STAGE_BATCH_SIZE:
            continue
        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", "Imported trips.txt chunk.", imported, None, {"file": "trips.txt", "rows": imported})
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported trips.txt.", imported, None, {"file": "trips.txt", "rows": imported})
    return dict(trips_by_route), first_shape_by_route, first_trip_by_route, imported
def _read_shapes_with_progress(
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> dict[str, list[tuple[float, float]]]:
    """Call ``_read_shapes`` bracketed by start/completed progress events."""
    file_info = {"file": "shapes.txt"}
    _emit_progress(progress_callback, "gtfs_file_started", "Reading shapes.txt.", None, None, file_info)
    shapes_by_id = _read_shapes(zf, names)
    shape_total = len(shapes_by_id)
    _emit_progress(progress_callback, "gtfs_file_completed", "Read shapes.txt.", shape_total, None, {"file": "shapes.txt", "shapes": shape_total})
    return shapes_by_id
def _stage_shapes(
    connection: sqlite3.Connection,
    shapes_by_id: dict[str, list[tuple[float, float]]],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage shape geometries in batches of 5000 and return the rows inserted.

    Shapes with fewer than two coordinates, or for which
    ``geometry_json_and_bbox`` yields no geometry text, are skipped.
    """
    insert_sql = "INSERT INTO gtfs_shapes (shape_id, geometry_geojson, min_lon, min_lat, max_lon, max_lat) VALUES (?, ?, ?, ?, ?, ?)"
    pending = []
    imported = 0
    for shape_id, coords in shapes_by_id.items():
        if len(coords) < 2:
            continue
        geometry_text, bbox = geometry_json_and_bbox(LineString(coords))
        if geometry_text is None:
            continue
        pending.append((shape_id, geometry_text, bbox[0], bbox[1], bbox[2], bbox[3]))
        imported += 1
        if len(pending) < 5000:
            continue
        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", "Imported shapes chunk.", imported, None, {"file": "shapes.txt", "rows": imported})
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported shapes.", imported, None, {"file": "shapes.txt", "rows": imported})
    return imported
def _stage_stop_times(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    first_trip_ids: set[str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, list[str]], int, int]:
    """Stage ``stop_times.txt`` in batches.

    Returns ``(stopseq_by_trip, seen, imported)``: the stop ids of each trip
    in ``first_trip_ids`` ordered by stop sequence, the number of CSV rows
    read, and the number of rows inserted. Insertion stops once
    ``settings.gtfs_stop_times_import_limit`` rows are staged (a limit of zero
    or less means unlimited), but the file is still read to the end so the
    stop sequences and the seen count stay complete.
    """
    insert_sql = """
INSERT INTO gtfs_stop_times
(trip_id, stop_id, stop_sequence, arrival_time, departure_time, arrival_seconds, departure_seconds)
VALUES (?, ?, ?, ?, ?, ?, ?)
"""
    _emit_progress(progress_callback, "gtfs_file_started", "Reading stop_times.txt.", None, None, {"file": "stop_times.txt"})
    sequences: dict[str, list[tuple[int, str]]] = defaultdict(list)
    pending = []
    seen = 0
    imported = 0
    limit = settings.gtfs_stop_times_import_limit
    for record in _read_gtfs_csv(zf, names, "stop_times.txt"):
        seen += 1
        trip_id = record.get("trip_id", "")
        stop_id = record.get("stop_id", "")
        stop_sequence = _int_or_none(record.get("stop_sequence"))
        if not trip_id or not stop_id or stop_sequence is None:
            continue
        if trip_id in first_trip_ids:
            sequences[trip_id].append((stop_sequence, stop_id))
        if limit <= 0 or imported < limit:
            arrival_time = record.get("arrival_time") or None
            departure_time = record.get("departure_time") or None
            pending.append(
                (
                    trip_id,
                    stop_id,
                    stop_sequence,
                    arrival_time,
                    departure_time,
                    _time_seconds(arrival_time),
                    _time_seconds(departure_time),
                )
            )
            imported += 1
        if len(pending) >= GTFS_STAGE_BATCH_SIZE:
            connection.executemany(insert_sql, pending)
            pending.clear()
            _emit_progress(
                progress_callback,
                "gtfs_file_chunk",
                "Imported stop_times.txt chunk.",
                imported,
                None,
                {"file": "stop_times.txt", "rows": imported, "seen": seen},
            )
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(
        progress_callback,
        "gtfs_file_completed",
        "Imported stop_times.txt.",
        imported,
        None,
        {"file": "stop_times.txt", "rows": imported, "seen": seen},
    )
    ordered_stops = {}
    for trip_id, pairs in sequences.items():
        ordered_stops[trip_id] = [stop_id for _, stop_id in sorted(pairs)]
    return ordered_stops, seen, imported
def _stage_routes(
    connection: sqlite3.Connection,
    routes_raw: list[dict[str, str]],
    agency_names: dict[str, str],
    stops_by_id: dict[str, tuple[float, float, str]],
    trips_by_route: dict[str, list[str]],
    first_shape_by_route: dict[str, str],
    first_trip_by_route: dict[str, str],
    shapes_by_id: dict[str, list[tuple[float, float]]],
    stopseq_by_trip: dict[str, list[str]],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage the parsed ``routes.txt`` rows and return the number inserted.

    Each route is enriched with its mode, route scope, operator name (the
    agency name, falling back to the raw agency id), a geometry obtained from
    ``_route_geometry`` with its bounding box, and normalized route/operator
    keys. Rows without a ``route_id`` are skipped.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading routes.txt.", None, None, {"file": "routes.txt"})
    staged = []
    for record in routes_raw:
        route_id = record.get("route_id", "")
        if not route_id:
            continue
        route_type = _int_or_none(record.get("route_type"))
        mode = _gtfs_mode(route_type)
        agency_id = record.get("agency_id") or None
        agency_key = agency_id or ""
        operator = agency_names.get(agency_key, agency_key)
        short_name = record.get("route_short_name") or None
        long_name = record.get("route_long_name") or None
        route_scope = infer_osm_route_scope(mode=mode, ref=short_name, name=long_name, network=operator)
        geometry = _route_geometry(route_id, first_shape_by_route, first_trip_by_route, shapes_by_id, stopseq_by_trip, stops_by_id)
        if geometry is not None:
            geometry_text, bbox = geometry_json_and_bbox(geometry)
        else:
            geometry_text, bbox = None, (None, None, None, None)
        route_key = norm_ref(short_name) or norm_text(long_name) or norm_ref(route_id)
        operator_key = norm_text(operator)
        staged.append(
            (
                route_id,
                agency_id,
                short_name,
                long_name,
                route_type,
                mode,
                route_scope,
                operator or None,
                geometry_text,
                bbox[0],
                bbox[1],
                bbox[2],
                bbox[3],
                route_key,
                operator_key,
            )
        )
    connection.executemany(
        """
INSERT INTO gtfs_routes
(route_id, agency_id, short_name, long_name, route_type, mode, route_scope, operator_name, geometry_geojson, min_lon, min_lat, max_lon, max_lat, route_key, operator_key)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
        staged,
    )
    total = len(staged)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported routes.txt.", total, None, {"file": "routes.txt", "rows": total})
    return total
def _stage_chunked_rows(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    basename: str,
    insert_sql: str,
    row_factory,
    validator,
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage one GTFS CSV file through ``insert_sql`` in batches.

    ``row_factory`` turns each CSV record into a parameter tuple and
    ``validator`` decides whether that tuple is inserted. Returns the number
    of rows inserted.
    """
    file_info = {"file": basename}
    _emit_progress(progress_callback, "gtfs_file_started", f"Reading {basename}.", None, None, file_info)
    pending = []
    imported = 0
    for record in _read_gtfs_csv(zf, names, basename):
        candidate = row_factory(record)
        if not validator(candidate):
            continue
        pending.append(candidate)
        imported += 1
        if len(pending) < GTFS_STAGE_BATCH_SIZE:
            continue
        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", f"Imported {basename} chunk.", imported, None, {"file": basename, "rows": imported})
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", f"Imported {basename}.", imported, None, {"file": basename, "rows": imported})
    return imported
def _emit_progress(
progress_callback: ProgressCallback | None,
event_type: str,
message: str,
progress_current: int | None = None,
progress_total: int | None = None,
metadata: dict[str, Any] | None = None,
) -> None:
if progress_callback is not None:
progress_callback(event_type, message, progress_current, progress_total, metadata)
def _read_gtfs_csv(zf: zipfile.ZipFile, names: dict[str, str], basename: str) -> Iterator[dict[str, str]]:
if basename not in names:
return iter(())
def _iter() -> Iterator[dict[str, str]]:
with zf.open(names[basename], "r") as raw:
text = io.TextIOWrapper(raw, encoding="utf-8-sig", newline="")
reader = csv.DictReader(text)
for row in reader:
yield {str(k).strip(): (v or "").strip() for k, v in row.items() if k is not None}
return _iter()
def _record_importer_metadata(dataset: Dataset, shapes_count: int | None = None) -> None:
    """Stamp ``dataset.metadata_json`` with the current importer version.

    Existing metadata is kept when it parses as a JSON object. Metadata that
    is empty, unparseable, or valid JSON of another type (list, string,
    number, ...) is replaced by a fresh object. The ``"importer"`` key is set
    to ``GTFS_IMPORTER_VERSION``; ``"shapes"`` is set only when
    ``shapes_count`` is given. The result is written back as indented JSON.
    """
    metadata: dict[str, Any] = {}
    if dataset.metadata_json:
        try:
            loaded = json.loads(dataset.metadata_json)
        except json.JSONDecodeError:
            loaded = None
        # Only a JSON object can take the keys assigned below; anything else
        # previously raised TypeError on item assignment.
        if isinstance(loaded, dict):
            metadata = loaded
    metadata["importer"] = GTFS_IMPORTER_VERSION
    if shapes_count is not None:
        metadata["shapes"] = shapes_count
    dataset.metadata_json = json.dumps(metadata, indent=2)
def _import_agencies(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> dict[str, str]:
agency_names: dict[str, str] = {}
objects: list[GtfsAgency] = []
for idx, row in enumerate(rows):
agency_id = first_nonempty(row.get("agency_id"), f"agency_{idx}")
name = first_nonempty(row.get("agency_name"), agency_id)
agency_names[agency_id] = name
objects.append(
GtfsAgency(
dataset_id=dataset_id,
agency_id=agency_id,
name=name,
url=row.get("agency_url") or None,
timezone=row.get("agency_timezone") or None,
)
)
if objects:
session.bulk_save_objects(objects)
return agency_names
def _import_calendars(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> int:
objects: list[GtfsCalendar] = []
for row in rows:
service_id = row.get("service_id") or ""
start_date = _int_or_none(row.get("start_date"))
end_date = _int_or_none(row.get("end_date"))
if not service_id or start_date is None or end_date is None:
continue
objects.append(
GtfsCalendar(
dataset_id=dataset_id,
service_id=service_id,
monday=_bool_flag(row.get("monday")),
tuesday=_bool_flag(row.get("tuesday")),
wednesday=_bool_flag(row.get("wednesday")),
thursday=_bool_flag(row.get("thursday")),
friday=_bool_flag(row.get("friday")),
saturday=_bool_flag(row.get("saturday")),
sunday=_bool_flag(row.get("sunday")),
start_date=start_date,
end_date=end_date,
)
)
if objects:
session.bulk_save_objects(objects)
return len(objects)
def _import_calendar_dates(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> int:
objects: list[GtfsCalendarDate] = []
for row in rows:
service_id = row.get("service_id") or ""
date = _int_or_none(row.get("date"))
exception_type = _int_or_none(row.get("exception_type"))
if not service_id or date is None or exception_type is None:
continue
objects.append(
GtfsCalendarDate(
dataset_id=dataset_id,
service_id=service_id,
date=date,
exception_type=exception_type,
)
)
for batch_start in range(0, len(objects), 5000):
session.bulk_save_objects(objects[batch_start : batch_start + 5000])
return len(objects)
def _import_stops(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> dict[str, tuple[float, float, str]]:
stops_by_id: dict[str, tuple[float, float, str]] = {}
objects: list[GtfsStop] = []
for row in rows:
stop_id = row.get("stop_id", "")
if not stop_id:
continue
lat = _float_or_none(row.get("stop_lat"))
lon = _float_or_none(row.get("stop_lon"))
name = row.get("stop_name") or None
if lat is not None and lon is not None:
stops_by_id[stop_id] = (lon, lat, name or stop_id)
objects.append(
GtfsStop(
dataset_id=dataset_id,
stop_id=stop_id,
name=name,
lat=lat,
lon=lon,
parent_station=row.get("parent_station") or None,
)
)
if objects:
session.bulk_save_objects(objects)
return stops_by_id
def _import_trips(
session: Session, dataset_id: int, rows: list[dict[str, str]]
) -> tuple[dict[str, list[str]], dict[str, str], dict[str, str]]:
trips_by_route: dict[str, list[str]] = defaultdict(list)
first_shape_by_route: dict[str, str] = {}
first_trip_by_route: dict[str, str] = {}
objects: list[GtfsTrip] = []
for row in rows:
route_id = row.get("route_id", "")
trip_id = row.get("trip_id", "")
if not route_id or not trip_id:
continue
trips_by_route[route_id].append(trip_id)
first_trip_by_route.setdefault(route_id, trip_id)
shape_id = row.get("shape_id") or ""
if shape_id:
first_shape_by_route.setdefault(route_id, shape_id)
objects.append(
GtfsTrip(
dataset_id=dataset_id,
route_id=route_id,
trip_id=trip_id,
service_id=row.get("service_id") or None,
shape_id=shape_id or None,
)
)
for batch_start in range(0, len(objects), 5000):
session.bulk_save_objects(objects[batch_start : batch_start + 5000])
return dict(trips_by_route), first_shape_by_route, first_trip_by_route
def _read_shapes(zf: zipfile.ZipFile, names: dict[str, str]) -> dict[str, list[tuple[float, float]]]:
    """Read ``shapes.txt`` and return each shape's points in sequence order.

    Rows without a ``shape_id`` or without a parseable latitude/longitude are
    ignored. A missing or unparseable ``shape_pt_sequence`` is treated as 0.

    Returns a mapping ``shape_id -> [(lon, lat), ...]`` ordered by sequence
    number; points that share a sequence number keep their file order.
    """
    by_shape: dict[str, list[tuple[int, float, float]]] = defaultdict(list)
    for row in _read_gtfs_csv(zf, names, "shapes.txt"):
        shape_id = row.get("shape_id", "")
        lat = _float_or_none(row.get("shape_pt_lat"))
        lon = _float_or_none(row.get("shape_pt_lon"))
        seq = _int_or_none(row.get("shape_pt_sequence"))
        if shape_id and lat is not None and lon is not None:
            by_shape[shape_id].append((seq if seq is not None else 0, lon, lat))
    # Sort on the sequence number alone. sorted() is stable, so ties (including
    # every point of a shape with no usable sequence column) stay in file order
    # instead of being re-ordered by longitude and latitude.
    return {
        shape_id: [(lon, lat) for _, lon, lat in sorted(points, key=lambda point: point[0])]
        for shape_id, points in by_shape.items()
    }
def _import_shapes(session: Session, dataset_id: int, shapes_by_id: dict[str, list[tuple[float, float]]]) -> int:
objects: list[GtfsShape] = []
imported = 0
for shape_id, coords in shapes_by_id.items():
if len(coords) < 2:
continue
geometry_text, bbox = geometry_json_and_bbox(LineString(coords))
if geometry_text is None:
continue
objects.append(
GtfsShape(
dataset_id=dataset_id,
shape_id=shape_id,
geometry_geojson=geometry_text,
min_lon=bbox[0],
min_lat=bbox[1],
max_lon=bbox[2],
max_lat=bbox[3],
)
)
imported += 1
if len(objects) >= 1000:
session.bulk_save_objects(objects)
objects.clear()
if objects:
session.bulk_save_objects(objects)
return imported
def _import_stop_times(
    session: Session,
    dataset_id: int,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    first_trip_ids: set[str],
) -> tuple[dict[str, list[str]], int, int]:
    """Stream ``stop_times.txt`` into the database and collect stop sequences.

    Every row read is counted. Rows without a ``trip_id``, ``stop_id`` or
    parseable ``stop_sequence`` are otherwise ignored. Stop ids are collected
    for trips listed in ``first_trip_ids`` regardless of the import limit.
    Rows are stored until ``settings.gtfs_stop_times_import_limit`` rows have
    been stored; a limit of zero or less means no cap. Objects are saved in
    batches of 5000.

    Returns a 3-tuple:
      * trip id -> stop ids ordered by ``stop_sequence`` (ties keep file order),
      * number of rows read from the file,
      * number of rows stored.
    """
    stopseq_by_trip: dict[str, list[tuple[int, str]]] = defaultdict(list)
    objects: list[GtfsStopTime] = []
    count = 0
    imported = 0
    limit = settings.gtfs_stop_times_import_limit
    for row in _read_gtfs_csv(zf, names, "stop_times.txt"):
        count += 1
        trip_id = row.get("trip_id", "")
        stop_id = row.get("stop_id", "")
        seq = _int_or_none(row.get("stop_sequence"))
        if not trip_id or not stop_id or seq is None:
            continue
        if trip_id in first_trip_ids:
            stopseq_by_trip[trip_id].append((seq, stop_id))
        if limit <= 0 or imported < limit:
            arrival_time = row.get("arrival_time") or None
            departure_time = row.get("departure_time") or None
            objects.append(
                GtfsStopTime(
                    dataset_id=dataset_id,
                    trip_id=trip_id,
                    stop_id=stop_id,
                    stop_sequence=seq,
                    arrival_time=arrival_time,
                    departure_time=departure_time,
                    arrival_seconds=_time_seconds(arrival_time),
                    departure_seconds=_time_seconds(departure_time),
                )
            )
            imported += 1
            if len(objects) >= 5000:
                session.bulk_save_objects(objects)
                objects.clear()
    if objects:
        session.bulk_save_objects(objects)
    # Sort on the sequence number alone. sorted() is stable, so rows that share
    # a stop_sequence keep file order instead of being ordered by stop id.
    ordered_stops = {
        trip: [stop for _, stop in sorted(pairs, key=lambda pair: pair[0])]
        for trip, pairs in stopseq_by_trip.items()
    }
    return ordered_stops, count, imported
def _import_routes(
session: Session,
dataset_id: int,
routes_raw: list[dict[str, str]],
agency_names: dict[str, str],
stops_by_id: dict[str, tuple[float, float, str]],
trips_by_route: dict[str, list[str]],
first_shape_by_route: dict[str, str],
first_trip_by_route: dict[str, str],
shapes_by_id: dict[str, list[tuple[float, float]]],
stopseq_by_trip: dict[str, list[str]],
) -> int:
objects: list[GtfsRoute] = []
for row in routes_raw:
route_id = row.get("route_id", "")
if not route_id:
continue
route_type = _int_or_none(row.get("route_type"))
mode = _gtfs_mode(route_type)
agency_id = row.get("agency_id") or None
operator = agency_names.get(agency_id or "", agency_id or "")
short_name = row.get("route_short_name") or None
long_name = row.get("route_long_name") or None
route_scope = infer_osm_route_scope(mode=mode, ref=short_name, name=long_name, network=operator)
geometry = _route_geometry(route_id, first_shape_by_route, first_trip_by_route, shapes_by_id, stopseq_by_trip, stops_by_id)
geometry_text, bbox = geometry_json_and_bbox(geometry) if geometry is not None else (None, (None, None, None, None))
route_key = norm_ref(short_name) or norm_text(long_name) or norm_ref(route_id)
objects.append(
GtfsRoute(
dataset_id=dataset_id,
route_id=route_id,
agency_id=agency_id,
short_name=short_name,
long_name=long_name,
route_type=route_type,
mode=mode,
route_scope=route_scope,
operator_name=operator or None,
geometry_geojson=geometry_text,
min_lon=bbox[0],
min_lat=bbox[1],
max_lon=bbox[2],
max_lat=bbox[3],
route_key=route_key,
operator_key=norm_text(operator),
)
)
if objects:
session.bulk_save_objects(objects)
return len(objects)
def _route_geometry(
route_id: str,
first_shape_by_route: dict[str, str],
first_trip_by_route: dict[str, str],
shapes_by_id: dict[str, list[tuple[float, float]]],
stopseq_by_trip: dict[str, list[str]],
stops_by_id: dict[str, tuple[float, float, str]],
) -> Optional[LineString]:
shape_id = first_shape_by_route.get(route_id)
coords = shapes_by_id.get(shape_id or "", [])
if len(coords) >= 2:
return LineString(coords)
trip_id = first_trip_by_route.get(route_id)
stop_ids = stopseq_by_trip.get(trip_id or "", [])
fallback = [(stops_by_id[sid][0], stops_by_id[sid][1]) for sid in stop_ids if sid in stops_by_id]
if len(fallback) >= 2:
return LineString(fallback)
return None
def _float_or_none(value: object) -> Optional[float]:
try:
if value is None or str(value).strip() == "":
return None
return float(str(value))
except ValueError:
return None
def _int_or_none(value: object) -> Optional[int]:
try:
if value is None or str(value).strip() == "":
return None
return int(float(str(value)))
except ValueError:
return None
def _bool_flag(value: object) -> bool:
return str(value or "").strip() in {"1", "true", "True", "TRUE", "yes"}
def _time_seconds(value: str | None) -> Optional[int]:
if not value:
return None
parts = value.strip().split(":")
if len(parts) == 2:
parts.append("0")
if len(parts) != 3:
return None
try:
hours, minutes, seconds = [int(part) for part in parts]
except ValueError:
return None
if hours < 0 or minutes < 0 or minutes > 59 or seconds < 0 or seconds > 59:
return None
return hours * 3600 + minutes * 60 + seconds
def _gtfs_mode(route_type: Optional[int]) -> str:
if route_type is None:
return "unknown"
if route_type in GTFS_MODE:
return GTFS_MODE[route_type]
for start, end, mode in GTFS_EXTENDED_MODE_RANGES:
if start <= route_type <= end:
return mode
return "unknown"
def _dataset_importer_version(dataset: Dataset) -> str:
try:
return str(json.loads(dataset.metadata_json or "{}").get("importer") or "")
except json.JSONDecodeError:
return ""