1328 lines
53 KiB
Python
1328 lines
53 KiB
Python
from __future__ import annotations
|
|
|
|
import csv
|
|
import io
|
|
import json
|
|
import sqlite3
|
|
import zipfile
|
|
from collections import defaultdict
|
|
from collections.abc import Callable
|
|
from pathlib import Path
|
|
from typing import Any, Iterator, Optional
|
|
|
|
from shapely.geometry import LineString
|
|
from sqlalchemy import func, select, text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.gtfs_storage import GTFS_STORAGE_MAIN, GTFS_STORAGE_METADATA_KEY, GTFS_STORAGE_SIDECAR_STOP_TIMES, effective_gtfs_timetable_storage
|
|
from app.models import (
|
|
Dataset,
|
|
GtfsAgency,
|
|
GtfsCalendar,
|
|
GtfsCalendarDate,
|
|
GtfsRoute,
|
|
GtfsShape,
|
|
GtfsStop,
|
|
GtfsStopTime,
|
|
GtfsTrip,
|
|
Source,
|
|
)
|
|
from app.osm_classification import infer_osm_route_scope
|
|
from app.performance import measure_pipeline_phase
|
|
from app.pipeline.download import materialize_source
|
|
from app.pipeline.utils import first_nonempty, geometry_json_and_bbox, norm_ref, norm_text, sha256_file
|
|
from app.spatial import analyze_postgresql_tables, refresh_postgis_geometries
|
|
|
|
|
|
# Maps integer GTFS ``route_type`` codes to this importer's mode names.
# presumably consulted by _gtfs_mode(), which is not visible here — TODO confirm.
GTFS_MODE = {
    0: "tram",
    1: "subway",
    2: "train",
    3: "bus",
    4: "ferry",
    5: "cable_tram",
    6: "aerialway",
    7: "funicular",
    11: "trolleybus",
    12: "monorail",
}

# (low, high, mode) triples for extended ``route_type`` codes.
# NOTE(review): the bounds look inclusive, but the consumer is not visible
# here — confirm.
# NOTE(review): some rows look inconsistent with Google's published extended
# route types, which list 800-899 as trolleybus, 1100-1199 as air service,
# 1200-1299 as ferry, 1400-1499 as funicular and 1500-1599 as taxi — verify
# against the spec before relying on these mappings.
GTFS_EXTENDED_MODE_RANGES = [
    (100, 199, "train"),
    (400, 499, "subway"),
    (700, 799, "bus"),
    (900, 999, "tram"),
    (1000, 1099, "ferry"),
    (1100, 1199, "aerialway"),
    (1200, 1299, "funicular"),
    (1300, 1399, "aerialway"),
    (1400, 1499, "monorail"),
    (1500, 1599, "trolleybus"),
]

# Stored under the "importer" key of the staging summary; run_gtfs_source
# compares it with the version recorded on an existing dataset to decide
# whether that dataset can be reused without re-importing.
GTFS_IMPORTER_VERSION = "gtfs_import_v6_sidecar_stop_times"

# _stage_gtfs_zip reports members of this set that are absent from the archive
# as "missing_required_files" in its summary; it does not raise for them.
REQUIRED_FILES = {"agency.txt", "stops.txt", "routes.txt", "trips.txt", "stop_times.txt"}
# Rows per executemany()/fetchmany() batch when staging and activating tables.
GTFS_STAGE_BATCH_SIZE = 50_000
# Signature of the progress hooks passed through the importer.
# NOTE(review): judging by the _emit_progress call sites the arguments appear
# to be (event name, message, current count, total, details payload) — confirm
# against _emit_progress, which is not visible here.
ProgressCallback = Callable[[str, str, int | None, int | None, dict[str, Any] | None], None]
|
|
|
|
|
|
def run_gtfs_source(session: Session, source: Source, progress_callback: ProgressCallback | None = None) -> Dataset:
    """Import the GTFS feed of ``source`` unless an identical import is already active.

    The source is materialised to a local file and hashed. If the newest
    active dataset of kind ``"gtfs"`` with status ``"imported"`` for this
    source has the same hash and was written by the current importer version,
    it is returned unchanged; otherwise the archive goes through
    ``import_gtfs_zip``.

    Args:
        session: Database session used for the lookup and the import.
        source: Source whose feed should be imported.
        progress_callback: Optional progress hook forwarded to the import.

    Returns:
        The reused or newly imported dataset.
    """
    archive_path = materialize_source(source)
    digest = sha256_file(archive_path)

    lookup = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "gtfs",
            Dataset.sha256 == digest,
            Dataset.is_active.is_(True),
            Dataset.status == "imported",
        )
        .order_by(Dataset.id.desc())
    )
    current = session.scalar(lookup)

    # Reuse only datasets written by this importer version; older ones are re-imported.
    reusable = current is not None and _dataset_importer_version(current) == GTFS_IMPORTER_VERSION
    if not reusable:
        return import_gtfs_zip(
            session=session,
            source=source,
            zip_path=archive_path,
            source_hash=digest,
            progress_callback=progress_callback,
        )
    return current
|
|
|
|
|
|
def import_gtfs_zip(
    session: Session,
    source: Source,
    zip_path: Path,
    source_hash: str | None = None,
    progress_callback: ProgressCallback | None = None,
) -> Dataset:
    """Stage a GTFS zip into SQLite and activate it as a new dataset.

    An inactive ``Dataset`` row with status ``"staging"`` is created and
    committed first, so a failure can be recorded on it. Any exception raised
    while staging or activating (``BaseException`` included) rolls the session
    back, marks that row ``"failed"``, commits, and is re-raised.

    Args:
        session: Database session; flushed but not committed on success.
        source: Source the archive belongs to.
        zip_path: Local path of the GTFS archive.
        source_hash: Precomputed SHA-256 of the archive; computed when falsy.
        progress_callback: Optional progress hook.

    Returns:
        The dataset row created for this import.

    Raises:
        ValueError: If ``zip_path`` is not a zip file.
    """
    if not zipfile.is_zipfile(zip_path):
        raise ValueError(f"GTFS source is not a zip file: {zip_path}")

    digest = source_hash or sha256_file(zip_path)
    dataset = Dataset(
        source_id=source.id,
        kind="gtfs",
        local_path=str(zip_path),
        sha256=digest,
        is_active=False,
        status="staging",
    )
    session.add(dataset)
    session.flush()
    session.commit()

    stage_path = _gtfs_stage_path(source, dataset, zip_path)
    _emit_progress(
        progress_callback,
        "gtfs_staging_started",
        f"Staging GTFS zip {zip_path.name}.",
        0,
        None,
        {"stage_path": str(stage_path)},
    )
    try:
        staging_metadata = {"zip_path": str(zip_path), "stage_path": str(stage_path)}
        with measure_pipeline_phase(
            "gtfs_staging", source_id=source.id, dataset_id=dataset.id, metadata=staging_metadata
        ) as staging_metric:
            stage_summary = _stage_gtfs_zip(zip_path, stage_path, progress_callback=progress_callback)
            staging_metric.update(stage_summary)

        # In sidecar mode this moves the stage file and returns its new location.
        activation_path = _prepare_gtfs_activation_path(source, dataset, stage_path, stage_summary)
        _emit_progress(
            progress_callback,
            "gtfs_activation_started",
            "Activating staged GTFS dataset.",
            None,
            None,
            {"stage_path": str(activation_path)},
        )
        activation_metadata = {"stage_path": str(activation_path)}
        with measure_pipeline_phase(
            "gtfs_activation", source_id=source.id, dataset_id=dataset.id, metadata=activation_metadata
        ) as activation_metric:
            _activate_staged_gtfs(
                session,
                source,
                dataset,
                activation_path,
                stage_summary,
                progress_callback=progress_callback,
            )
            activation_metric.update(stage_summary)
    except BaseException:
        # Discard partial work, then persist the failure on the dataset row.
        session.rollback()
        failed = session.get(Dataset, dataset.id)
        if failed is not None:
            failed.status = "failed"
            failed.is_active = False
            session.commit()
        raise

    source.status = "ok"
    source.last_error = None
    session.flush()
    _emit_progress(
        progress_callback,
        "gtfs_activation_completed",
        f"Activated GTFS dataset #{dataset.id}.",
        None,
        None,
        {"dataset_id": dataset.id},
    )
    return dataset
|
|
|
|
|
|
def backfill_gtfs_shapes(session: Session, dataset_id: int | None = None) -> dict:
    """Import ``shapes.txt`` for GTFS datasets that have no shape rows yet.

    Args:
        session: Database session; flushed after each dataset that is backfilled.
        dataset_id: Restrict the run to this dataset. When ``None``, all active
            GTFS datasets are considered.

    Returns:
        ``{"datasets": [...]}`` with one entry per dataset whose ``"status"``
        is ``"skipped"``, ``"missing_zip"``, ``"no_shapes_txt"`` or ``"imported"``.
    """
    query = select(Dataset).where(Dataset.kind == "gtfs")
    scope = Dataset.id == dataset_id if dataset_id is not None else Dataset.is_active.is_(True)
    query = query.where(scope).order_by(Dataset.id)

    report = []
    for dataset in session.scalars(query).all():
        shape_rows = session.scalar(
            select(func.count()).select_from(GtfsShape).where(GtfsShape.dataset_id == dataset.id)
        ) or 0
        if shape_rows:
            report.append({"dataset_id": dataset.id, "status": "skipped", "shapes": shape_rows})
            continue

        archive = Path(dataset.local_path)
        if not (archive.exists() and zipfile.is_zipfile(archive)):
            report.append({"dataset_id": dataset.id, "status": "missing_zip", "path": str(archive)})
            continue

        with zipfile.ZipFile(archive) as zf:
            # Index members by basename so feeds zipped inside a sub-folder still resolve.
            names = {Path(member).name: member for member in zf.namelist() if not member.endswith("/")}
            if "shapes.txt" not in names:
                report.append({"dataset_id": dataset.id, "status": "no_shapes_txt", "shapes": 0})
                continue
            shapes_by_id = _read_shapes(zf, names)
            imported = _import_shapes(session, dataset.id, shapes_by_id)
            _record_importer_metadata(dataset, shapes_count=imported)
            session.flush()
            report.append({"dataset_id": dataset.id, "status": "imported", "shapes": imported})
    return {"datasets": report}
|
|
|
|
|
|
def _gtfs_stage_path(source: Source, dataset: Dataset, zip_path: Path) -> Path:
    """Return the SQLite staging file path for ``dataset`` under the data directory.

    The file name embeds the dataset id and the first 12 characters of the
    dataset hash; the hash is computed from ``zip_path`` when the dataset has none.
    """
    digest = dataset.sha256 or sha256_file(zip_path)
    file_name = f"gtfs_dataset_{dataset.id}_{digest[:12]}.sqlite"
    source_dir = settings.data_dir / "staging" / f"source_{source.id}"
    return source_dir / file_name
|
|
|
|
|
|
def _gtfs_sidecar_path(source: Source, dataset: Dataset) -> Path:
    """Return the sidecar SQLite path for ``dataset`` under the data directory.

    The file name embeds the dataset id and the first 12 characters of the
    dataset hash, or of the literal ``"unknown"`` when no hash is stored.
    """
    digest = dataset.sha256 or "unknown"
    file_name = f"gtfs_dataset_{dataset.id}_{digest[:12]}.sqlite"
    source_dir = settings.data_dir / "sidecars" / f"source_{source.id}"
    return source_dir / file_name
|
|
|
|
|
|
def _gtfs_timetable_storage_mode() -> str:
    """Return the timetable storage mode reported by ``effective_gtfs_timetable_storage``."""
    storage_mode = effective_gtfs_timetable_storage()
    return storage_mode
|
|
|
|
|
|
def _prepare_gtfs_activation_path(source: Source, dataset: Dataset, stage_path: Path, summary: dict[str, Any]) -> Path:
    """Decide where the staged database lives for activation and record it in ``summary``.

    In sidecar mode the staging file is moved to the sidecar location
    (replacing any file already there) and ``summary`` is updated with the new
    path. In every mode the per-table storage layout is written under
    ``GTFS_STORAGE_METADATA_KEY``.

    Args:
        source: Source the dataset belongs to.
        dataset: Dataset being activated.
        stage_path: Path of the freshly written staging database.
        summary: Staging summary; mutated in place.

    Returns:
        The path to activate from: the sidecar path in sidecar mode, otherwise
        ``stage_path``.
    """
    use_sidecar = _gtfs_timetable_storage_mode() == GTFS_STORAGE_SIDECAR_STOP_TIMES

    # Only stop_times ever lives outside the main database.
    table_locations = {
        "gtfs_stop_times": "sidecar" if use_sidecar else "main",
        "gtfs_agencies": "main",
        "gtfs_stops": "main",
        "gtfs_routes": "main",
        "gtfs_trips": "main",
        "gtfs_calendars": "main",
        "gtfs_calendar_dates": "main",
        "gtfs_shapes": "main",
    }

    if not use_sidecar:
        summary[GTFS_STORAGE_METADATA_KEY] = {
            "mode": GTFS_STORAGE_MAIN,
            "tables": table_locations,
        }
        return stage_path

    sidecar_path = _gtfs_sidecar_path(source, dataset)
    sidecar_path.parent.mkdir(parents=True, exist_ok=True)
    if sidecar_path.exists():
        sidecar_path.unlink()
    stage_path.replace(sidecar_path)

    summary["stage_path"] = str(sidecar_path)
    summary["staging"] = "sqlite_promoted_to_sidecar"
    summary[GTFS_STORAGE_METADATA_KEY] = {
        "mode": GTFS_STORAGE_SIDECAR_STOP_TIMES,
        "sidecar_path": str(sidecar_path),
        "tables": table_locations,
    }
    return sidecar_path
|
|
|
|
|
|
def _stage_gtfs_zip(zip_path: Path, stage_path: Path, progress_callback: ProgressCallback | None = None) -> dict[str, Any]:
    """Load a GTFS zip into a fresh SQLite staging database.

    Any existing file at ``stage_path`` is removed first. The feed files are
    staged in a fixed order (agencies, calendars, calendar dates, stops, trips,
    shapes, stop times, routes) because route staging consumes the lookups
    produced by the earlier steps. Indexes are built last and the connection is
    committed once at the end.

    Required files missing from the archive are only reported in the summary
    under ``"missing_required_files"``; this function does not raise for them.

    Args:
        zip_path: GTFS archive to read.
        stage_path: SQLite file to (re)create.
        progress_callback: Optional progress hook.

    Returns:
        Summary dict with the importer version, the stage path, per-table row
        counts and the stop-times import limit taken from settings.
    """
    if stage_path.exists():
        stage_path.unlink()
    stage_path.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(stage_path)
    try:
        _configure_stage_connection(connection)
        _create_gtfs_stage_schema(connection)
        with zipfile.ZipFile(zip_path) as zf:
            # Index members by basename so feeds zipped inside a sub-folder still resolve.
            names = {Path(name).name: name for name in zf.namelist() if not name.endswith("/")}
            missing = sorted(REQUIRED_FILES - set(names.keys()))
            agency_names = _stage_agencies(connection, zf, names, progress_callback)
            calendars_count = _stage_calendars(connection, zf, names, progress_callback)
            calendar_dates_count = _stage_calendar_dates(connection, zf, names, progress_callback)
            stops_by_id, stops_count = _stage_stops(connection, zf, names, progress_callback)
            trips_by_route, first_shape_by_route, first_trip_by_route, trips_count = _stage_trips(connection, zf, names, progress_callback)
            shapes_by_id = _read_shapes_with_progress(zf, names, progress_callback)
            shapes_count = _stage_shapes(connection, shapes_by_id, progress_callback)
            stopseq_by_trip, stop_times_seen, stop_times_imported = _stage_stop_times(
                connection,
                zf,
                names,
                first_trip_ids=set(first_trip_by_route.values()),
                progress_callback=progress_callback,
            )
            routes_count = _stage_routes(
                connection=connection,
                routes_raw=list(_read_gtfs_csv(zf, names, "routes.txt")),
                agency_names=agency_names,
                stops_by_id=stops_by_id,
                trips_by_route=trips_by_route,
                first_shape_by_route=first_shape_by_route,
                first_trip_by_route=first_trip_by_route,
                shapes_by_id=shapes_by_id,
                stopseq_by_trip=stopseq_by_trip,
                progress_callback=progress_callback,
            )
        _create_gtfs_stage_indexes(connection, progress_callback)
        connection.commit()
        summary = {
            "importer": GTFS_IMPORTER_VERSION,
            "stage_path": str(stage_path),
            "missing_required_files": missing,
            # Counts distinct agency ids (the dict keys), not staged rows.
            "agencies": len(agency_names),
            "stops": stops_count,
            "routes": routes_count,
            "trips": trips_count,
            "calendars": calendars_count,
            "calendar_dates": calendar_dates_count,
            "shapes": shapes_count,
            "stop_times_seen": stop_times_seen,
            "stop_times_imported": stop_times_imported,
            "stop_times_import_limit": settings.gtfs_stop_times_import_limit,
            "staging": "sqlite",
        }
        _emit_progress(progress_callback, "gtfs_staging_completed", "GTFS staging completed.", None, None, summary)
        return summary
    finally:
        connection.close()
|
|
|
|
|
|
def _configure_stage_connection(connection: sqlite3.Connection) -> None:
    """Apply the staging PRAGMAs to ``connection``.

    Turns the rollback journal and synchronous writes off, keeps temporary
    storage in memory and takes an exclusive lock on the database.
    """
    pragmas = (
        "PRAGMA journal_mode=OFF",
        "PRAGMA synchronous=OFF",
        "PRAGMA temp_store=MEMORY",
        "PRAGMA locking_mode=EXCLUSIVE",
    )
    for pragma in pragmas:
        connection.execute(pragma)
|
|
|
|
|
|
def _create_gtfs_stage_schema(connection: sqlite3.Connection) -> None:
    """Create the empty GTFS staging tables on ``connection``.

    The tables carry no primary keys, foreign keys or indexes; lookup indexes
    are added separately by ``_create_gtfs_stage_indexes``. The statements have
    no ``IF NOT EXISTS`` clause, so the database is expected to be fresh.
    """
    connection.executescript(
        """
        CREATE TABLE gtfs_agencies (
            agency_id TEXT NOT NULL,
            name TEXT NOT NULL,
            url TEXT,
            timezone TEXT
        );
        CREATE TABLE gtfs_stops (
            stop_id TEXT NOT NULL,
            name TEXT,
            lat REAL,
            lon REAL,
            parent_station TEXT
        );
        CREATE TABLE gtfs_routes (
            route_id TEXT NOT NULL,
            agency_id TEXT,
            short_name TEXT,
            long_name TEXT,
            route_type INTEGER,
            mode TEXT,
            route_scope TEXT,
            operator_name TEXT,
            geometry_geojson TEXT,
            min_lon REAL,
            min_lat REAL,
            max_lon REAL,
            max_lat REAL,
            route_key TEXT,
            operator_key TEXT
        );
        CREATE TABLE gtfs_trips (
            route_id TEXT NOT NULL,
            trip_id TEXT NOT NULL,
            service_id TEXT,
            shape_id TEXT
        );
        CREATE TABLE gtfs_calendars (
            service_id TEXT NOT NULL,
            monday INTEGER NOT NULL,
            tuesday INTEGER NOT NULL,
            wednesday INTEGER NOT NULL,
            thursday INTEGER NOT NULL,
            friday INTEGER NOT NULL,
            saturday INTEGER NOT NULL,
            sunday INTEGER NOT NULL,
            start_date INTEGER NOT NULL,
            end_date INTEGER NOT NULL
        );
        CREATE TABLE gtfs_calendar_dates (
            service_id TEXT NOT NULL,
            date INTEGER NOT NULL,
            exception_type INTEGER NOT NULL
        );
        CREATE TABLE gtfs_shapes (
            shape_id TEXT NOT NULL,
            geometry_geojson TEXT NOT NULL,
            min_lon REAL,
            min_lat REAL,
            max_lon REAL,
            max_lat REAL
        );
        CREATE TABLE gtfs_stop_times (
            trip_id TEXT NOT NULL,
            stop_id TEXT NOT NULL,
            stop_sequence INTEGER NOT NULL,
            arrival_time TEXT,
            departure_time TEXT,
            arrival_seconds INTEGER,
            departure_seconds INTEGER
        );
        """
    )
|
|
|
|
|
|
def _create_gtfs_stage_indexes(connection: sqlite3.Connection, progress_callback: ProgressCallback | None = None) -> None:
    """Build the lookup indexes on the staged stop-time and trip tables.

    Emits a progress event before and after. Every statement uses
    ``IF NOT EXISTS``, so repeating the call is harmless.
    """
    index_specs = (
        ("ix_stage_gtfs_stop_times_stop_depart_trip", "gtfs_stop_times", "stop_id, departure_seconds, trip_id, stop_sequence"),
        ("ix_stage_gtfs_stop_times_stop_arrive_trip", "gtfs_stop_times", "stop_id, arrival_seconds, trip_id, stop_sequence"),
        ("ix_stage_gtfs_stop_times_trip_seq", "gtfs_stop_times", "trip_id, stop_sequence"),
        ("ix_stage_gtfs_stop_times_trip_stop_seq", "gtfs_stop_times", "trip_id, stop_id, stop_sequence"),
        ("ix_stage_gtfs_trips_trip", "gtfs_trips", "trip_id"),
        ("ix_stage_gtfs_trips_service_trip", "gtfs_trips", "service_id, trip_id"),
        ("ix_stage_gtfs_trips_route_service", "gtfs_trips", "route_id, service_id"),
    )
    _emit_progress(progress_callback, "gtfs_stage_indexes_started", "Building GTFS stage indexes.", None, None, None)
    for index_name, table, indexed_columns in index_specs:
        connection.execute(f"CREATE INDEX IF NOT EXISTS {index_name} ON {table} ({indexed_columns})")
    _emit_progress(progress_callback, "gtfs_stage_indexes_completed", "Built GTFS stage indexes.", None, None, None)
|
|
|
|
|
|
def _activate_staged_gtfs(
    session: Session,
    source: Source,
    dataset: Dataset,
    stage_path: Path,
    summary: dict[str, Any],
    progress_callback: ProgressCallback | None = None,
) -> None:
    """Copy a staged GTFS SQLite database into the main database and activate it.

    Steps, in order:

    1. Deactivate every other dataset of the source and prune the replaced
       GTFS datasets (update checks, rows, files, then the dataset row).
    2. Copy each staged table into the main database. ``gtfs_stop_times`` is
       skipped when ``summary`` says it stays in sidecar storage.
    3. Mark the dataset imported and active, store ``summary`` as its metadata
       and refresh geometries and table statistics.
    4. Delete the staging file, unless stop times live in it as a sidecar or
       ``settings.gtfs_keep_activation_stage`` is set.

    When ``_should_drop_indexes_for_activation`` asks for it, the heavy lookup
    indexes are dropped before the copy and always rebuilt afterwards, even
    if the copy fails.

    Args:
        session: Database session; flushed but not committed here.
        source: Source that owns the dataset.
        dataset: Dataset row being activated.
        stage_path: Staging (or sidecar) SQLite database to read from.
        summary: Staging summary, serialised into ``dataset.metadata_json``.
        progress_callback: Optional progress hook.

    Raises:
        FileNotFoundError: If ``stage_path`` does not exist.
    """
    if not stage_path.exists():
        raise FileNotFoundError(f"GTFS staging database is missing: {stage_path}")

    # Copy order matches the original call sequence; routes go last.
    stage_tables: list[tuple[str, list[str]]] = [
        ("gtfs_agencies", ["agency_id", "name", "url", "timezone"]),
        ("gtfs_stops", ["stop_id", "name", "lat", "lon", "parent_station"]),
        (
            "gtfs_calendars",
            ["service_id", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "start_date", "end_date"],
        ),
        ("gtfs_calendar_dates", ["service_id", "date", "exception_type"]),
        ("gtfs_trips", ["route_id", "trip_id", "service_id", "shape_id"]),
        ("gtfs_shapes", ["shape_id", "geometry_geojson", "min_lon", "min_lat", "max_lon", "max_lat"]),
        (
            "gtfs_stop_times",
            ["trip_id", "stop_id", "stop_sequence", "arrival_time", "departure_time", "arrival_seconds", "departure_seconds"],
        ),
        (
            "gtfs_routes",
            [
                "route_id",
                "agency_id",
                "short_name",
                "long_name",
                "route_type",
                "mode",
                "route_scope",
                "operator_name",
                "geometry_geojson",
                "min_lon",
                "min_lat",
                "max_lon",
                "max_lat",
                "route_key",
                "operator_key",
            ],
        ),
    ]

    dataset = session.get(Dataset, dataset.id) or dataset
    source = session.get(Source, source.id) or source
    replaced_datasets = [existing for existing in list(source.datasets) if existing.id != dataset.id and existing.kind == "gtfs"]
    for existing in source.datasets:
        if existing.id != dataset.id:
            existing.is_active = False

    copy_stop_times = _copy_stop_times_to_main(summary)
    heavy_index_drop = copy_stop_times and _should_drop_indexes_for_activation(stage_path)
    if heavy_index_drop:
        _emit_progress(progress_callback, "gtfs_activation_indexes_dropped", "Dropping heavy GTFS lookup indexes before bulk activation.", None, None, None)
        _drop_gtfs_bulk_indexes(session.connection())
    try:
        if replaced_datasets:
            _emit_progress(
                progress_callback,
                "gtfs_activation_pruning_replaced",
                f"Pruning {len(replaced_datasets)} replaced GTFS dataset(s) before activation.",
                None,
                None,
                {"dataset_ids": [old_dataset.id for old_dataset in replaced_datasets]},
            )
            # Imported here as in the original, presumably to avoid an import cycle.
            from app.data_management import _delete_dataset_files, _delete_dataset_rows, _detach_update_checks_for_dataset

            for old_dataset in replaced_datasets:
                _detach_update_checks_for_dataset(session, old_dataset.id)
                _delete_dataset_rows(session, old_dataset)
                _delete_dataset_files(old_dataset)
                session.delete(old_dataset)

        # sqlite3's connection context manager only commits or rolls back; it
        # never closes the handle, so close it explicitly before the staging
        # file is deleted below.
        stage_connection = sqlite3.connect(stage_path)
        try:
            for table, columns in stage_tables:
                if table == "gtfs_stop_times" and not copy_stop_times:
                    _emit_progress(
                        progress_callback,
                        "gtfs_activation_sidecar_stop_times",
                        "Kept gtfs_stop_times in sidecar storage.",
                        None,
                        None,
                        {"table": "gtfs_stop_times", "sidecar_path": str(stage_path)},
                    )
                    continue
                _copy_stage_table(session, stage_connection, dataset.id, table, columns, progress_callback)
        finally:
            stage_connection.close()
    finally:
        if heavy_index_drop:
            _emit_progress(progress_callback, "gtfs_activation_indexes_rebuilding", "Rebuilding GTFS lookup indexes after bulk activation.", None, None, None)
            _create_gtfs_bulk_indexes(session.connection())

    dataset.status = "imported"
    dataset.is_active = True
    dataset.metadata_json = json.dumps(summary, indent=2)
    source.status = "ok"
    source.last_error = None
    session.flush()
    refresh_postgis_geometries(session, dataset_id=dataset.id, tables=["gtfs_stops", "gtfs_routes", "gtfs_shapes"])
    analyze_postgresql_tables(session, ["gtfs_stops", "gtfs_routes", "gtfs_shapes", "gtfs_trips", "gtfs_stop_times"])

    # In sidecar mode the staging file is the live stop_times store, so keep it.
    if copy_stop_times and not settings.gtfs_keep_activation_stage:
        stage_path.unlink(missing_ok=True)
|
|
|
|
|
|
def _copy_stop_times_to_main(summary: dict[str, Any]) -> bool:
    """Tell whether ``gtfs_stop_times`` must be copied into the main database.

    Reads the storage metadata stored in ``summary``. The per-table entry wins
    when a ``"tables"`` mapping is present; otherwise the overall ``"mode"``
    decides. Missing or malformed metadata means "copy".
    """
    storage = summary.get(GTFS_STORAGE_METADATA_KEY)
    if isinstance(storage, dict):
        table_locations = storage.get("tables")
        if isinstance(table_locations, dict):
            return table_locations.get("gtfs_stop_times") != "sidecar"
        return storage.get("mode") != GTFS_STORAGE_SIDECAR_STOP_TIMES
    return True
|
|
|
|
|
|
def _copy_stage_table(
    session: Session,
    stage_connection: sqlite3.Connection,
    dataset_id: int,
    table: str,
    columns: list[str],
    progress_callback: ProgressCallback | None,
) -> None:
    """Copy one staged table into the same-named main table in batches.

    Rows are read ``GTFS_STAGE_BATCH_SIZE`` at a time, tagged with
    ``dataset_id`` and inserted through ``session``. A progress event carrying
    the running total is emitted after every batch.

    Args:
        session: Target database session.
        stage_connection: Open connection to the staging database.
        dataset_id: Value written to the ``dataset_id`` column of every row.
        table: Table name, identical in both databases.
        columns: Columns to copy, in select order.
        progress_callback: Optional progress hook.
    """
    column_list = ", ".join(columns)
    bind_list = ", ".join(f":{name}" for name in ["dataset_id", *columns])
    insert_statement = text(f"INSERT INTO {table} (dataset_id, {column_list}) VALUES ({bind_list})")

    cursor = stage_connection.execute(f"SELECT {column_list} FROM {table}")
    total_copied = 0
    while batch := cursor.fetchmany(GTFS_STAGE_BATCH_SIZE):
        parameters = [{"dataset_id": dataset_id, **dict(zip(columns, staged_row))} for staged_row in batch]
        session.execute(insert_statement, parameters)
        total_copied += len(batch)
        _emit_progress(
            progress_callback,
            "gtfs_activation_chunk",
            f"Activated {table} chunk.",
            total_copied,
            None,
            {"table": table, "rows": total_copied},
        )
|
|
|
|
|
|
def _should_drop_indexes_for_activation(stage_path: Path) -> bool:
    """Decide whether the bulk indexes should be dropped before activation.

    Never drops on PostgreSQL. Otherwise the staged row counts are inspected
    and dropping is requested for large feeds: at least 250,000 stop times or
    100,000 trips. Any SQLite error while counting yields ``False``.

    Args:
        stage_path: Staging SQLite database to inspect.

    Returns:
        ``True`` when the caller should drop and later rebuild the indexes.
    """
    if settings.is_postgresql_database:
        return False
    try:
        connection = sqlite3.connect(stage_path)
        try:
            stop_times = connection.execute("SELECT COUNT(*) FROM gtfs_stop_times").fetchone()[0]
            trips = connection.execute("SELECT COUNT(*) FROM gtfs_trips").fetchone()[0]
        finally:
            # The sqlite3 context manager does not close connections, so
            # release the handle explicitly.
            connection.close()
    except sqlite3.Error:
        return False
    return int(stop_times or 0) >= 250_000 or int(trips or 0) >= 100_000
|
|
|
|
|
|
def _drop_gtfs_bulk_indexes(connection) -> None:
    """Drop the GTFS lookup indexes through ``connection.exec_driver_sql``.

    Each statement uses ``DROP INDEX IF EXISTS``, so indexes that are already
    absent are skipped silently.
    """
    bulk_index_names = (
        "ix_gtfs_stop_times_stop",
        "ix_gtfs_stop_times_stop_depart_trip",
        "ix_gtfs_stop_times_stop_arrival",
        "ix_gtfs_stop_times_stop_arrive_trip",
        "ix_gtfs_stop_times_trip_seq",
        "ix_gtfs_stop_times_trip_stop_seq",
        "ix_gtfs_trips_dataset_trip",
        "ix_gtfs_trips_dataset_route",
        "ix_gtfs_trips_dataset_service",
        "ix_gtfs_trips_dataset_route_service",
        "ix_gtfs_routes_dataset_route",
        "ix_gtfs_shapes_dataset_shape",
        "ix_gtfs_calendars_dataset_service_dates",
        "ix_gtfs_calendar_dates_dataset_date",
    )
    drop_statements = [f"DROP INDEX IF EXISTS {name}" for name in bulk_index_names]
    for drop_statement in drop_statements:
        connection.exec_driver_sql(drop_statement)
|
|
|
|
|
|
def _create_gtfs_bulk_indexes(connection) -> None:
    """Create the GTFS lookup indexes through ``connection.exec_driver_sql``.

    Every index starts with ``dataset_id``; the remaining columns are listed
    per index below. Statements use ``IF NOT EXISTS``, so existing indexes are
    left untouched.
    """
    # (index name, table, columns following dataset_id)
    index_specs = (
        ("ix_gtfs_stop_times_stop", "gtfs_stop_times", "stop_id, departure_seconds, trip_id, stop_sequence"),
        ("ix_gtfs_stop_times_stop_depart_trip", "gtfs_stop_times", "stop_id, departure_seconds, trip_id"),
        ("ix_gtfs_stop_times_stop_arrival", "gtfs_stop_times", "stop_id, arrival_seconds, trip_id, stop_sequence"),
        ("ix_gtfs_stop_times_stop_arrive_trip", "gtfs_stop_times", "stop_id, arrival_seconds, trip_id"),
        ("ix_gtfs_stop_times_trip_seq", "gtfs_stop_times", "trip_id, stop_sequence"),
        ("ix_gtfs_stop_times_trip_stop_seq", "gtfs_stop_times", "trip_id, stop_id, stop_sequence"),
        ("ix_gtfs_trips_dataset_trip", "gtfs_trips", "trip_id"),
        ("ix_gtfs_trips_dataset_route", "gtfs_trips", "route_id"),
        ("ix_gtfs_trips_dataset_service", "gtfs_trips", "service_id, trip_id"),
        ("ix_gtfs_trips_dataset_route_service", "gtfs_trips", "route_id, service_id"),
        ("ix_gtfs_routes_dataset_route", "gtfs_routes", "route_id"),
        ("ix_gtfs_shapes_dataset_shape", "gtfs_shapes", "shape_id"),
        ("ix_gtfs_calendars_dataset_service_dates", "gtfs_calendars", "service_id, start_date, end_date"),
        ("ix_gtfs_calendar_dates_dataset_date", "gtfs_calendar_dates", "date, service_id, exception_type"),
    )
    for index_name, table, trailing_columns in index_specs:
        connection.exec_driver_sql(
            f"CREATE INDEX IF NOT EXISTS {index_name} ON {table} (dataset_id, {trailing_columns})"
        )
|
|
|
|
|
|
def _stage_agencies(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> dict[str, str]:
    """Stage ``agency.txt`` and return a mapping of agency id to agency name.

    Rows without an ``agency_id`` get a positional ``agency_<index>`` id, and
    rows without a name fall back to their id.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading agency.txt.", None, None, {"file": "agency.txt"})
    name_by_agency: dict[str, str] = {}
    records = []
    for position, record in enumerate(_read_gtfs_csv(zf, names, "agency.txt")):
        agency_id = first_nonempty(record.get("agency_id"), f"agency_{position}")
        agency_name = first_nonempty(record.get("agency_name"), agency_id)
        name_by_agency[agency_id] = agency_name
        url = record.get("agency_url") or None
        timezone = record.get("agency_timezone") or None
        records.append((agency_id, agency_name, url, timezone))
    connection.executemany("INSERT INTO gtfs_agencies (agency_id, name, url, timezone) VALUES (?, ?, ?, ?)", records)
    staged = len(records)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported agency.txt.", staged, None, {"file": "agency.txt", "rows": staged})
    return name_by_agency
|
|
|
|
|
|
def _stage_calendars(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage ``calendar.txt`` and return the number of rows inserted.

    Rows lacking a service id or a parseable start or end date are skipped.
    """
    weekdays = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
    _emit_progress(progress_callback, "gtfs_file_started", "Reading calendar.txt.", None, None, {"file": "calendar.txt"})
    records = []
    for record in _read_gtfs_csv(zf, names, "calendar.txt"):
        service_id = record.get("service_id") or ""
        start_date = _int_or_none(record.get("start_date"))
        end_date = _int_or_none(record.get("end_date"))
        usable = bool(service_id) and start_date is not None and end_date is not None
        if not usable:
            continue
        day_flags = [int(_bool_flag(record.get(day))) for day in weekdays]
        records.append((service_id, *day_flags, start_date, end_date))
    connection.executemany(
        """
        INSERT INTO gtfs_calendars
        (service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        records,
    )
    staged = len(records)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported calendar.txt.", staged, None, {"file": "calendar.txt", "rows": staged})
    return staged
|
|
|
|
|
|
def _stage_calendar_dates(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage ``calendar_dates.txt`` through ``_stage_chunked_rows``.

    Only rows with a service id, a parseable date and a parseable exception
    type pass the validator. Returns whatever ``_stage_chunked_rows`` returns
    (annotated as ``int``; presumably the number of rows staged — confirm
    against that helper).
    """

    def to_record(row):
        service_id = row.get("service_id") or ""
        return (
            service_id,
            _int_or_none(row.get("date")),
            _int_or_none(row.get("exception_type")),
        )

    def is_complete(record):
        return bool(record[0]) and record[1] is not None and record[2] is not None

    return _stage_chunked_rows(
        connection=connection,
        zf=zf,
        names=names,
        basename="calendar_dates.txt",
        insert_sql="INSERT INTO gtfs_calendar_dates (service_id, date, exception_type) VALUES (?, ?, ?)",
        row_factory=to_record,
        validator=is_complete,
        progress_callback=progress_callback,
    )
|
|
|
|
|
|
def _stage_stops(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, tuple[float, float, str]], int]:
    """Stage ``stops.txt``.

    Returns:
        A pair of (lookup, staged row count). The lookup maps stop id to
        ``(lon, lat, label)`` and holds only stops with both coordinates; the
        label falls back to the stop id when the stop has no name. Stops
        without coordinates are still staged.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading stops.txt.", None, None, {"file": "stops.txt"})
    located_stops: dict[str, tuple[float, float, str]] = {}
    records = []
    for record in _read_gtfs_csv(zf, names, "stops.txt"):
        stop_id = record.get("stop_id", "")
        if not stop_id:
            continue
        lat = _float_or_none(record.get("stop_lat"))
        lon = _float_or_none(record.get("stop_lon"))
        stop_name = record.get("stop_name") or None
        has_position = lat is not None and lon is not None
        if has_position:
            located_stops[stop_id] = (lon, lat, stop_name or stop_id)
        records.append((stop_id, stop_name, lat, lon, record.get("parent_station") or None))
    connection.executemany("INSERT INTO gtfs_stops (stop_id, name, lat, lon, parent_station) VALUES (?, ?, ?, ?, ?)", records)
    staged = len(records)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported stops.txt.", staged, None, {"file": "stops.txt", "rows": staged})
    return located_stops, staged
|
|
|
|
|
|
def _stage_trips(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, list[str]], dict[str, str], dict[str, str], int]:
    """Stage ``trips.txt`` in batches of ``GTFS_STAGE_BATCH_SIZE`` rows.

    Rows without a route id or trip id are skipped.

    Returns:
        ``(trips_by_route, first_shape_by_route, first_trip_by_route, count)``:
        all trip ids per route, the first non-empty shape id seen per route,
        the first trip id seen per route, and the number of rows staged.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading trips.txt.", None, None, {"file": "trips.txt"})
    insert_sql = "INSERT INTO gtfs_trips (route_id, trip_id, service_id, shape_id) VALUES (?, ?, ?, ?)"
    route_trips: dict[str, list[str]] = defaultdict(list)
    route_first_shape: dict[str, str] = {}
    route_first_trip: dict[str, str] = {}
    pending = []
    staged = 0
    for record in _read_gtfs_csv(zf, names, "trips.txt"):
        route_id = record.get("route_id", "")
        trip_id = record.get("trip_id", "")
        if not (route_id and trip_id):
            continue
        route_trips[route_id].append(trip_id)
        route_first_trip.setdefault(route_id, trip_id)
        shape_id = record.get("shape_id") or ""
        if shape_id:
            route_first_shape.setdefault(route_id, shape_id)
        pending.append((route_id, trip_id, record.get("service_id") or None, shape_id or None))
        staged += 1
        if len(pending) < GTFS_STAGE_BATCH_SIZE:
            continue
        # Batch is full: flush it and report progress.
        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", "Imported trips.txt chunk.", staged, None, {"file": "trips.txt", "rows": staged})
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported trips.txt.", staged, None, {"file": "trips.txt", "rows": staged})
    return dict(route_trips), route_first_shape, route_first_trip, staged
|
|
|
|
|
|
def _read_shapes_with_progress(
    zf: zipfile.ZipFile,
    names: dict[str, str],
    progress_callback: ProgressCallback | None,
) -> dict[str, list[tuple[float, float]]]:
    """Read ``shapes.txt`` via ``_read_shapes``, emitting start and completion events."""
    _emit_progress(progress_callback, "gtfs_file_started", "Reading shapes.txt.", None, None, {"file": "shapes.txt"})
    shapes_by_id = _read_shapes(zf, names)
    shape_total = len(shapes_by_id)
    _emit_progress(
        progress_callback,
        "gtfs_file_completed",
        "Read shapes.txt.",
        shape_total,
        None,
        {"file": "shapes.txt", "shapes": shape_total},
    )
    return shapes_by_id
|
|
|
|
|
|
def _stage_shapes(
    connection: sqlite3.Connection,
    shapes_by_id: dict[str, list[tuple[float, float]]],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage shape geometries as line strings and return the number staged.

    Shapes with fewer than two coordinates, or for which
    ``geometry_json_and_bbox`` returns no geometry text, are skipped. Rows are
    written in batches of 5000.
    """
    insert_sql = "INSERT INTO gtfs_shapes (shape_id, geometry_geojson, min_lon, min_lat, max_lon, max_lat) VALUES (?, ?, ?, ?, ?, ?)"
    pending = []
    staged = 0
    for shape_id, coords in shapes_by_id.items():
        if len(coords) < 2:
            continue
        geometry_text, bbox = geometry_json_and_bbox(LineString(coords))
        if geometry_text is None:
            continue
        min_lon, min_lat, max_lon, max_lat = bbox[0], bbox[1], bbox[2], bbox[3]
        pending.append((shape_id, geometry_text, min_lon, min_lat, max_lon, max_lat))
        staged += 1
        if len(pending) < 5000:
            continue
        # Batch is full: flush it and report progress.
        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", "Imported shapes chunk.", staged, None, {"file": "shapes.txt", "rows": staged})
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported shapes.", staged, None, {"file": "shapes.txt", "rows": staged})
    return staged
|
|
|
|
|
|
def _stage_stop_times(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    first_trip_ids: set[str],
    progress_callback: ProgressCallback | None,
) -> tuple[dict[str, list[str]], int, int]:
    """Stage ``stop_times.txt`` in batches of ``GTFS_STAGE_BATCH_SIZE`` rows.

    Every row of the file is read even after the import limit is reached, so
    the seen-count and the stop sequences of the trips in ``first_trip_ids``
    stay complete. A limit of zero or less
    (``settings.gtfs_stop_times_import_limit``) means no limit.

    Returns:
        ``(stopseq_by_trip, seen, imported)``: ordered stop ids for each trip
        in ``first_trip_ids`` that has rows, the number of rows read, and the
        number of rows staged.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading stop_times.txt.", None, None, {"file": "stop_times.txt"})
    insert_sql = """
        INSERT INTO gtfs_stop_times
        (trip_id, stop_id, stop_sequence, arrival_time, departure_time, arrival_seconds, departure_seconds)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """
    sequences: dict[str, list[tuple[int, str]]] = defaultdict(list)
    pending = []
    seen = 0
    staged = 0
    limit = settings.gtfs_stop_times_import_limit
    for record in _read_gtfs_csv(zf, names, "stop_times.txt"):
        seen += 1
        trip_id = record.get("trip_id", "")
        stop_id = record.get("stop_id", "")
        sequence = _int_or_none(record.get("stop_sequence"))
        if not (trip_id and stop_id) or sequence is None:
            continue
        if trip_id in first_trip_ids:
            sequences[trip_id].append((sequence, stop_id))
        # Past the limit: keep counting and collecting sequences, stage nothing more.
        if limit > 0 and staged >= limit:
            continue
        arrival_time = record.get("arrival_time") or None
        departure_time = record.get("departure_time") or None
        pending.append(
            (
                trip_id,
                stop_id,
                sequence,
                arrival_time,
                departure_time,
                _time_seconds(arrival_time),
                _time_seconds(departure_time),
            )
        )
        staged += 1
        if len(pending) < GTFS_STAGE_BATCH_SIZE:
            continue
        connection.executemany(insert_sql, pending)
        pending.clear()
        _emit_progress(progress_callback, "gtfs_file_chunk", "Imported stop_times.txt chunk.", staged, None, {"file": "stop_times.txt", "rows": staged, "seen": seen})
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported stop_times.txt.", staged, None, {"file": "stop_times.txt", "rows": staged, "seen": seen})
    ordered_stops = {}
    for trip, pairs in sequences.items():
        # Sort by (stop_sequence, stop_id), then keep only the stop ids.
        ordered_stops[trip] = [stop for _, stop in sorted(pairs)]
    return ordered_stops, seen, staged
|
|
|
|
|
|
def _stage_routes(
    connection: sqlite3.Connection,
    routes_raw: list[dict[str, str]],
    agency_names: dict[str, str],
    stops_by_id: dict[str, tuple[float, float, str]],
    trips_by_route: dict[str, list[str]],
    first_shape_by_route: dict[str, str],
    first_trip_by_route: dict[str, str],
    shapes_by_id: dict[str, list[tuple[float, float]]],
    stopseq_by_trip: dict[str, list[str]],
    progress_callback: ProgressCallback | None,
) -> int:
    """Stage the parsed routes.txt rows into the SQLite ``gtfs_routes`` table.

    Rows without a ``route_id`` are skipped. Every remaining row is enriched
    with its mode, inferred route scope, operator name, the geometry returned
    by ``_route_geometry`` (plus bounding box) and normalised matching keys,
    then all rows are inserted with a single ``executemany`` call.

    ``trips_by_route`` is part of the signature but is not read here.

    Returns the number of rows inserted.
    """
    _emit_progress(progress_callback, "gtfs_file_started", "Reading routes.txt.", None, None, {"file": "routes.txt"})
    # GTFS allows routes.agency_id to be omitted when the feed defines exactly
    # one agency; in that case the route belongs to that agency.
    sole_agency_name = next(iter(agency_names.values())) if len(agency_names) == 1 else ""
    rows = []
    for row in routes_raw:
        route_id = row.get("route_id", "")
        if not route_id:
            continue
        route_type = _int_or_none(row.get("route_type"))
        mode = _gtfs_mode(route_type)
        agency_id = row.get("agency_id") or None
        # Unknown agency ids fall back to the raw id so the operator is never lost.
        operator = agency_names.get(agency_id or "", agency_id or "")
        if agency_id is None and not operator:
            operator = sole_agency_name
        short_name = row.get("route_short_name") or None
        long_name = row.get("route_long_name") or None
        route_scope = infer_osm_route_scope(mode=mode, ref=short_name, name=long_name, network=operator)
        geometry = _route_geometry(route_id, first_shape_by_route, first_trip_by_route, shapes_by_id, stopseq_by_trip, stops_by_id)
        geometry_text, bbox = geometry_json_and_bbox(geometry) if geometry is not None else (None, (None, None, None, None))
        rows.append(
            (
                route_id,
                agency_id,
                short_name,
                long_name,
                route_type,
                mode,
                route_scope,
                operator or None,
                geometry_text,
                bbox[0],
                bbox[1],
                bbox[2],
                bbox[3],
                norm_ref(short_name) or norm_text(long_name) or norm_ref(route_id),
                norm_text(operator),
            )
        )
    connection.executemany(
        """
        INSERT INTO gtfs_routes
        (route_id, agency_id, short_name, long_name, route_type, mode, route_scope, operator_name, geometry_geojson, min_lon, min_lat, max_lon, max_lat, route_key, operator_key)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    _emit_progress(progress_callback, "gtfs_file_completed", "Imported routes.txt.", len(rows), None, {"file": "routes.txt", "rows": len(rows)})
    return len(rows)
|
|
|
|
|
|
def _stage_chunked_rows(
    connection: sqlite3.Connection,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    basename: str,
    insert_sql: str,
    row_factory,
    validator,
    progress_callback: ProgressCallback | None,
) -> int:
    """Stream one GTFS file into SQLite in batches.

    Each raw CSV row is converted with ``row_factory``; rows rejected by
    ``validator`` are dropped. Accepted rows are inserted with ``insert_sql``
    whenever ``GTFS_STAGE_BATCH_SIZE`` rows have accumulated, and a progress
    event is emitted after every flushed batch.

    Returns the number of rows inserted.
    """
    _emit_progress(progress_callback, "gtfs_file_started", f"Reading {basename}.", None, None, {"file": basename})
    pending = []
    accepted = 0
    for raw in _read_gtfs_csv(zf, names, basename):
        candidate = row_factory(raw)
        if validator(candidate):
            pending.append(candidate)
            accepted += 1
            if len(pending) >= GTFS_STAGE_BATCH_SIZE:
                connection.executemany(insert_sql, pending)
                pending.clear()
                _emit_progress(progress_callback, "gtfs_file_chunk", f"Imported {basename} chunk.", accepted, None, {"file": basename, "rows": accepted})
    # Flush the final, partially filled batch.
    if pending:
        connection.executemany(insert_sql, pending)
    _emit_progress(progress_callback, "gtfs_file_completed", f"Imported {basename}.", accepted, None, {"file": basename, "rows": accepted})
    return accepted
|
|
|
|
|
|
def _emit_progress(
    progress_callback: ProgressCallback | None,
    event_type: str,
    message: str,
    progress_current: int | None = None,
    progress_total: int | None = None,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Forward a progress event to ``progress_callback``; do nothing when it is ``None``."""
    if progress_callback is None:
        return
    progress_callback(event_type, message, progress_current, progress_total, metadata)
|
|
|
|
|
|
def _read_gtfs_csv(zf: zipfile.ZipFile, names: dict[str, str], basename: str) -> Iterator[dict[str, str]]:
    """Lazily iterate over the rows of one CSV member of a GTFS archive.

    ``names`` maps a file basename to the member path inside ``zf``. When
    ``basename`` is not a key of ``names`` an empty iterator is returned.
    The member is decoded as UTF-8 (a leading BOM is dropped). Header names
    and cell values are stripped, missing cells become ``""`` and surplus
    cells without a header are discarded.
    """
    if basename not in names:
        return iter(())

    def _rows() -> Iterator[dict[str, str]]:
        with zf.open(names[basename], "r") as member:
            decoded = io.TextIOWrapper(member, encoding="utf-8-sig", newline="")
            for record in csv.DictReader(decoded):
                cleaned: dict[str, str] = {}
                for header, cell in record.items():
                    # DictReader collects surplus cells under the key None.
                    if header is None:
                        continue
                    cleaned[str(header).strip()] = (cell or "").strip()
                yield cleaned

    return _rows()
|
|
|
|
|
|
def _record_importer_metadata(dataset: Dataset, shapes_count: int | None = None) -> None:
    """Stamp the importer version (and optionally the shape count) into ``dataset.metadata_json``.

    Existing metadata is preserved when it parses to a JSON object. Malformed
    JSON, or JSON that is not an object (list, string, number, ``null``), is
    replaced by a fresh object so the update cannot fail on it.
    """
    metadata: dict[str, Any] = {}
    if dataset.metadata_json:
        try:
            loaded = json.loads(dataset.metadata_json)
        except json.JSONDecodeError:
            loaded = None
        # Only a JSON object can carry the keys written below.
        if isinstance(loaded, dict):
            metadata = loaded
    metadata["importer"] = GTFS_IMPORTER_VERSION
    if shapes_count is not None:
        metadata["shapes"] = shapes_count
    dataset.metadata_json = json.dumps(metadata, indent=2)
|
|
|
|
|
|
def _import_agencies(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> dict[str, str]:
    """Persist agency.txt rows and return a mapping of agency id to agency name.

    A row without ``agency_id`` gets the synthetic id ``agency_<index>``; a
    row without ``agency_name`` uses its id as the name.
    """
    names_by_id: dict[str, str] = {}
    pending: list[GtfsAgency] = []
    for position, record in enumerate(rows):
        agency_id = first_nonempty(record.get("agency_id"), f"agency_{position}")
        agency_name = first_nonempty(record.get("agency_name"), agency_id)
        names_by_id[agency_id] = agency_name
        agency = GtfsAgency(
            dataset_id=dataset_id,
            agency_id=agency_id,
            name=agency_name,
            url=record.get("agency_url") or None,
            timezone=record.get("agency_timezone") or None,
        )
        pending.append(agency)
    if pending:
        session.bulk_save_objects(pending)
    return names_by_id
|
|
|
|
|
|
def _import_calendars(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> int:
    """Persist calendar.txt rows and return how many were saved.

    Rows lacking a ``service_id`` or a parseable ``start_date``/``end_date``
    are skipped.
    """
    weekdays = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
    pending: list[GtfsCalendar] = []
    for record in rows:
        service_id = record.get("service_id") or ""
        first_day = _int_or_none(record.get("start_date"))
        last_day = _int_or_none(record.get("end_date"))
        if service_id and first_day is not None and last_day is not None:
            day_flags = {day: _bool_flag(record.get(day)) for day in weekdays}
            pending.append(
                GtfsCalendar(
                    dataset_id=dataset_id,
                    service_id=service_id,
                    start_date=first_day,
                    end_date=last_day,
                    **day_flags,
                )
            )
    if pending:
        session.bulk_save_objects(pending)
    return len(pending)
|
|
|
|
|
|
def _import_calendar_dates(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> int:
    """Persist calendar_dates.txt rows in slices of 5000 and return how many were saved.

    Rows lacking a ``service_id`` or a parseable ``date``/``exception_type``
    are skipped.
    """
    batch_size = 5000
    pending: list[GtfsCalendarDate] = []
    for record in rows:
        service_id = record.get("service_id") or ""
        service_date = _int_or_none(record.get("date"))
        exception_type = _int_or_none(record.get("exception_type"))
        if service_id and service_date is not None and exception_type is not None:
            pending.append(
                GtfsCalendarDate(
                    dataset_id=dataset_id,
                    service_id=service_id,
                    date=service_date,
                    exception_type=exception_type,
                )
            )
    total = len(pending)
    for offset in range(0, total, batch_size):
        session.bulk_save_objects(pending[offset : offset + batch_size])
    return total
|
|
|
|
|
|
def _import_stops(session: Session, dataset_id: int, rows: list[dict[str, str]]) -> dict[str, tuple[float, float, str]]:
    """Persist stops.txt rows and return ``stop_id -> (lon, lat, label)`` for located stops.

    Rows without ``stop_id`` are skipped. Every other row is saved, but only
    stops with both coordinates appear in the returned mapping; the label is
    the stop name, or the stop id when the name is empty.
    """
    located: dict[str, tuple[float, float, str]] = {}
    pending: list[GtfsStop] = []
    for record in rows:
        stop_id = record.get("stop_id", "")
        if stop_id:
            lat = _float_or_none(record.get("stop_lat"))
            lon = _float_or_none(record.get("stop_lon"))
            stop_name = record.get("stop_name") or None
            if not (lat is None or lon is None):
                located[stop_id] = (lon, lat, stop_name or stop_id)
            stop = GtfsStop(
                dataset_id=dataset_id,
                stop_id=stop_id,
                name=stop_name,
                lat=lat,
                lon=lon,
                parent_station=record.get("parent_station") or None,
            )
            pending.append(stop)
    if pending:
        session.bulk_save_objects(pending)
    return located
|
|
|
|
|
|
def _import_trips(
    session: Session, dataset_id: int, rows: list[dict[str, str]]
) -> tuple[dict[str, list[str]], dict[str, str], dict[str, str]]:
    """Persist trips.txt rows in slices of 5000 and index them by route.

    Rows missing ``route_id`` or ``trip_id`` are skipped.

    Returns a tuple ``(trips_by_route, first_shape_by_route, first_trip_by_route)``:
    all trip ids per route, the first non-empty ``shape_id`` seen per route,
    and the first trip id seen per route.
    """
    batch_size = 5000
    route_trips: dict[str, list[str]] = defaultdict(list)
    route_first_shape: dict[str, str] = {}
    route_first_trip: dict[str, str] = {}
    pending: list[GtfsTrip] = []
    for record in rows:
        route_id = record.get("route_id", "")
        trip_id = record.get("trip_id", "")
        if not (route_id and trip_id):
            continue
        route_trips[route_id].append(trip_id)
        if route_id not in route_first_trip:
            route_first_trip[route_id] = trip_id
        shape_id = record.get("shape_id") or ""
        if shape_id and route_id not in route_first_shape:
            route_first_shape[route_id] = shape_id
        pending.append(
            GtfsTrip(
                dataset_id=dataset_id,
                route_id=route_id,
                trip_id=trip_id,
                service_id=record.get("service_id") or None,
                shape_id=shape_id or None,
            )
        )
    for offset in range(0, len(pending), batch_size):
        session.bulk_save_objects(pending[offset : offset + batch_size])
    return dict(route_trips), route_first_shape, route_first_trip
|
|
|
|
|
|
def _read_shapes(zf: zipfile.ZipFile, names: dict[str, str]) -> dict[str, list[tuple[float, float]]]:
    """Read shapes.txt and return ``shape_id -> [(lon, lat), ...]`` ordered by point sequence.

    Rows without a ``shape_id`` or without parseable coordinates are ignored.
    A missing or unparseable ``shape_pt_sequence`` counts as 0.
    """
    by_shape: dict[str, list[tuple[int, float, float]]] = defaultdict(list)
    for row in _read_gtfs_csv(zf, names, "shapes.txt"):
        shape_id = row.get("shape_id", "")
        lat = _float_or_none(row.get("shape_pt_lat"))
        lon = _float_or_none(row.get("shape_pt_lon"))
        seq = _int_or_none(row.get("shape_pt_sequence"))
        if shape_id and lat is not None and lon is not None:
            by_shape[shape_id].append((seq if seq is not None else 0, lon, lat))
    # Sort on the sequence number only. sorted() is stable, so points sharing a
    # sequence (duplicates, or missing values defaulted to 0) keep their file
    # order instead of being reordered by longitude/latitude.
    return {
        shape_id: [(lon, lat) for _, lon, lat in sorted(points, key=lambda point: point[0])]
        for shape_id, points in by_shape.items()
    }
|
|
|
|
|
|
def _import_shapes(session: Session, dataset_id: int, shapes_by_id: dict[str, list[tuple[float, float]]]) -> int:
    """Persist shapes as GeoJSON line geometries, flushing every 1000 objects.

    Shapes with fewer than two points, or whose geometry cannot be serialised,
    are skipped. Returns the number of shapes saved.
    """
    flush_threshold = 1000
    batch: list[GtfsShape] = []
    saved = 0
    for shape_id, coords in shapes_by_id.items():
        if len(coords) >= 2:
            geometry_text, bbox = geometry_json_and_bbox(LineString(coords))
            if geometry_text is not None:
                shape = GtfsShape(
                    dataset_id=dataset_id,
                    shape_id=shape_id,
                    geometry_geojson=geometry_text,
                    min_lon=bbox[0],
                    min_lat=bbox[1],
                    max_lon=bbox[2],
                    max_lat=bbox[3],
                )
                batch.append(shape)
                saved += 1
                if len(batch) >= flush_threshold:
                    session.bulk_save_objects(batch)
                    batch.clear()
    # Save whatever is left after the last full batch.
    if batch:
        session.bulk_save_objects(batch)
    return saved
|
|
|
|
|
|
def _import_stop_times(
    session: Session,
    dataset_id: int,
    zf: zipfile.ZipFile,
    names: dict[str, str],
    first_trip_ids: set[str],
) -> tuple[dict[str, list[str]], int, int]:
    """Stream stop_times.txt into the database and collect stop sequences.

    Rows missing ``trip_id``, ``stop_id`` or a parseable ``stop_sequence`` are
    counted but otherwise ignored. Stop sequences are collected only for trips
    in ``first_trip_ids``. Rows are persisted in batches of 5000 until
    ``settings.gtfs_stop_times_import_limit`` is reached; a limit of zero or
    less means no limit.

    Returns ``(stopseq_by_trip, rows_seen, rows_imported)`` where
    ``stopseq_by_trip`` maps a trip id to its stop ids in sequence order.
    """
    flush_threshold = 5000
    sequences: dict[str, list[tuple[int, str]]] = defaultdict(list)
    batch: list[GtfsStopTime] = []
    seen = 0
    saved = 0
    limit = settings.gtfs_stop_times_import_limit
    for seen, record in enumerate(_read_gtfs_csv(zf, names, "stop_times.txt"), start=1):
        trip_id = record.get("trip_id", "")
        stop_id = record.get("stop_id", "")
        seq = _int_or_none(record.get("stop_sequence"))
        if not (trip_id and stop_id) or seq is None:
            continue
        if trip_id in first_trip_ids:
            sequences[trip_id].append((seq, stop_id))
        # Sequences keep being collected after the limit; only persistence stops.
        if limit <= 0 or saved < limit:
            arrival = record.get("arrival_time") or None
            departure = record.get("departure_time") or None
            batch.append(
                GtfsStopTime(
                    dataset_id=dataset_id,
                    trip_id=trip_id,
                    stop_id=stop_id,
                    stop_sequence=seq,
                    arrival_time=arrival,
                    departure_time=departure,
                    arrival_seconds=_time_seconds(arrival),
                    departure_seconds=_time_seconds(departure),
                )
            )
            saved += 1
            if len(batch) >= flush_threshold:
                session.bulk_save_objects(batch)
                batch.clear()
    if batch:
        session.bulk_save_objects(batch)
    ordered = {}
    for trip, pairs in sequences.items():
        ordered[trip] = [stop for _, stop in sorted(pairs)]
    return ordered, seen, saved
|
|
|
|
|
|
def _import_routes(
    session: Session,
    dataset_id: int,
    routes_raw: list[dict[str, str]],
    agency_names: dict[str, str],
    stops_by_id: dict[str, tuple[float, float, str]],
    trips_by_route: dict[str, list[str]],
    first_shape_by_route: dict[str, str],
    first_trip_by_route: dict[str, str],
    shapes_by_id: dict[str, list[tuple[float, float]]],
    stopseq_by_trip: dict[str, list[str]],
) -> int:
    """Persist the parsed routes.txt rows as ``GtfsRoute`` objects.

    Rows without a ``route_id`` are skipped. Every remaining row is enriched
    with its mode, inferred route scope, operator name, the geometry returned
    by ``_route_geometry`` (plus bounding box) and normalised matching keys.

    ``trips_by_route`` is part of the signature but is not read here.

    Returns the number of routes saved.
    """
    # GTFS allows routes.agency_id to be omitted when the feed defines exactly
    # one agency; in that case the route belongs to that agency.
    sole_agency_name = next(iter(agency_names.values())) if len(agency_names) == 1 else ""
    objects: list[GtfsRoute] = []
    for row in routes_raw:
        route_id = row.get("route_id", "")
        if not route_id:
            continue
        route_type = _int_or_none(row.get("route_type"))
        mode = _gtfs_mode(route_type)
        agency_id = row.get("agency_id") or None
        # Unknown agency ids fall back to the raw id so the operator is never lost.
        operator = agency_names.get(agency_id or "", agency_id or "")
        if agency_id is None and not operator:
            operator = sole_agency_name
        short_name = row.get("route_short_name") or None
        long_name = row.get("route_long_name") or None
        route_scope = infer_osm_route_scope(mode=mode, ref=short_name, name=long_name, network=operator)
        geometry = _route_geometry(route_id, first_shape_by_route, first_trip_by_route, shapes_by_id, stopseq_by_trip, stops_by_id)
        geometry_text, bbox = geometry_json_and_bbox(geometry) if geometry is not None else (None, (None, None, None, None))
        route_key = norm_ref(short_name) or norm_text(long_name) or norm_ref(route_id)
        objects.append(
            GtfsRoute(
                dataset_id=dataset_id,
                route_id=route_id,
                agency_id=agency_id,
                short_name=short_name,
                long_name=long_name,
                route_type=route_type,
                mode=mode,
                route_scope=route_scope,
                operator_name=operator or None,
                geometry_geojson=geometry_text,
                min_lon=bbox[0],
                min_lat=bbox[1],
                max_lon=bbox[2],
                max_lat=bbox[3],
                route_key=route_key,
                operator_key=norm_text(operator),
            )
        )
    if objects:
        session.bulk_save_objects(objects)
    return len(objects)
|
|
|
|
|
|
def _route_geometry(
    route_id: str,
    first_shape_by_route: dict[str, str],
    first_trip_by_route: dict[str, str],
    shapes_by_id: dict[str, list[tuple[float, float]]],
    stopseq_by_trip: dict[str, list[str]],
    stops_by_id: dict[str, tuple[float, float, str]],
) -> Optional[LineString]:
    """Build a line geometry for a route.

    The route's first shape is used when it has at least two points. Otherwise
    the line is drawn through the located stops of the route's first trip.
    Returns ``None`` when neither source yields two or more points.
    """
    shape_coords = shapes_by_id.get(first_shape_by_route.get(route_id) or "", [])
    if len(shape_coords) >= 2:
        return LineString(shape_coords)

    # Fallback: connect the stops of the first trip, skipping unknown stop ids.
    stop_ids = stopseq_by_trip.get(first_trip_by_route.get(route_id) or "", [])
    stop_coords = []
    for stop_id in stop_ids:
        if stop_id in stops_by_id:
            stop = stops_by_id[stop_id]
            stop_coords.append((stop[0], stop[1]))
    return LineString(stop_coords) if len(stop_coords) >= 2 else None
|
|
|
|
|
|
def _float_or_none(value: object) -> Optional[float]:
    """Parse ``value`` as a finite float, returning ``None`` when that is not possible.

    ``None``, blank strings, unparseable text and the non-finite spellings
    accepted by ``float()`` (``nan``, ``inf``, ``-Infinity`` ...) all yield
    ``None``, so such values never reach coordinates.
    """
    import math

    if value is None:
        return None
    try:
        text_value = str(value).strip()
        if not text_value:
            return None
        number = float(text_value)
    except ValueError:
        return None
    return number if math.isfinite(number) else None
|
|
|
|
|
|
def _int_or_none(value: object) -> Optional[int]:
    """Parse ``value`` as an integer, returning ``None`` when that is not possible.

    Plain integer text is parsed exactly. Decimal or exponent notation
    (``"3.7"``, ``"1e3"``) is truncated towards zero. ``None``, blank strings,
    unparseable text and non-finite values (``nan``, ``inf``) yield ``None``.
    """
    if value is None:
        return None
    try:
        text_value = str(value).strip()
        if not text_value:
            return None
        try:
            # Exact path: avoids the float round-trip that corrupts integers above 2**53.
            return int(text_value)
        except ValueError:
            # int(float("inf")) raises OverflowError, int(float("nan")) raises ValueError.
            return int(float(text_value))
    except (ValueError, OverflowError):
        return None
|
|
|
|
|
|
def _bool_flag(value: object) -> bool:
    """Return ``True`` when ``value`` spells an affirmative flag.

    Accepted spellings are ``1``, ``true`` and ``yes``. Surrounding whitespace
    is ignored and matching is case-insensitive. Everything else, including
    ``None`` and the empty string, is ``False``.
    """
    return str(value or "").strip().casefold() in {"1", "true", "yes"}
|
|
|
|
|
|
def _time_seconds(value: str | None) -> Optional[int]:
    """Convert a ``H:MM[:SS]`` time string to seconds since the start of the service day.

    Hours are unbounded above, so times past 24:00:00 are accepted. Seconds
    default to 0 when omitted. Returns ``None`` for empty input, a wrong
    number of fields, non-integer fields, negative values, or minutes/seconds
    above 59.
    """
    if not value:
        return None
    fields = value.strip().split(":")
    if len(fields) not in (2, 3):
        return None
    try:
        numbers = [int(field) for field in fields]
    except ValueError:
        return None
    if len(numbers) == 2:
        numbers.append(0)
    hours, minutes, seconds = numbers
    if hours >= 0 and 0 <= minutes <= 59 and 0 <= seconds <= 59:
        return hours * 3600 + minutes * 60 + seconds
    return None
|
|
|
|
|
|
def _gtfs_mode(route_type: Optional[int]) -> str:
    """Map a GTFS ``route_type`` code to a mode name.

    ``GTFS_MODE`` is consulted first, then the inclusive ranges in
    ``GTFS_EXTENDED_MODE_RANGES``. ``None`` and unmapped codes give
    ``"unknown"``.
    """
    if route_type is None:
        return "unknown"
    basic_mode = GTFS_MODE.get(route_type)
    if basic_mode is not None:
        return basic_mode
    return next(
        (mode for low, high, mode in GTFS_EXTENDED_MODE_RANGES if low <= route_type <= high),
        "unknown",
    )
|
|
|
|
|
|
def _dataset_importer_version(dataset: Dataset) -> str:
    """Return the importer version stored in ``dataset.metadata_json``.

    Returns ``""`` when the metadata is empty, is not valid JSON, is not a
    JSON object, or has no truthy ``"importer"`` entry.
    """
    try:
        metadata = json.loads(dataset.metadata_json or "{}")
    except json.JSONDecodeError:
        return ""
    # Valid JSON such as `null` or `[]` has no .get(); treat it as "no version".
    if not isinstance(metadata, dict):
        return ""
    return str(metadata.get("importer") or "")
|