Files
meubility-workbench/app/pipeline/osm_pbf.py
2026-07-01 23:29:51 +02:00

1582 lines
65 KiB
Python

from __future__ import annotations
import json
import shutil
import subprocess
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import osmium
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.config import settings
from app.db import SessionLocal
from app.db_lock import database_write_lock
from app.models import Dataset, OsmDiffState, Source
from app.osm_storage import OSM_STORAGE_MAIN, OSM_STORAGE_SIDECAR_FEATURES, effective_osm_feature_storage
from app.performance import measure_pipeline_phase
from app.pipeline.download import materialize_source
from app.pipeline.osm_geojson import import_osm_geojson, prepare_osm_geojson_storage
from app.pipeline.osm_replication import ReplicationState, apply_osm_changes, download_diff, fetch_replication_state
from app.pipeline.state import (
STAGE_ACQUIRE_RAW,
STAGE_BUILD_INDEXES,
STAGE_EXTRACT_GEOMETRY,
STAGE_FILTER_TRANSPORT,
dependency_hash,
finish_pipeline_run,
start_pipeline_run,
)
from app.pipeline.utils import sha256_file
# Transport mode names known to this module.
# NOTE(review): the consumer of this set is outside the visible part of the
# file — presumably the extractor uses it to recognise route relations.
ROUTE_MODES = {
    "train", "railway", "light_rail", "subway", "tram",
    "bus", "trolleybus", "coach",
    "ferry", "monorail", "funicular", "aerialway",
}

# Mapping from a tag value to a mode name. Only "rail" is renamed (to
# "train"); every other listed value maps to itself.
# NOTE(review): presumably keyed by the OSM ``railway=*`` value — confirm
# against the extractor.
RAILWAY_MODE_BY_TAG = {
    "rail": "train",
    **{tag: tag for tag in ("light_rail", "subway", "tram", "monorail", "funicular")},
}

# Version markers written into dataset metadata and pipeline-stage records.
# The extractor and filter versions are also compared when deciding whether
# a cached file can be reused, so changing one invalidates that cache.
EXTRACTOR_VERSION = "osmium_transport_geojson_v2_ordered_relation_members"
TRANSPORT_FILTER_VERSION = "osmium_transport_filter_v1"
RAW_ACQUIRE_VERSION = "osm_raw_acquire_v1"
OSM_SIDECAR_INDEX_VERSION = "osm_sidecar_indexes_v1"
@dataclass(frozen=True)
class _SourceRef:
    """Immutable plain-value copy of the ``Source`` columns the staged import uses.

    ``_load_source_ref`` fills it from a ``Source`` row so the values stay
    usable after the database session that loaded the row has been closed.
    """

    id: int
    name: str
    kind: str
    url: str
    # Optional descriptive columns; None when the row has no value.
    country: str | None = None
    license: str | None = None
    notes: str | None = None
@dataclass(frozen=True)
class _DatasetRef:
    """Immutable plain-value copy of a ``Dataset`` row.

    Produced by ``_dataset_ref``; lets the staged pipeline pass dataset
    identity and file information between steps without holding a session.
    """

    id: int
    source_id: int
    kind: str
    local_path: str
    sha256: str
    status: str
    # Decoded metadata of the row (a dict, not the stored JSON string).
    metadata: dict[str, Any]
@dataclass(frozen=True)
class _PreparedRawFile:
    """Result of preparing the raw OSM file on disk, before any dataset row is written."""

    path: Path
    sha256: str
    # Metadata to store on the raw dataset row.
    metadata: dict[str, Any]
    # Replication state to record alongside the raw dataset; None when no
    # state is available, in which case nothing is recorded.
    replication_state: ReplicationState | None = None
    # Extra metadata passed through when the replication state is recorded.
    diff_state_metadata: dict[str, Any] | None = None
@dataclass(frozen=True)
class _PreparedTransportFile:
    """Result of producing (or reusing) the transport-filtered OSM file on disk."""

    path: Path
    sha256: str
    # Metadata to store on the filtered dataset row.
    metadata: dict[str, Any]
    # True when an existing filtered file was reused instead of re-filtering.
    reused: bool
@dataclass(frozen=True)
class _PreparedExtract:
    """Result of producing (or reusing) the extracted transport GeoJSON on disk."""

    path: Path
    sha256: str
    # Summary returned by the extractor (or read back from the cached summary file).
    summary: dict[str, Any]
    # True when an existing GeoJSON file was reused instead of re-extracting.
    reused: bool
def run_osm_pbf_source(session: Session, source: Source, progress_callback=None) -> Dataset:
    """Import an OSM PBF source inside the caller's session.

    Acquires the raw dataset, optionally pre-filters it, returns an already
    existing derived dataset when one is found, and otherwise extracts the
    transport GeoJSON and imports it.

    Args:
        session: Open session; it is flushed here but never committed.
        source: Source row to import.
        progress_callback: Forwarded to the raw-dataset preparation step only.

    Returns:
        The derived dataset (existing or newly imported).
    """
    raw_dataset = _prepare_raw_osm_dataset(session, source, progress_callback=progress_callback)

    # The extractor reads the raw file, or the pre-filtered file when one is made.
    extract_input = raw_dataset
    pbf_path = Path(raw_dataset.local_path)
    if _should_prefilter(pbf_path):
        extract_input = _prepare_transport_pbf(session, source, raw_dataset, pbf_path)
        pbf_path = Path(extract_input.local_path)

    already_derived = _find_existing_derived(session, source, extract_input)
    if already_derived is not None:
        return already_derived

    extract_dir = settings.data_dir / "derived" / f"source_{source.id}" / f"extract_dataset_{extract_input.id}"
    geojson_path = extract_dir / "transport.geojson"
    extract_summary = extract_osm_transport_geojson(pbf_path, geojson_path)

    extract_input.status = "extracted"
    _update_dataset_metadata(extract_input, extractor=EXTRACTOR_VERSION, extract_summary=extract_summary)
    if extract_input.id != raw_dataset.id:
        # A separate filtered dataset exists: link the raw row to it.
        raw_dataset.status = "filtered"
        _update_dataset_metadata(raw_dataset, filtered_dataset_id=extract_input.id)
    session.flush()

    derived = import_osm_geojson(session=session, source=source, path=geojson_path)
    merged_metadata = json.loads(derived.metadata_json or "{}")
    lineage = {
        "stage": "derived_osm_transport_geojson",
        "derived_from_dataset_id": extract_input.id,
        "raw_dataset_id": raw_dataset.id,
        "filtered_dataset_id": extract_input.id if extract_input.id != raw_dataset.id else None,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract_summary,
    }
    merged_metadata.update(lineage)
    derived.metadata_json = json.dumps(merged_metadata, indent=2)
    session.flush()
    return derived
def run_osm_pbf_source_staged(source_id: int, progress_callback=None) -> Dataset:
    """Run large OSM PBF imports with only short DB write-lock sections.

    The expensive file work is deterministic and resumable from cached files:
    raw source materialization, optional osmium transport filtering, GeoJSON
    extraction, and sidecar creation all happen outside the global SQLite write
    lock. Dataset rows are reserved/activated in short transactions.

    Progress is reported against a fixed total of 7 steps.

    Args:
        source_id: Primary key of the ``Source`` row to import.
        progress_callback: Optional callback handed to ``_emit_progress``.

    Returns:
        The active derived dataset, re-loaded in a fresh session (either a
        reused one or the newly activated one).

    Raises:
        ValueError: If the source does not exist or is not of kind ``osm_pbf``.
    """
    source_ref = _load_source_ref(source_id)
    _mark_source_running(source_ref.id)
    _emit_progress(progress_callback, "osm_staged_import_started", f"Preparing staged OSM import for {source_ref.name}.", 0, 7, {"source_id": source_ref.id})
    # File work first (no lock), then a short transaction to reserve the row.
    prepared_raw = _prepare_raw_file_staged(source_ref, progress_callback=progress_callback)
    raw_dataset = _reserve_raw_dataset(source_ref, prepared_raw)
    _emit_progress(
        progress_callback,
        "osm_raw_dataset_reserved",
        f"Reserved raw OSM dataset #{raw_dataset.id}.",
        2,
        7,
        {"dataset_id": raw_dataset.id, "path": raw_dataset.local_path, "sha256": raw_dataset.sha256},
    )
    # By default the extractor reads the raw file; a filtered file replaces it below.
    input_dataset = raw_dataset
    input_path = Path(raw_dataset.local_path)
    filtered_dataset: _DatasetRef | None = None
    if _should_prefilter(input_path):
        prepared_transport = _prepare_transport_file_staged(source_ref, raw_dataset, input_path, progress_callback=progress_callback)
        filtered_dataset = _reserve_transport_dataset(source_ref, raw_dataset, prepared_transport)
        input_dataset = filtered_dataset
        input_path = Path(filtered_dataset.local_path)
        _emit_progress(
            progress_callback,
            "osm_transport_dataset_reserved",
            f"Reserved filtered OSM transport dataset #{filtered_dataset.id}.",
            3,
            7,
            {"dataset_id": filtered_dataset.id, "path": filtered_dataset.local_path, "sha256": filtered_dataset.sha256, "reused": prepared_transport.reused},
        )
    # Short-circuit: a derived dataset for this exact input already exists.
    existing = _existing_active_derived_ref(source_ref.id, input_dataset.id)
    if existing is not None:
        _activate_existing_derived(source_ref.id, existing.id)
        _emit_progress(progress_callback, "osm_staged_import_reused", f"Reused active OSM transport dataset #{existing.id}.", 7, 7, {"dataset_id": existing.id})
        return _load_dataset(existing.id)
    extract = _extract_transport_geojson_staged(source_ref, input_dataset, input_path, progress_callback=progress_callback)
    derived_dataset = _reserve_derived_dataset(
        source_ref=source_ref,
        raw_dataset=raw_dataset,
        input_dataset=input_dataset,
        filtered_dataset=filtered_dataset,
        extract=extract,
    )
    _emit_progress(
        progress_callback,
        "osm_derived_dataset_reserved",
        f"Reserved derived OSM dataset #{derived_dataset.id}.",
        5,
        7,
        {"dataset_id": derived_dataset.id, "path": derived_dataset.local_path, "sha256": derived_dataset.sha256, "extract_reused": extract.reused},
    )
    # Build feature storage for the reserved (still inactive) dataset, then
    # flip it to active in one final short transaction.
    sidecar_metadata = _prepare_derived_storage_staged(derived_dataset, extract, progress_callback=progress_callback)
    activated_id = _activate_staged_osm_import(
        source_ref=source_ref,
        raw_dataset=raw_dataset,
        filtered_dataset=filtered_dataset,
        input_dataset=input_dataset,
        derived_dataset=derived_dataset,
        extract=extract,
        sidecar_metadata=sidecar_metadata,
    )
    _emit_progress(progress_callback, "osm_staged_import_completed", f"Activated OSM dataset #{activated_id}.", 7, 7, {"dataset_id": activated_id})
    return _load_dataset(activated_id)
def _load_source_ref(source_id: int) -> _SourceRef:
    """Load a ``Source`` row and return its values as a session-independent ``_SourceRef``.

    Raises:
        ValueError: If no such source exists, or its kind is not ``osm_pbf``.
    """
    copied_columns = ("id", "name", "kind", "url", "country", "license", "notes")
    with SessionLocal() as session:
        row = session.get(Source, source_id)
        if row is None:
            raise ValueError(f"source not found: {source_id}")
        if row.kind != "osm_pbf":
            raise ValueError(f"staged OSM import requires source kind osm_pbf, got {row.kind}")
        # Copy plain values while the session is still open.
        return _SourceRef(**{column: getattr(row, column) for column in copied_columns})
def _load_dataset(dataset_id: int) -> Dataset:
    """Fetch a ``Dataset`` row by id in a fresh session.

    The session is closed on return, so the caller receives the row object
    after its session has ended.

    Raises:
        ValueError: If the row does not exist.
    """
    with SessionLocal() as session:
        row = session.get(Dataset, dataset_id)
        if row is not None:
            return row
        raise ValueError(f"dataset not found after staged import: {dataset_id}")
def _mark_source_running(source_id: int) -> None:
    """Flag the source as running, clear its last error and stamp the run time (UTC).

    Runs as one short transaction under the database write lock.

    Raises:
        ValueError: If the source row does not exist.
    """
    lock_name = f"osm_staged_import:{source_id}:start"
    with database_write_lock(lock_name, timeout=30), SessionLocal() as session:
        row = session.get(Source, source_id)
        if row is None:
            raise ValueError(f"source not found: {source_id}")
        row.status = "running"
        row.last_error = None
        row.last_run_at = datetime.now(timezone.utc)
        session.commit()
def _prepare_raw_file_staged(source: _SourceRef, progress_callback=None) -> _PreparedRawFile:
    """Produce the raw OSM file, preferring replication diffs over a full snapshot.

    When the diff path yields a result it is returned as-is. Otherwise the
    full snapshot is materialized and hashed, and the current replication
    state (if it can be fetched) is attached so later runs can start from it.
    """
    from_diffs = _try_prepare_raw_file_from_diffs_staged(source, progress_callback=progress_callback)
    if from_diffs is not None:
        return from_diffs

    _emit_progress(progress_callback, "osm_full_snapshot_started", f"Downloading/copying full OSM snapshot for {source.name}.", 1, 7, {"source_id": source.id})
    with measure_pipeline_phase("osm_full_snapshot", source_id=source.id, metadata={"url": source.url}) as metric:
        snapshot_path = materialize_source(source)  # type: ignore[arg-type]
        snapshot_hash = sha256_file(snapshot_path)
        snapshot_bytes = snapshot_path.stat().st_size if snapshot_path.exists() else None
        metric.update({"path": str(snapshot_path), "sha256": snapshot_hash, "bytes": snapshot_bytes})

    metadata: dict[str, Any] = {
        "stage": "raw_osm",
        "raw_format": _raw_format(snapshot_path),
        "source_url": source.url,
        "import_mode": "staged_short_lock",
    }
    state = _fetch_current_replication_state_for_snapshot(source, progress_callback=progress_callback)
    diff_state_metadata = None
    if state is not None:
        metadata["replication_state"] = {
            "updates_url": _source_updates_url(source),  # type: ignore[arg-type]
            "sequence_number": state.sequence_number,
            "timestamp": state.timestamp,
        }
        # Marks the recorded state as originating from a full download.
        diff_state_metadata = {"source": "full_snapshot"}

    _emit_progress(progress_callback, "osm_full_snapshot_completed", "Prepared raw OSM snapshot file.", 1, 7, {"path": str(snapshot_path), "sha256": snapshot_hash})
    return _PreparedRawFile(
        path=snapshot_path,
        sha256=snapshot_hash,
        metadata=metadata,
        replication_state=state,
        diff_state_metadata=diff_state_metadata,
    )
def _try_prepare_raw_file_from_diffs_staged(source: _SourceRef, progress_callback=None) -> _PreparedRawFile | None:
    """Try to bring the local raw OSM file up to date using replication diffs.

    Returns ``None`` whenever the diff route is not usable, which tells the
    caller to fall back to a full snapshot: no updates URL, no recorded local
    state, missing base file, unreachable remote state, sequence gap above
    ``settings.osm_diff_max_sequence_gap``, missing host tool, or any error
    while applying the diffs. Each fallback emits an ``osm_diff_fallback``
    progress event.

    Returns:
        The existing base file when it is already current, the updated file
        after diffs were applied, or ``None`` for "use a full snapshot".
    """
    updates_url = _source_updates_url(source) # type: ignore[arg-type]
    if not updates_url:
        return None
    with SessionLocal() as session:
        current_state = _latest_diff_state(session, source.id)
        if current_state is None or current_state.raw_dataset_id is None:
            _emit_progress(progress_callback, "osm_diff_fallback", "No local OSM replication state yet; using full snapshot.", None, None, {"updates_url": updates_url})
            return None
        base_dataset = session.get(Dataset, current_state.raw_dataset_id)
        if base_dataset is None or not Path(base_dataset.local_path).exists():
            _emit_progress(progress_callback, "osm_diff_fallback", "Local raw OSM base is missing; using full snapshot.", None, None, {"updates_url": updates_url})
            return None
        # Copy what is needed out of the ORM rows before the session closes.
        base_ref = _dataset_ref(base_dataset)
        local_sequence = current_state.sequence_number
    try:
        remote_state = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc: # noqa: BLE001 - correctness fallback
        _emit_progress(progress_callback, "osm_diff_fallback", f"Could not read OSM replication state; using full snapshot: {exc}", None, None, {"updates_url": updates_url})
        return None
    if remote_state.sequence_number <= local_sequence:
        _emit_progress(
            progress_callback,
            "osm_diff_up_to_date",
            "Local raw OSM extract is already at the latest known replication sequence.",
            remote_state.sequence_number,
            remote_state.sequence_number,
            {"updates_url": updates_url, "sequence_number": remote_state.sequence_number},
        )
        # NOTE(review): when the remote sequence is strictly lower than the
        # local one, the returned replication_state carries the lower remote
        # number — confirm the caller's state recording tolerates that.
        return _PreparedRawFile(
            path=Path(base_ref.local_path),
            sha256=base_ref.sha256,
            metadata=base_ref.metadata,
            replication_state=remote_state,
            diff_state_metadata={"source": "existing_raw_dataset", "raw_dataset_id": base_ref.id},
        )
    gap = remote_state.sequence_number - local_sequence
    if gap > settings.osm_diff_max_sequence_gap:
        _emit_progress(
            progress_callback,
            "osm_diff_fallback",
            "OSM replication gap is too large; using full snapshot.",
            local_sequence,
            remote_state.sequence_number,
            {"gap": gap, "max_gap": settings.osm_diff_max_sequence_gap, "updates_url": updates_url},
        )
        return None
    host_tool = _host_tool_path()
    if not host_tool.exists():
        _emit_progress(progress_callback, "osm_diff_fallback", "host_tool.sh is missing; using full snapshot.", None, None, {"host_tool": str(host_tool)})
        return None
    try:
        return _apply_diff_range_files_staged(
            source=source,
            base_dataset=base_ref,
            updates_url=updates_url,
            local_sequence=local_sequence,
            remote_state=remote_state,
            host_tool=host_tool,
            progress_callback=progress_callback,
        )
    except Exception as exc: # noqa: BLE001 - fall back to full snapshot rather than risk a bad base
        _emit_progress(progress_callback, "osm_diff_fallback", f"OSM diff application failed; using full snapshot: {exc}", None, None, {"updates_url": updates_url})
        return None
def _apply_diff_range_files_staged(
    *,
    source: _SourceRef,
    base_dataset: _DatasetRef,
    updates_url: str,
    local_sequence: int,
    remote_state: ReplicationState,
    host_tool: Path,
    progress_callback=None,
) -> _PreparedRawFile:
    """Download and apply replication diffs ``local_sequence + 1 .. remote`` in batches.

    Each batch is downloaded, applied to the current file into a temporary
    output, and stored; the stored file becomes the input of the next batch.
    Batch size comes from ``settings.osm_diff_apply_batch_size`` (at least 1).

    Args:
        source: Source whose raw file is being updated.
        base_dataset: Raw dataset whose file is the starting point.
        updates_url: Replication endpoint the diffs are downloaded from.
        local_sequence: Last sequence number already contained in the base.
        remote_state: Target replication state to reach.
        host_tool: Tool path handed to ``apply_osm_changes``.
        progress_callback: Optional callback handed to ``_emit_progress``.

    Returns:
        The final updated file with its hash, metadata and the target state.
        Exceptions from download/apply are not caught here.
    """
    update_root = settings.data_dir / "sources" / f"source_{source.id}" / "updates"
    work_root = settings.data_dir / "sources" / f"source_{source.id}" / "diff_work"
    work_root.mkdir(parents=True, exist_ok=True)
    current_path = Path(base_dataset.local_path)
    batch_size = max(1, int(settings.osm_diff_apply_batch_size))
    sequences = list(range(local_sequence + 1, remote_state.sequence_number + 1))
    applied_sequences: list[int] = []
    _emit_progress(
        progress_callback,
        "osm_diff_started",
        f"Applying {len(sequences)} OSM replication diffs.",
        local_sequence,
        remote_state.sequence_number,
        {"updates_url": updates_url, "from_sequence": local_sequence + 1, "to_sequence": remote_state.sequence_number},
    )
    with measure_pipeline_phase("osm_diff_apply", source_id=source.id, metadata={"from_sequence": local_sequence + 1, "to_sequence": remote_state.sequence_number}) as metric:
        for batch_start in range(0, len(sequences), batch_size):
            batch = sequences[batch_start : batch_start + batch_size]
            diff_paths = []
            for sequence in batch:
                diff_path = download_diff(updates_url, sequence, update_root)
                diff_paths.append(diff_path)
                _emit_progress(
                    progress_callback,
                    "osm_diff_downloaded",
                    f"Downloaded OSM diff sequence {sequence}.",
                    sequence,
                    remote_state.sequence_number,
                    {"path": str(diff_path), "sequence_number": sequence},
                )
            temp_output = work_root / f"source_{source.id}_{batch[0]}_{batch[-1]}.tmp.osm.pbf"
            completed = apply_osm_changes(current_path, diff_paths, temp_output, host_tool)
            # The stored result becomes the base for the next batch.
            current_path = _store_updated_raw_pbf(source, temp_output) # type: ignore[arg-type]
            applied_sequences.extend(batch)
            _emit_progress(
                progress_callback,
                "osm_diff_applied",
                f"Applied OSM diff sequences {batch[0]}-{batch[-1]}.",
                batch[-1],
                remote_state.sequence_number,
                {
                    "output_path": str(current_path),
                    "stdout": completed.stdout.strip(),
                    "stderr": completed.stderr.strip(),
                    "batch_start": batch[0],
                    "batch_end": batch[-1],
                },
            )
        # Hash only the final file, once all batches are applied.
        raw_hash = sha256_file(current_path)
        metric.update({"applied_sequences": applied_sequences, "path": str(current_path), "sha256": raw_hash, "bytes": current_path.stat().st_size if current_path.exists() else None})
    metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(current_path),
        "source_url": source.url,
        "import_mode": "staged_short_lock",
        "replication_state": {
            "updates_url": updates_url,
            "sequence_number": remote_state.sequence_number,
            "timestamp": remote_state.timestamp,
        },
        "diff_update": {
            "base_dataset_id": base_dataset.id,
            "base_sequence_number": local_sequence,
            "applied_sequences": applied_sequences,
        },
    }
    return _PreparedRawFile(
        path=current_path,
        sha256=raw_hash,
        metadata=metadata,
        replication_state=remote_state,
        diff_state_metadata={"base_dataset_id": base_dataset.id, "applied_sequences": applied_sequences},
    )
def _fetch_current_replication_state_for_snapshot(source: _SourceRef, progress_callback=None) -> ReplicationState | None:
    """Best-effort fetch of the remote replication state for a fresh full snapshot.

    Returns ``None`` when the source has no updates URL or the fetch fails;
    a failure is reported through ``progress_callback`` and never raised.
    """
    updates_url = _source_updates_url(source)  # type: ignore[arg-type]
    if updates_url:
        try:
            return fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
        except Exception as exc:  # noqa: BLE001 - full snapshot is still usable without diff state
            _emit_progress(progress_callback, "osm_diff_state_unavailable", f"Could not record OSM replication state: {exc}", None, None, {"updates_url": updates_url})
    return None
def _reserve_raw_dataset(source_ref: _SourceRef, prepared: _PreparedRawFile) -> _DatasetRef:
    """Create or refresh the raw dataset row for a prepared raw file.

    Runs as one short transaction under the database write lock. An existing
    row returned by ``_find_raw_dataset`` for the same hash is updated in
    place; otherwise a new inactive ``osm_pbf_raw`` row is added. Also
    records the replication state (when present) and the acquire-raw
    pipeline stage, and keeps the source marked as running.

    Raises:
        ValueError: If the source row no longer exists.
    """
    with database_write_lock(f"osm_staged_import:{source_ref.id}:reserve_raw", timeout=60):
        with SessionLocal() as session:
            source = session.get(Source, source_ref.id)
            if source is None:
                raise ValueError(f"source not found: {source_ref.id}")
            dataset = _find_raw_dataset(session, source, prepared.sha256)
            if dataset is None:
                dataset = Dataset(
                    source_id=source.id,
                    kind="osm_pbf_raw",
                    local_path=str(prepared.path),
                    sha256=prepared.sha256,
                    is_active=False,
                    status="committed",
                    metadata_json=json.dumps(prepared.metadata, indent=2),
                )
                session.add(dataset)
                # Flush so the new row is sent to the database before it is referenced below.
                session.flush()
            else:
                dataset.local_path = str(prepared.path)
                dataset.status = "committed"
                # Newly prepared metadata wins over keys already stored on the row.
                dataset.metadata_json = json.dumps({**_metadata(dataset), **prepared.metadata}, indent=2)
            if prepared.replication_state is not None:
                _record_diff_state(
                    session,
                    source=source,
                    raw_dataset=dataset,
                    updates_url=str(prepared.metadata.get("replication_state", {}).get("updates_url") or _source_updates_url(source) or ""),
                    state=prepared.replication_state,
                    metadata=prepared.diff_state_metadata,
                )
            _record_pipeline_stage(
                session,
                stage=STAGE_ACQUIRE_RAW,
                version=RAW_ACQUIRE_VERSION,
                source_id=source.id,
                dataset=dataset,
                inputs={
                    "source_url": source.url,
                    "source_kind": source.kind,
                    "remote": prepared.metadata.get("replication_state") or prepared.metadata.get("source_url"),
                },
                outputs={
                    "path": str(prepared.path),
                    "sha256": prepared.sha256,
                    "raw_format": prepared.metadata.get("raw_format"),
                    "diff_update": prepared.metadata.get("diff_update"),
                },
            )
            source.status = "running"
            source.last_error = None
            session.commit()
            # Snapshot the row into plain values while the session is still open.
            return _dataset_ref(dataset)
def _prepare_transport_file_staged(source: _SourceRef, raw_dataset: _DatasetRef, raw_path: Path, progress_callback=None) -> _PreparedTransportFile:
    """Produce the transport-filtered OSM file for a raw dataset, reusing a cached one if valid.

    A cached output is reused when the file exists and its ``.metadata.json``
    marker names the same input hash and filter version. Otherwise the filter
    script is run as a subprocess and a new marker is written.

    Raises:
        FileNotFoundError: If the filter script does not exist.
        RuntimeError: If the filter script exits with a non-zero status.
    """
    filtered_path = _transport_filter_path_for_raw_id(source.id, raw_dataset.id, raw_path)
    filtered_path.parent.mkdir(parents=True, exist_ok=True)
    marker_path = filtered_path.with_suffix(filtered_path.suffix + ".metadata.json")
    cached = _read_json_file(marker_path)

    cache_is_valid = (
        filtered_path.exists()
        and cached.get("input_sha256") == raw_dataset.sha256
        and cached.get("filter") == TRANSPORT_FILTER_VERSION
    )
    if cache_is_valid:
        cached_hash = sha256_file(filtered_path)
        _emit_progress(progress_callback, "osm_transport_filter_reused", "Reusing existing filtered OSM transport extract.", 3, 7, {"path": str(filtered_path), "sha256": cached_hash})
        return _PreparedTransportFile(path=filtered_path, sha256=cached_hash, metadata=cached, reused=True)

    script_path = _prefilter_script_path()
    if not script_path.exists():
        raise FileNotFoundError(f"OSM transport filter script not found: {script_path}")

    _emit_progress(progress_callback, "osm_transport_filter_started", "Filtering OSM PBF to public-transport objects.", 2, 7, {"input_path": str(raw_path), "output_path": str(filtered_path)})
    phase_metadata = {"input_path": str(raw_path), "output_path": str(filtered_path)}
    with measure_pipeline_phase("osm_transport_filter", source_id=source.id, dataset_id=raw_dataset.id, metadata=phase_metadata) as metric:
        try:
            completed = subprocess.run(
                [str(script_path), str(raw_path), str(filtered_path)],
                check=True,
                capture_output=True,
                text=True,
            )
        except subprocess.CalledProcessError as exc:
            # Prefer stderr, then stdout, then the bare exit code.
            details = (exc.stderr or "").strip() or (exc.stdout or "").strip() or f"exit code {exc.returncode}"
            raise RuntimeError(f"OSM transport filter failed for {raw_path}: {details}") from exc
        filtered_hash = sha256_file(filtered_path)
        filtered_bytes = filtered_path.stat().st_size if filtered_path.exists() else None
        metric.update({"sha256": filtered_hash, "bytes": filtered_bytes})

    metadata = {
        "stage": "filtered_osm_transport_pbf",
        "raw_format": _raw_format(filtered_path),
        "derived_from_dataset_id": raw_dataset.id,
        "source_url": source.url,
        "filter": TRANSPORT_FILTER_VERSION,
        "filter_script": str(script_path),
        "input_path": str(raw_path),
        "input_sha256": raw_dataset.sha256,
        "output_path": str(filtered_path),
        "stdout": completed.stdout.strip(),
        "stderr": completed.stderr.strip(),
        "import_mode": "staged_short_lock",
    }
    # The marker is written only after a successful run; it is what makes the output reusable.
    marker_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    _emit_progress(progress_callback, "osm_transport_filter_completed", "Filtered OSM transport extract.", 3, 7, {"path": str(filtered_path), "sha256": filtered_hash})
    return _PreparedTransportFile(path=filtered_path, sha256=filtered_hash, metadata=metadata, reused=False)
def _reserve_transport_dataset(source_ref: _SourceRef, raw_dataset: _DatasetRef, prepared: _PreparedTransportFile) -> _DatasetRef:
    """Create or refresh the filtered-transport dataset row for a raw dataset.

    Runs as one short transaction under the database write lock. Reuses the
    row found by ``_find_transport_dataset_by_raw_id`` when there is one,
    otherwise adds a new inactive ``osm_pbf_transport`` row. The raw row is
    marked ``filtered`` and linked to the filtered dataset, and the
    filter-transport pipeline stage is recorded.

    Raises:
        ValueError: If the source or raw dataset row no longer exists.
    """
    with database_write_lock(f"osm_staged_import:{source_ref.id}:reserve_transport", timeout=60):
        with SessionLocal() as session:
            source = session.get(Source, source_ref.id)
            raw = session.get(Dataset, raw_dataset.id)
            if source is None or raw is None:
                raise ValueError("source or raw dataset disappeared during staged import")
            dataset = _find_transport_dataset_by_raw_id(session, source.id, raw_dataset.id)
            if dataset is None:
                dataset = Dataset(
                    source_id=source.id,
                    kind="osm_pbf_transport",
                    local_path=str(prepared.path),
                    sha256=prepared.sha256,
                    is_active=False,
                    status="filtered",
                    metadata_json=json.dumps(prepared.metadata, indent=2),
                )
                session.add(dataset)
                # Flush before dataset.id is written into the raw row's metadata below.
                session.flush()
            else:
                # Existing row: overwrite path, hash, status and metadata wholesale.
                dataset.local_path = str(prepared.path)
                dataset.sha256 = prepared.sha256
                dataset.status = "filtered"
                dataset.metadata_json = json.dumps(prepared.metadata, indent=2)
            raw.status = "filtered"
            raw.metadata_json = json.dumps({**_metadata(raw), "filtered_dataset_id": dataset.id}, indent=2)
            _record_pipeline_stage(
                session,
                stage=STAGE_FILTER_TRANSPORT,
                version=TRANSPORT_FILTER_VERSION,
                source_id=source.id,
                dataset=dataset,
                inputs={
                    "raw_dataset_id": raw_dataset.id,
                    "raw_sha256": raw_dataset.sha256,
                    "filter_script": prepared.metadata.get("filter_script"),
                },
                outputs={"path": str(prepared.path), "sha256": prepared.sha256, "reused": prepared.reused},
            )
            session.commit()
            # Snapshot the row into plain values while the session is still open.
            return _dataset_ref(dataset)
def _extract_transport_geojson_staged(source: _SourceRef, input_dataset: _DatasetRef, input_path: Path, progress_callback=None) -> _PreparedExtract:
    """Produce the transport GeoJSON for an input dataset, reusing a cached extract if valid.

    A cached extract is reused when the GeoJSON file exists and its
    ``.summary.json`` companion names the same input hash and extractor
    version and carries an ``extract_summary`` object. Anything else,
    including a summary file that lacks ``extract_summary``, is treated as a
    cache miss and triggers a fresh extraction instead of raising.

    Args:
        source: Source the dataset belongs to.
        input_dataset: Dataset (raw or filtered) that is being extracted.
        input_path: File the extractor reads.
        progress_callback: Optional callback handed to ``_emit_progress``.

    Returns:
        Path, hash and summary of the GeoJSON, plus whether it was reused.
    """
    output_dir = settings.data_dir / "derived" / f"source_{source.id}" / f"extract_dataset_{input_dataset.id}"
    output_path = output_dir / "transport.geojson"
    summary_path = output_path.with_suffix(".summary.json")
    existing_summary = _read_json_file(summary_path)
    cached_extract_summary = existing_summary.get("extract_summary")
    cache_is_valid = (
        output_path.exists()
        and existing_summary.get("input_sha256") == input_dataset.sha256
        and existing_summary.get("extractor") == EXTRACTOR_VERSION
        # Guard against an incomplete summary file: without this the reuse
        # path would fail with KeyError instead of simply re-extracting.
        and isinstance(cached_extract_summary, dict)
    )
    if cache_is_valid:
        output_hash = sha256_file(output_path)
        _emit_progress(progress_callback, "osm_extract_reused", "Reusing existing extracted OSM transport GeoJSON.", 4, 7, {"path": str(output_path), "sha256": output_hash})
        return _PreparedExtract(path=output_path, sha256=output_hash, summary=cached_extract_summary, reused=True)

    _emit_progress(progress_callback, "osm_extract_started", "Extracting route, stop, and infrastructure geometry from OSM.", 4, 7, {"input_path": str(input_path), "output_path": str(output_path)})
    with measure_pipeline_phase("osm_transport_extract", source_id=source.id, dataset_id=input_dataset.id, metadata={"input_path": str(input_path), "output_path": str(output_path)}) as metric:
        extract_summary = extract_osm_transport_geojson(input_path, output_path)
        output_hash = sha256_file(output_path)
        metric.update({**extract_summary, "sha256": output_hash, "bytes": output_path.stat().st_size if output_path.exists() else None})
    summary = {
        "input_dataset_id": input_dataset.id,
        "input_sha256": input_dataset.sha256,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract_summary,
    }
    # Written only after a successful extraction; this file is what makes the output reusable.
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    _emit_progress(progress_callback, "osm_extract_completed", "Extracted OSM transport GeoJSON.", 4, 7, {"path": str(output_path), "sha256": output_hash, **extract_summary})
    return _PreparedExtract(path=output_path, sha256=output_hash, summary=extract_summary, reused=False)
def _existing_active_derived_ref(source_id: int, input_dataset_id: int) -> _DatasetRef | None:
    """Return a snapshot of the derived dataset already built from ``input_dataset_id``, if any.

    Returns ``None`` when the source is missing or no derived dataset is found.
    """
    with SessionLocal() as session:
        source = session.get(Source, source_id)
        if source is None:
            return None
        # Unsaved placeholder carrying only the id.
        # NOTE(review): presumably _find_existing_derived reads nothing but
        # .id from it — confirm against that helper.
        probe = Dataset(id=input_dataset_id)
        match = _find_existing_derived(session, source, probe)
        return None if match is None else _dataset_ref(match)
def _activate_existing_derived(source_id: int, derived_dataset_id: int) -> None:
    """Make the given derived dataset the only active dataset of its source.

    Runs as one short transaction under the database write lock and marks the
    source as ``ok``. Does nothing when either row is missing.
    """
    lock_name = f"osm_staged_import:{source_id}:reuse_existing"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_id)
        target = session.get(Dataset, derived_dataset_id)
        if source is None or target is None:
            return
        # Exactly one dataset of the source ends up active: the target.
        for candidate in source.datasets:
            candidate.is_active = candidate.id == target.id
        source.status = "ok"
        source.last_error = None
        source.last_run_at = datetime.now(timezone.utc)
        session.commit()
def _reserve_derived_dataset(
    *,
    source_ref: _SourceRef,
    raw_dataset: _DatasetRef,
    input_dataset: _DatasetRef,
    filtered_dataset: _DatasetRef | None,
    extract: _PreparedExtract,
) -> _DatasetRef:
    """Create or refresh the inactive derived dataset row for an extracted GeoJSON.

    Runs as one short transaction under the database write lock. A staged
    row found by ``_find_staged_derived_dataset`` is reused; otherwise a new
    ``osm_geojson`` row is added. Either way the row ends in status
    ``sidecar_staging`` with ``sidecar_status`` set to ``pending``, and the
    extract-geometry pipeline stage is recorded.

    Raises:
        ValueError: If the source row no longer exists.
    """
    metadata = {
        "stage": "derived_osm_transport_geojson",
        "derived_from_dataset_id": input_dataset.id,
        "raw_dataset_id": raw_dataset.id,
        "filtered_dataset_id": None if filtered_dataset is None else filtered_dataset.id,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract.summary,
        "import_mode": "staged_short_lock",
        "sidecar_status": "pending",
    }
    with database_write_lock(f"osm_staged_import:{source_ref.id}:reserve_derived", timeout=60):
        with SessionLocal() as session:
            source = session.get(Source, source_ref.id)
            if source is None:
                raise ValueError(f"source not found: {source_ref.id}")
            dataset = _find_staged_derived_dataset(session, source.id, input_dataset.id, extract.sha256)
            if dataset is None:
                dataset = Dataset(
                    source_id=source.id,
                    kind="osm_geojson",
                    local_path=str(extract.path),
                    sha256=extract.sha256,
                    is_active=False,
                    status="sidecar_staging",
                    metadata_json=json.dumps(metadata, indent=2),
                )
                session.add(dataset)
                # Flush so the new row is sent to the database before it is referenced below.
                session.flush()
            else:
                dataset.local_path = str(extract.path)
                dataset.sha256 = extract.sha256
                dataset.status = "sidecar_staging"
                # Keep keys already on the row; the fresh lineage values win on conflict.
                dataset.metadata_json = json.dumps({**_metadata(dataset), **metadata}, indent=2)
            _record_pipeline_stage(
                session,
                stage=STAGE_EXTRACT_GEOMETRY,
                version=EXTRACTOR_VERSION,
                source_id=source.id,
                dataset=dataset,
                inputs={
                    "input_dataset_id": input_dataset.id,
                    "input_sha256": input_dataset.sha256,
                    "extractor": EXTRACTOR_VERSION,
                },
                outputs={"path": str(extract.path), "sha256": extract.sha256, "summary": extract.summary, "reused": extract.reused},
            )
            session.commit()
            # Snapshot the row into plain values while the session is still open.
            return _dataset_ref(dataset)
def _prepare_derived_storage_staged(derived_dataset: _DatasetRef, extract: _PreparedExtract, progress_callback=None) -> dict[str, object]:
    """Build (or reuse) the feature storage for a reserved derived dataset.

    Existing storage is reused when the dataset metadata already describes
    ready main-table storage, or a ready sidecar whose file still exists; the
    dataset's metadata dict is then returned unchanged. Otherwise storage is
    built in the currently effective mode and the returned metadata merges
    the dataset metadata, the build result and the status flags.
    """
    known = derived_dataset.metadata
    storage = known.get("osm_storage")
    if isinstance(storage, dict):
        if storage.get("mode") == OSM_STORAGE_MAIN and known.get("storage_status") == "ready":
            _emit_progress(progress_callback, "osm_storage_reused", "Reusing existing OSM main-table storage.", 6, 7, {"dataset_id": derived_dataset.id})
            return known
        sidecar = storage.get("sidecar_path")
        if sidecar and Path(str(sidecar)).exists() and known.get("sidecar_status") == "ready":
            _emit_progress(progress_callback, "osm_sidecar_reused", "Reusing existing OSM feature sidecar.", 6, 7, {"dataset_id": derived_dataset.id, "sidecar_path": str(sidecar)})
            return known

    storage_mode = effective_osm_feature_storage()
    if storage_mode == OSM_STORAGE_MAIN:
        storage_label = "main-table OSM feature storage"
        started_event, completed_event = "osm_storage_started", "osm_storage_completed"
    else:
        storage_label = "OSM feature sidecar"
        started_event, completed_event = "osm_sidecar_started", "osm_sidecar_completed"
    _emit_progress(progress_callback, started_event, f"Building {storage_label}.", 5, 7, {"dataset_id": derived_dataset.id, "path": str(extract.path), "storage_mode": storage_mode})

    # Unsaved ORM object mirroring the reserved row; it is passed to the
    # storage builder but never added to the session here.
    transient_dataset = Dataset(
        id=derived_dataset.id,
        source_id=derived_dataset.source_id,
        kind=derived_dataset.kind,
        local_path=derived_dataset.local_path,
        sha256=derived_dataset.sha256,
        is_active=False,
        status=derived_dataset.status,
        metadata_json=json.dumps(known, indent=2),
    )
    with measure_pipeline_phase("osm_sidecar_build", source_id=derived_dataset.source_id, dataset_id=derived_dataset.id, metadata={"path": str(extract.path)}) as metric:
        with SessionLocal() as session:
            sidecar_metadata = prepare_osm_geojson_storage(
                session=session,
                dataset=transient_dataset,
                path=extract.path,
                source_hash=derived_dataset.sha256,
                storage_mode=storage_mode,
            )
            session.commit()
        metric.update(sidecar_metadata)

    sidecar_status = "ready" if storage_mode == OSM_STORAGE_SIDECAR_FEATURES else "not_used"
    metadata = {**known, **sidecar_metadata, "sidecar_status": sidecar_status, "storage_status": "ready"}
    _emit_progress(progress_callback, completed_event, f"Built {storage_label}.", 6, 7, {"dataset_id": derived_dataset.id, **sidecar_metadata})
    return metadata
def _activate_staged_osm_import(
    *,
    source_ref: _SourceRef,
    raw_dataset: _DatasetRef,
    filtered_dataset: _DatasetRef | None,
    input_dataset: _DatasetRef,
    derived_dataset: _DatasetRef,
    extract: _PreparedExtract,
    sidecar_metadata: dict[str, object],
) -> int:
    """Activate the derived dataset and finalize the staged import in one short transaction.

    Under the database write lock: deactivates every dataset of the source,
    updates status and metadata of the raw and (optional) filtered rows,
    marks the derived row as imported and active, records the build-indexes
    pipeline stage, and sets the source to ``ok``.

    Args:
        source_ref: Source being imported.
        raw_dataset: Raw dataset of this run.
        filtered_dataset: Filtered dataset of this run, or None if not pre-filtered.
        input_dataset: Dataset the extract was produced from.
        derived_dataset: Reserved derived dataset to activate.
        extract: Extract result whose path, hash and summary are stored.
        sidecar_metadata: Metadata returned by the storage preparation step.

    Returns:
        The id of the activated derived dataset.

    Raises:
        ValueError: If the source, raw or derived row no longer exists.
    """
    # Keep an explicit "not_used" from the storage step (main-table storage
    # builds no sidecar); any other value is stored as "ready", as before.
    sidecar_status = "not_used" if sidecar_metadata.get("sidecar_status") == "not_used" else "ready"
    metadata = {
        **sidecar_metadata,
        "stage": "derived_osm_transport_geojson",
        "derived_from_dataset_id": input_dataset.id,
        "raw_dataset_id": raw_dataset.id,
        "filtered_dataset_id": None if filtered_dataset is None else filtered_dataset.id,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract.summary,
        "import_mode": "staged_short_lock",
        "sidecar_status": sidecar_status,
    }
    with database_write_lock(f"osm_staged_import:{source_ref.id}:activate", timeout=60):
        with SessionLocal() as session:
            source = session.get(Source, source_ref.id)
            raw = session.get(Dataset, raw_dataset.id)
            filtered = session.get(Dataset, filtered_dataset.id) if filtered_dataset is not None else None
            derived = session.get(Dataset, derived_dataset.id)
            if source is None or raw is None or derived is None:
                raise ValueError("staged OSM activation lost source or dataset rows")
            # Deactivate everything first; only the derived row is re-activated below.
            for dataset in source.datasets:
                dataset.is_active = False
            raw.status = "filtered" if filtered is not None else "extracted"
            raw.is_active = False
            raw.metadata_json = json.dumps({**_metadata(raw), "extractor": EXTRACTOR_VERSION, "extract_summary": extract.summary}, indent=2)
            if filtered is not None:
                filtered.status = "extracted"
                filtered.is_active = False
                filtered.metadata_json = json.dumps({**_metadata(filtered), "extractor": EXTRACTOR_VERSION, "extract_summary": extract.summary}, indent=2)
            derived.status = "imported"
            derived.is_active = True
            derived.local_path = str(extract.path)
            derived.sha256 = extract.sha256
            derived.metadata_json = json.dumps(metadata, indent=2)
            storage_outputs = sidecar_metadata.get("osm_storage")
            _record_pipeline_stage(
                session,
                stage=STAGE_BUILD_INDEXES,
                version=OSM_SIDECAR_INDEX_VERSION,
                source_id=source.id,
                dataset=derived,
                inputs={
                    "dataset_id": derived.id,
                    "dataset_sha256": derived.sha256,
                    "sidecar_schema": "osm_features_v1",
                    "indexed_columns": ["kind", "mode", "route_scope", "bbox", "route_key", "ref", "identity"],
                },
                # Record only the storage description when it is present as a
                # dict; otherwise record the whole metadata.
                outputs=storage_outputs if isinstance(storage_outputs, dict) else sidecar_metadata,
            )
            source.status = "ok"
            source.last_error = None
            source.last_run_at = datetime.now(timezone.utc)
            session.commit()
            # Read the id while the session is still open.
            return derived.id
def _find_transport_dataset_by_raw_id(session: Session, source_id: int, raw_dataset_id: int) -> Dataset | None:
    """Return the newest filtered-transport dataset derived from a raw dataset.

    Candidates are the source's ``osm_pbf_transport`` rows, highest id first.
    A row matches when its metadata records ``raw_dataset_id`` under
    ``derived_from_dataset_id`` and its ``filter`` equals
    ``TRANSPORT_FILTER_VERSION``. Returns ``None`` when no row matches.
    """
    query = (
        select(Dataset)
        .where(Dataset.source_id == source_id, Dataset.kind == "osm_pbf_transport")
        .order_by(Dataset.id.desc())
    )

    def _matches(candidate: Dataset) -> bool:
        info = _metadata(candidate)
        return info.get("derived_from_dataset_id") == raw_dataset_id and info.get("filter") == TRANSPORT_FILTER_VERSION

    return next((candidate for candidate in session.scalars(query).all() if _matches(candidate)), None)
def _find_staged_derived_dataset(session: Session, source_id: int, input_dataset_id: int, extract_hash: str) -> Dataset | None:
    """Return the newest staged GeoJSON dataset matching an extract, if any.

    Only ``osm_geojson`` rows of the source whose status is
    ``sidecar_staging`` or ``importing`` and whose sha256 equals
    ``extract_hash`` are considered, highest id first. The row must also
    record ``input_dataset_id`` as its origin and the current
    ``EXTRACTOR_VERSION`` in its metadata.
    """
    staged_statuses = ["sidecar_staging", "importing"]
    query = (
        select(Dataset)
        .where(
            Dataset.source_id == source_id,
            Dataset.kind == "osm_geojson",
            Dataset.status.in_(staged_statuses),
            Dataset.sha256 == extract_hash,
        )
        .order_by(Dataset.id.desc())
    )
    for candidate in session.scalars(query).all():
        info = _metadata(candidate)
        if info.get("derived_from_dataset_id") != input_dataset_id:
            continue
        if info.get("extractor") == EXTRACTOR_VERSION:
            return candidate
    return None
def _dataset_ref(dataset: Dataset) -> _DatasetRef:
    """Copy the fields of a ``Dataset`` row into a plain ``_DatasetRef`` value.

    The ids are coerced to ``int`` and the metadata JSON is decoded through
    ``_metadata``.
    """
    fields: dict[str, Any] = {
        "id": int(dataset.id),
        "source_id": int(dataset.source_id),
        "kind": dataset.kind,
        "local_path": dataset.local_path,
        "sha256": dataset.sha256,
        "status": dataset.status,
        "metadata": _metadata(dataset),
    }
    return _DatasetRef(**fields)
def _transport_filter_path_for_raw_id(source_id: int, raw_dataset_id: int, raw_path: Path) -> Path:
    """Build the output path of the transport-filtered file for a raw dataset.

    The file lives under ``<data_dir>/derived/source_<id>/raw_dataset_<id>/``
    and keeps a ``.osm.pbf`` suffix for PBF input, ``.osm`` otherwise.
    """
    if _raw_format(raw_path) == "osm_pbf":
        extension = ".osm.pbf"
    else:
        extension = ".osm"
    derived_root = settings.data_dir / "derived"
    return derived_root / f"source_{source_id}" / f"raw_dataset_{raw_dataset_id}" / f"transport{extension}"
def _read_json_file(path: Path) -> dict[str, Any]:
if not path.exists():
return {}
try:
data = json.loads(path.read_text(encoding="utf-8"))
except (OSError, json.JSONDecodeError):
return {}
return data if isinstance(data, dict) else {}
def _prepare_raw_osm_dataset(session: Session, source: Source, progress_callback=None) -> Dataset:
    """Return the raw OSM dataset to import for ``source``.

    Replication diffs are tried first. When that path yields nothing, the
    full snapshot is materialized, hashed, matched against an existing raw
    dataset (or committed as a new one), and the current replication state
    is recorded for it. Progress events are emitted around the snapshot path.
    """
    from_diffs = _try_prepare_raw_from_diffs(session, source, progress_callback=progress_callback)
    if from_diffs is not None:
        return from_diffs

    _emit_progress(
        progress_callback,
        "osm_full_snapshot_started",
        f"Downloading/copying full OSM snapshot for {source.name}.",
        None,
        None,
        {"source_id": source.id},
    )
    snapshot_path = materialize_source(source)
    snapshot_hash = sha256_file(snapshot_path)
    snapshot_dataset = _find_raw_dataset(session, source, snapshot_hash)
    if not snapshot_dataset:
        snapshot_dataset = _commit_raw_dataset(session, source, snapshot_path, snapshot_hash)
    _record_current_replication_state_for_snapshot(session, source, snapshot_dataset, progress_callback=progress_callback)
    _emit_progress(
        progress_callback,
        "osm_full_snapshot_completed",
        f"Prepared raw OSM dataset #{snapshot_dataset.id}.",
        None,
        None,
        {"dataset_id": snapshot_dataset.id},
    )
    return snapshot_dataset
def extract_osm_transport_geojson(input_path: Path, output_path: Path) -> dict[str, Any]:
    """Extract transport features from an OSM file into a GeoJSON file.

    The input is read twice: a scan pass collects route relations and their
    member way ids, then a geometry pass (with node locations) builds the
    features. The FeatureCollection is written to ``output_path`` as UTF-8
    JSON, creating parent directories as needed.

    Returns a summary dict with the paths and feature counts. Features that
    are neither routes (``type == "route"``) nor infrastructure
    (``kind == "infra"``) are reported as stop/station features.
    """
    scanner = _TransportScanHandler()
    scanner.apply_file(str(input_path))

    builder = _TransportGeometryHandler(scanner.route_relations, scanner.route_way_ids)
    builder.apply_file(str(input_path), locations=True)
    features = builder.features()

    output_path.parent.mkdir(parents=True, exist_ok=True)
    collection = {"type": "FeatureCollection", "features": features}
    output_path.write_text(json.dumps(collection), encoding="utf-8")

    route_count = 0
    infra_count = 0
    for feature in features:
        properties = feature["properties"]
        if properties.get("type") == "route":
            route_count += 1
        if properties.get("kind") == "infra":
            infra_count += 1
    total = len(features)

    return {
        "input_path": str(input_path),
        "output_path": str(output_path),
        "route_relations_seen": len(scanner.route_relations),
        "route_relation_member_ways": len(scanner.route_way_ids),
        "features": total,
        "route_features": route_count,
        "infrastructure_features": infra_count,
        "stop_station_features": total - route_count - infra_count,
        "route_relations_without_geometry": builder.route_relations_without_geometry,
    }
def _commit_raw_dataset(session: Session, source: Source, path: Path, source_hash: str) -> Dataset:
    """Add a new ``osm_pbf_raw`` dataset row for ``path`` and flush it.

    Every dataset currently attached to ``source`` is marked inactive first.
    The new row is itself created inactive with status ``committed``.
    """
    for existing in source.datasets:
        existing.is_active = False

    raw_metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(path),
        "source_url": source.url,
    }
    new_dataset = Dataset(
        source_id=source.id,
        kind="osm_pbf_raw",
        local_path=str(path),
        sha256=source_hash,
        is_active=False,
        status="committed",
        metadata_json=json.dumps(raw_metadata, indent=2),
    )
    session.add(new_dataset)
    session.flush()
    return new_dataset
def _try_prepare_raw_from_diffs(session: Session, source: Source, progress_callback=None) -> Dataset | None:
    """Try to bring the local raw OSM file up to date using replication diffs.

    Returns the raw dataset to use (the existing one when already current,
    or the result of applying the missing diffs). Returns ``None`` whenever
    the caller should fall back to a full snapshot: no updates URL, no local
    replication state, missing base file, unreadable remote state, a
    sequence gap above ``settings.osm_diff_max_sequence_gap``, a missing
    ``host_tool.sh``, or any error while applying the diffs.
    """
    updates_url = _source_updates_url(source)
    if not updates_url:
        return None

    def report_fallback(message: str, current=None, total=None, details: dict[str, Any] | None = None) -> None:
        # All fallback reasons share one event type; only text and payload differ.
        _emit_progress(progress_callback, "osm_diff_fallback", message, current, total, details)

    local_state = _latest_diff_state(session, source.id)
    if local_state is None or local_state.raw_dataset_id is None:
        report_fallback("No local OSM replication state yet; using full snapshot.", details={"updates_url": updates_url})
        return None

    base_dataset = session.get(Dataset, local_state.raw_dataset_id)
    if base_dataset is None or not Path(base_dataset.local_path).exists():
        report_fallback("Local raw OSM base is missing; using full snapshot.", details={"updates_url": updates_url})
        return None

    try:
        remote_state = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - correctness fallback
        report_fallback(f"Could not read OSM replication state; using full snapshot: {exc}", details={"updates_url": updates_url})
        return None

    local_sequence = local_state.sequence_number
    remote_sequence = remote_state.sequence_number

    if remote_sequence <= local_sequence:
        _emit_progress(
            progress_callback,
            "osm_diff_up_to_date",
            "Local raw OSM extract is already at the latest known replication sequence.",
            remote_sequence,
            remote_sequence,
            {"updates_url": updates_url, "sequence_number": remote_sequence},
        )
        return base_dataset

    gap = remote_sequence - local_sequence
    max_gap = settings.osm_diff_max_sequence_gap
    if gap > max_gap:
        report_fallback(
            "OSM replication gap is too large; using full snapshot.",
            local_sequence,
            remote_sequence,
            {"gap": gap, "max_gap": max_gap, "updates_url": updates_url},
        )
        return None

    host_tool = _host_tool_path()
    if not host_tool.exists():
        report_fallback("host_tool.sh is missing; using full snapshot.", details={"host_tool": str(host_tool)})
        return None

    try:
        return _apply_diff_range(
            session=session,
            source=source,
            base_dataset=base_dataset,
            updates_url=updates_url,
            local_sequence=local_sequence,
            remote_state=remote_state,
            host_tool=host_tool,
            progress_callback=progress_callback,
        )
    except Exception as exc:  # noqa: BLE001 - fall back to full snapshot rather than risk a bad base
        report_fallback(f"OSM diff application failed; using full snapshot: {exc}", details={"updates_url": updates_url})
        return None
def _apply_diff_range(
    session: Session,
    source: Source,
    base_dataset: Dataset,
    updates_url: str,
    local_sequence: int,
    remote_state: ReplicationState,
    host_tool: Path,
    progress_callback=None,
) -> Dataset:
    """Apply replication diffs ``local_sequence + 1 .. remote_state.sequence_number``.

    Diffs are downloaded and applied in batches of
    ``settings.osm_diff_apply_batch_size`` (at least 1). Each batch is
    applied on top of the output of the previous one, starting from
    ``base_dataset.local_path``. The final file is hashed and matched to an
    existing raw dataset or committed as a new one; its metadata and a new
    ``OsmDiffState`` row record the reached sequence.

    Returns the raw dataset representing the updated file. Exceptions from
    downloading or applying diffs are not caught here.
    """
    update_root = settings.data_dir / "sources" / f"source_{source.id}" / "updates"
    work_root = settings.data_dir / "sources" / f"source_{source.id}" / "diff_work"
    work_root.mkdir(parents=True, exist_ok=True)
    # Path of the file the next batch is applied to; starts at the base dataset.
    current_path = Path(base_dataset.local_path)
    batch_size = max(1, int(settings.osm_diff_apply_batch_size))
    sequences = list(range(local_sequence + 1, remote_state.sequence_number + 1))
    applied_sequences: list[int] = []
    _emit_progress(
        progress_callback,
        "osm_diff_started",
        f"Applying {len(sequences)} OSM replication diffs.",
        local_sequence,
        remote_state.sequence_number,
        {"updates_url": updates_url, "from_sequence": local_sequence + 1, "to_sequence": remote_state.sequence_number},
    )
    for batch_start in range(0, len(sequences), batch_size):
        batch = sequences[batch_start : batch_start + batch_size]
        diff_paths = []
        # Download every diff of the batch before applying any of them.
        for sequence in batch:
            diff_path = download_diff(updates_url, sequence, update_root)
            diff_paths.append(diff_path)
            _emit_progress(
                progress_callback,
                "osm_diff_downloaded",
                f"Downloaded OSM diff sequence {sequence}.",
                sequence,
                remote_state.sequence_number,
                {"path": str(diff_path), "sequence_number": sequence},
            )
        temp_output = work_root / f"source_{source.id}_{batch[0]}_{batch[-1]}.tmp.osm.pbf"
        completed = apply_osm_changes(current_path, diff_paths, temp_output, host_tool)
        # The stored result becomes the input of the next batch.
        current_path = _store_updated_raw_pbf(source, temp_output)
        applied_sequences.extend(batch)
        _emit_progress(
            progress_callback,
            "osm_diff_applied",
            f"Applied OSM diff sequences {batch[0]}-{batch[-1]}.",
            batch[-1],
            remote_state.sequence_number,
            {
                "output_path": str(current_path),
                "stdout": completed.stdout.strip(),
                "stderr": completed.stderr.strip(),
                "batch_start": batch[0],
                "batch_end": batch[-1],
            },
        )
    raw_hash = sha256_file(current_path)
    # Reuse a raw dataset with the same content hash when one already exists.
    dataset = _find_raw_dataset(session, source, raw_hash) or _commit_raw_dataset(session, source, current_path, raw_hash)
    _update_dataset_metadata(
        dataset,
        replication_state={
            "updates_url": updates_url,
            "sequence_number": remote_state.sequence_number,
            "timestamp": remote_state.timestamp,
        },
        diff_update={
            "base_dataset_id": base_dataset.id,
            "base_sequence_number": local_sequence,
            "applied_sequences": applied_sequences,
        },
    )
    _record_diff_state(
        session,
        source=source,
        raw_dataset=dataset,
        updates_url=updates_url,
        state=remote_state,
        metadata={"base_dataset_id": base_dataset.id, "applied_sequences": applied_sequences},
    )
    return dataset
def _record_current_replication_state_for_snapshot(session: Session, source: Source, raw_dataset: Dataset, progress_callback=None) -> None:
    """Record the current remote replication state against a full snapshot.

    Does nothing when the source has no updates URL. When the remote state
    cannot be fetched, a progress event is emitted and nothing is recorded.
    Otherwise the state is stored in the dataset metadata and in a new
    ``OsmDiffState`` row.
    """
    updates_url = _source_updates_url(source)
    if not updates_url:
        return

    try:
        remote = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - full snapshot is still usable without diff state
        _emit_progress(
            progress_callback,
            "osm_diff_state_unavailable",
            f"Could not record OSM replication state: {exc}",
            None,
            None,
            {"updates_url": updates_url},
        )
        return

    replication_state = {
        "updates_url": updates_url,
        "sequence_number": remote.sequence_number,
        "timestamp": remote.timestamp,
    }
    _update_dataset_metadata(raw_dataset, replication_state=replication_state)
    _record_diff_state(
        session,
        source=source,
        raw_dataset=raw_dataset,
        updates_url=updates_url,
        state=remote,
        metadata={"source": "full_snapshot"},
    )
def _record_diff_state(
    session: Session,
    source: Source,
    raw_dataset: Dataset,
    updates_url: str,
    state: ReplicationState,
    metadata: dict[str, Any] | None = None,
) -> OsmDiffState:
    """Insert a new active ``OsmDiffState`` row for ``source`` and flush it.

    All rows of the source that are currently ``active`` are marked
    ``superseded`` first. The stored metadata JSON holds ``state.raw`` under
    ``state`` plus any extra ``metadata`` entries.
    """
    active_rows = select(OsmDiffState).where(OsmDiffState.source_id == source.id, OsmDiffState.status == "active")
    for previous in session.scalars(active_rows).all():
        previous.status = "superseded"

    payload = {"state": state.raw, **(metadata or {})}
    new_row = OsmDiffState(
        source_id=source.id,
        raw_dataset_id=raw_dataset.id,
        updates_url=updates_url,
        sequence_number=state.sequence_number,
        timestamp=state.timestamp,
        status="active",
        metadata_json=json.dumps(payload, separators=(",", ":")),
    )
    session.add(new_row)
    session.flush()
    return new_row
def _latest_diff_state(session: Session, source_id: int) -> OsmDiffState | None:
    """Return the active diff state of a source with the highest sequence number.

    Ties on the sequence number are broken by the highest row id. Returns
    ``None`` when the source has no active state.
    """
    newest_first = (
        select(OsmDiffState)
        .where(OsmDiffState.source_id == source_id, OsmDiffState.status == "active")
        .order_by(OsmDiffState.sequence_number.desc(), OsmDiffState.id.desc())
    )
    return session.scalar(newest_first)
def _store_updated_raw_pbf(source: Source, temp_path: Path) -> Path:
    """Move a freshly produced PBF into the source directory under a hash-based name.

    The target is ``<data_dir>/sources/source_<id>/<first 16 hex of sha256>.osm.pbf``.
    When a file with identical content already sits there, the temporary
    file is deleted instead of moved. Returns the target path.
    """
    storage_dir = settings.data_dir / "sources" / f"source_{source.id}"
    storage_dir.mkdir(parents=True, exist_ok=True)

    digest = sha256_file(temp_path)
    destination = storage_dir / f"{digest[:16]}.osm.pbf"

    already_stored = destination.exists() and sha256_file(destination) == digest
    if not already_stored:
        shutil.move(str(temp_path), str(destination))
    else:
        temp_path.unlink(missing_ok=True)
    return destination
def _source_updates_url(source: Source) -> str | None:
notes = source.notes or ""
for part in notes.split(";"):
if "=" not in part:
continue
key, value = part.strip().split("=", 1)
if key.strip() == "updates_url" and value.strip():
return value.strip()
if source.kind == "osm_diff" and source.url:
return source.url
return None
def _host_tool_path() -> Path:
return Path(__file__).resolve().parents[2] / "scripts" / "host_tool.sh"
def _find_raw_dataset(session: Session, source: Source, raw_hash: str) -> Dataset | None:
    """Return the newest ``osm_pbf_raw`` dataset of ``source`` with the given sha256, or ``None``."""
    newest_first = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "osm_pbf_raw",
            Dataset.sha256 == raw_hash,
        )
        .order_by(Dataset.id.desc())
    )
    return session.scalar(newest_first)
def _prepare_transport_pbf(session: Session, source: Source, raw_dataset: Dataset, raw_path: Path) -> Dataset:
    """Produce (or reuse) the transport-filtered file for a raw dataset.

    An existing filtered dataset is returned as-is when its file is still on
    disk. Otherwise the configured filter script is run as
    ``<script> <raw_path> <output_path>``, the output is hashed, and a
    dataset row is created or refreshed. The raw dataset is marked
    ``filtered`` and the session is flushed.

    Raises:
        FileNotFoundError: the filter script does not exist.
        RuntimeError: the filter script exited with a non-zero status.
    """
    cached = _find_transport_dataset(session, source, raw_dataset)
    if cached is not None and Path(cached.local_path).exists():
        return cached

    filtered_path = _transport_filter_path(source, raw_dataset, raw_path)
    filtered_path.parent.mkdir(parents=True, exist_ok=True)

    filter_script = _prefilter_script_path()
    if not filter_script.exists():
        raise FileNotFoundError(f"OSM transport filter script not found: {filter_script}")

    try:
        result = subprocess.run(
            [str(filter_script), str(raw_path), str(filtered_path)],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        # Prefer stderr, then stdout, then the bare exit code as the failure detail.
        captured = [(exc.stderr or "").strip(), (exc.stdout or "").strip()]
        details = next((text for text in captured if text), f"exit code {exc.returncode}")
        raise RuntimeError(f"OSM transport filter failed for {raw_path}: {details}") from exc

    filtered_hash = sha256_file(filtered_path)
    metadata_text = json.dumps(
        {
            "stage": "filtered_osm_transport_pbf",
            "raw_format": _raw_format(filtered_path),
            "derived_from_dataset_id": raw_dataset.id,
            "source_url": source.url,
            "filter": TRANSPORT_FILTER_VERSION,
            "filter_script": str(filter_script),
            "input_path": str(raw_path),
            "input_sha256": raw_dataset.sha256,
            "output_path": str(filtered_path),
            "stdout": result.stdout.strip(),
            "stderr": result.stderr.strip(),
        },
        indent=2,
    )

    if cached is not None:
        # The row exists but its file was gone: refresh it in place.
        filtered_dataset = cached
        filtered_dataset.local_path = str(filtered_path)
        filtered_dataset.sha256 = filtered_hash
        filtered_dataset.status = "filtered"
        filtered_dataset.metadata_json = metadata_text
    else:
        filtered_dataset = Dataset(
            source_id=source.id,
            kind="osm_pbf_transport",
            local_path=str(filtered_path),
            sha256=filtered_hash,
            is_active=False,
            status="filtered",
            metadata_json=metadata_text,
        )
        session.add(filtered_dataset)

    raw_dataset.status = "filtered"
    session.flush()
    return filtered_dataset
def _find_transport_dataset(session: Session, source: Source, raw_dataset: Dataset) -> Dataset | None:
    """Return the newest filtered-transport dataset derived from ``raw_dataset``.

    Thin ORM-object wrapper around ``_find_transport_dataset_by_raw_id``, so
    the matching rule (same raw dataset id, current
    ``TRANSPORT_FILTER_VERSION``) is defined in one place only.
    """
    return _find_transport_dataset_by_raw_id(session, source.id, raw_dataset.id)
def _find_existing_derived(session: Session, source: Source, input_dataset: Dataset) -> Dataset | None:
    """Return the active, imported GeoJSON dataset already derived from ``input_dataset``.

    Candidates are the source's active ``osm_geojson`` rows with status
    ``imported``, highest id first. A row matches when its metadata names
    ``input_dataset`` as origin and the current ``EXTRACTOR_VERSION``.
    """
    query = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "osm_geojson",
            Dataset.status == "imported",
            Dataset.is_active.is_(True),
        )
        .order_by(Dataset.id.desc())
    )
    for candidate in session.scalars(query).all():
        info = _metadata(candidate)
        if info.get("derived_from_dataset_id") != input_dataset.id:
            continue
        if info.get("extractor") == EXTRACTOR_VERSION:
            return candidate
    return None
def _metadata(dataset: Dataset) -> dict[str, Any]:
try:
return json.loads(dataset.metadata_json or "{}")
except json.JSONDecodeError:
return {}
def _update_dataset_metadata(dataset: Dataset, **values: Any) -> None:
    """Merge ``values`` into the dataset's metadata JSON, overwriting existing keys."""
    merged = {**_metadata(dataset), **values}
    dataset.metadata_json = json.dumps(merged, indent=2)
def _emit_progress(progress_callback, event_type: str, message: str, progress_current=None, progress_total=None, metadata: dict[str, Any] | None = None) -> None:
if progress_callback is not None:
progress_callback(event_type, message, progress_current, progress_total, metadata)
def _should_prefilter(path: Path) -> bool:
    """Tell whether ``path`` should go through the transport prefilter.

    True only when prefiltering is enabled in settings and the file's raw
    format is one of the configured prefilter formats.
    """
    if settings.osm_pbf_prefilter_enabled:
        return _raw_format(path) in _prefilter_formats()
    return False
def _prefilter_formats() -> set[str]:
    """Parse the comma-separated ``osm_pbf_prefilter_formats`` setting into a set.

    Entries are stripped of surrounding whitespace; empty entries are dropped.
    """
    raw_setting = str(settings.osm_pbf_prefilter_formats or "")
    stripped = (item.strip() for item in raw_setting.split(","))
    return {item for item in stripped if item}
def _prefilter_script_path() -> Path:
    """Return the configured prefilter script path, resolving relative paths against the CWD."""
    script = settings.osm_pbf_prefilter_script
    return script if script.is_absolute() else Path.cwd() / script
def _transport_filter_path(source: Source, raw_dataset: Dataset, raw_path: Path) -> Path:
    """Build the output path of the transport-filtered file for ``raw_dataset``.

    Thin ORM-object wrapper around ``_transport_filter_path_for_raw_id``, so
    the directory layout and suffix rule are defined in one place only.
    """
    return _transport_filter_path_for_raw_id(source.id, raw_dataset.id, raw_path)
class _TransportScanHandler(osmium.SimpleHandler):
    """First-pass handler that collects transport route relations.

    After a pass over the file, ``route_relations`` maps relation id to its
    tags and ordered member way ids, and ``route_way_ids`` holds every way id
    referenced by any collected relation.
    """

    def __init__(self) -> None:
        super().__init__()
        self.route_relations: dict[int, dict[str, Any]] = {}
        self.route_way_ids: set[int] = set()

    def relation(self, relation: osmium.osm.Relation) -> None:
        """Record ``relation`` when it is a supported route with at least one way member."""
        tags = _tags_dict(relation.tags)
        is_supported_route = tags.get("type") == "route" and _route_mode(tags) is not None
        if not is_supported_route:
            return

        # Member order is kept; it drives geometry ordering in the second pass.
        member_ways = [member.ref for member in relation.members if member.type == "w"]
        if member_ways:
            self.route_relations[relation.id] = {"tags": tags, "way_refs": member_ways}
            self.route_way_ids.update(member_ways)
class _TransportGeometryHandler(osmium.SimpleHandler):
    """Second-pass handler that builds GeoJSON features for transport data.

    It collects point features for stop/station nodes, line features for
    transport infrastructure ways, line/polygon features for stop/station
    ways, and the coordinates of ways that belong to route relations.
    ``features()`` then assembles one feature per route relation.
    """

    def __init__(self, route_relations: dict[int, dict[str, Any]], route_way_ids: set[int]) -> None:
        super().__init__()
        self.route_relations = route_relations
        self.route_way_ids = route_way_ids
        self.route_way_lines: dict[int, list[list[float]]] = {}
        self.infrastructure_features: list[dict[str, Any]] = []
        self.stop_features: list[dict[str, Any]] = []
        self.route_relations_without_geometry = 0

    def node(self, node: osmium.osm.Node) -> None:
        """Collect a Point feature for a stop/station node with a valid location."""
        tags = _tags_dict(node.tags)
        if not _is_stop_or_station(tags):
            return
        coords = _node_coords(node)
        if coords is None:
            return
        props = {
            **tags,
            "osm_type": "node",
            "osm_id": str(node.id),
        }
        self.stop_features.append({"type": "Feature", "geometry": {"type": "Point", "coordinates": coords}, "properties": props})

    def way(self, way: osmium.osm.Way) -> None:
        """Collect route-member coordinates, infrastructure and stop features for a way."""
        tags = _tags_dict(way.tags)
        is_route_member = way.id in self.route_way_ids
        infra_mode = _infrastructure_mode(tags)
        is_stop = _is_stop_or_station(tags)
        # Skip building coordinates for ways nothing below would use.
        if not is_route_member and infra_mode is None and not is_stop:
            return
        coords = _way_coords(way)
        if coords is None:
            return
        if is_route_member:
            self.route_way_lines[way.id] = coords
        if infra_mode is not None:
            props = {
                **tags,
                "osm_type": "way",
                "osm_id": str(way.id),
                "kind": "infra",
            }
            # An explicit "mode" tag on the way takes precedence.
            props.setdefault("mode", infra_mode)
            self.infrastructure_features.append(
                {"type": "Feature", "geometry": {"type": "LineString", "coordinates": coords}, "properties": props}
            )
        if is_stop:
            feature = _way_area_or_line_feature(way, tags, coords)
            if feature is not None:
                self.stop_features.append(feature)

    def features(self) -> list[dict[str, Any]]:
        """Return all features: routes first, then infrastructure, then stops.

        Route relations with no usable member geometry are skipped and
        counted in ``route_relations_without_geometry``. The counter is
        reset on every call, so calling this more than once reports the same
        count.
        """
        self.route_relations_without_geometry = 0
        route_features = []
        for relation_id, route in self.route_relations.items():
            ordered_lines = _ordered_route_lines(route["way_refs"], self.route_way_lines)
            if not ordered_lines:
                self.route_relations_without_geometry += 1
                continue
            geometry: dict[str, Any]
            if len(ordered_lines) == 1:
                geometry = {"type": "LineString", "coordinates": ordered_lines[0]}
            else:
                geometry = {"type": "MultiLineString", "coordinates": ordered_lines}
            props = {
                **route["tags"],
                "osm_type": "relation",
                "osm_id": str(relation_id),
                "member_way_count": len(route["way_refs"]),
                "geometry_source": "ordered_route_relation_member_ways",
                "geometry_part_count": len(ordered_lines),
            }
            route_features.append({"type": "Feature", "geometry": geometry, "properties": props})
        return route_features + self.infrastructure_features + self.stop_features
def _ordered_route_lines(way_refs: list[int], route_way_lines: dict[int, list[list[float]]]) -> list[list[list[float]]]:
    """Stitch a relation's member ways into as few connected parts as possible.

    Ways are visited in member order. Each way is joined to the most recent
    part when they share an endpoint, otherwise to an earlier part (newest
    first), otherwise it starts a new part. Ways with no stored line or
    fewer than two coordinates are skipped. Coordinates are copied, so
    ``route_way_lines`` is left untouched.
    """
    parts: list[list[list[float]]] = []
    for way_ref in way_refs:
        line = route_way_lines.get(way_ref)
        if not line:
            continue
        segment = [list(point) for point in line]
        if len(segment) < 2:
            continue
        if parts and _append_connected(parts[-1], segment):
            continue
        # any() stops at the first earlier part that accepts the segment.
        if any(_append_connected(earlier, segment) for earlier in reversed(parts[:-1])):
            continue
        parts.append(segment)
    return parts
def _append_connected(part: list[list[float]], coords: list[list[float]]) -> bool:
    """Join ``coords`` onto ``part`` in place when they share an endpoint.

    The four endpoint pairings are tried in order: part end to line start,
    part end to line end, part start to line end, part start to line start.
    The line is reversed where needed and the shared coordinate is not
    duplicated. Returns ``True`` when a join happened, ``False`` otherwise.
    """
    line_start = coords[0]
    line_end = coords[-1]

    if _same_coord(part[-1], line_start):
        part.extend(coords[1:])
    elif _same_coord(part[-1], line_end):
        part.extend(coords[-2::-1])
    elif _same_coord(part[0], line_end):
        part[0:0] = coords[:-1]
    elif _same_coord(part[0], line_start):
        part[0:0] = coords[:0:-1]
    else:
        return False
    return True
def _same_coord(left: list[float], right: list[float]) -> bool:
return len(left) >= 2 and len(right) >= 2 and abs(left[0] - right[0]) < 1e-9 and abs(left[1] - right[1]) < 1e-9
def _tags_dict(tags: osmium.osm.TagList) -> dict[str, str]:
return {tag.k: tag.v for tag in tags}
def _route_mode(tags: dict[str, str]) -> str | None:
    """Return the transport mode of a route from its ``route`` tag.

    Values outside ``ROUTE_MODES`` (or a missing tag) give ``None``;
    ``railway`` is reported as ``train``.
    """
    route_value = tags.get("route")
    if route_value not in ROUTE_MODES:
        return None
    if route_value == "railway":
        return "train"
    return route_value
def _is_transport_infrastructure(tags: dict[str, str]) -> bool:
    """Tell whether the tags map to any infrastructure mode."""
    mode = _infrastructure_mode(tags)
    return mode is not None
def _infrastructure_mode(tags: dict[str, str]) -> str | None:
    """Return the infrastructure mode implied by the tags, or ``None``.

    Checked in order: a ``railway`` value listed in ``RAILWAY_MODE_BY_TAG``,
    ``route=ferry``, then any ``aerialway`` value other than ``station``.
    """
    railway_tag = tags.get("railway")
    if railway_tag in RAILWAY_MODE_BY_TAG:
        return RAILWAY_MODE_BY_TAG[railway_tag]

    if tags.get("route") == "ferry":
        return "ferry"

    aerialway_tag = tags.get("aerialway")
    return "aerialway" if aerialway_tag and aerialway_tag != "station" else None
def _is_stop_or_station(tags: dict[str, str]) -> bool:
if tags.get("public_transport") in {"platform", "stop_position", "station"}:
return True
if tags.get("railway") in {"station", "halt", "tram_stop", "subway_entrance", "platform"}:
return True
if tags.get("highway") == "bus_stop":
return True
if tags.get("amenity") in {"bus_station", "ferry_terminal"}:
return True
if tags.get("aerialway") == "station":
return True
return False
def _node_coords(node: osmium.osm.Node) -> list[float] | None:
try:
if not node.location.valid():
return None
return [float(node.location.lon), float(node.location.lat)]
except Exception:
return None
def _way_coords(way: osmium.osm.Way) -> list[list[float]] | None:
coords = []
try:
for node in way.nodes:
if not node.location.valid():
return None
coords.append([float(node.location.lon), float(node.location.lat)])
except Exception:
return None
return coords if len(coords) >= 2 else None
def _way_area_or_line_feature(way: osmium.osm.Way, tags: dict[str, str], coords: list[list[float]] | None) -> dict[str, Any] | None:
if coords is None:
return None
props = {
**tags,
"osm_type": "way",
"osm_id": str(way.id),
}
if len(coords) >= 4 and coords[0] == coords[-1]:
return {"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [coords]}, "properties": props}
return {"type": "Feature", "geometry": {"type": "LineString", "coordinates": coords}, "properties": props}
def _record_pipeline_stage(
    session: Session,
    *,
    stage: str,
    version: str,
    source_id: int,
    dataset: Dataset,
    inputs: dict[str, Any],
    outputs: dict[str, Any] | None,
) -> None:
    """Record a pipeline run for ``stage`` by starting and immediately finishing it.

    The dependency hash is computed from ``inputs``. Missing or empty
    ``outputs`` are stored as an empty dict.
    """
    run = start_pipeline_run(
        session,
        stage=stage,
        version=version,
        dependency_hash_value=dependency_hash(inputs),
        source_id=source_id,
        dataset_id=dataset.id,
        inputs=inputs,
    )
    finish_pipeline_run(session, run, outputs=outputs if outputs else {})
def _raw_format(path: Path) -> str:
name = path.name.lower()
if name.endswith(".osm.pbf") or name.endswith(".pbf"):
return "osm_pbf"
if name.endswith(".osm") or name.endswith(".osm.xml") or name.endswith(".xml"):
return "osm_xml"
if name.endswith(".osc") or name.endswith(".osc.gz"):
return "osm_change"
return "osm"