1582 lines
65 KiB
Python
1582 lines
65 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import shutil
|
|
import subprocess
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import osmium
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.db import SessionLocal
|
|
from app.db_lock import database_write_lock
|
|
from app.models import Dataset, OsmDiffState, Source
|
|
from app.osm_storage import OSM_STORAGE_MAIN, OSM_STORAGE_SIDECAR_FEATURES, effective_osm_feature_storage
|
|
from app.performance import measure_pipeline_phase
|
|
from app.pipeline.download import materialize_source
|
|
from app.pipeline.osm_geojson import import_osm_geojson, prepare_osm_geojson_storage
|
|
from app.pipeline.osm_replication import ReplicationState, apply_osm_changes, download_diff, fetch_replication_state
|
|
from app.pipeline.state import (
|
|
STAGE_ACQUIRE_RAW,
|
|
STAGE_BUILD_INDEXES,
|
|
STAGE_EXTRACT_GEOMETRY,
|
|
STAGE_FILTER_TRANSPORT,
|
|
dependency_hash,
|
|
finish_pipeline_run,
|
|
start_pipeline_run,
|
|
)
|
|
from app.pipeline.utils import sha256_file
|
|
|
|
# Lower-case transport mode identifiers recognised by this module.
ROUTE_MODES = {
    "train", "railway", "light_rail", "subway",
    "tram", "bus", "trolleybus", "coach",
    "ferry", "monorail", "funicular", "aerialway",
}

# Maps a rail-type tag value onto one of the mode identifiers above.
# NOTE(review): presumably keyed by the OSM ``railway=*`` tag value; the lookup
# site is not visible in this part of the file -- confirm.
RAILWAY_MODE_BY_TAG = {
    "rail": "train",
    "light_rail": "light_rail",
    "subway": "subway",
    "tram": "tram",
    "monorail": "monorail",
    "funicular": "funicular",
}

# Version identifiers stored alongside cached artefacts and pipeline stage rows.
# The staged import reuses a cached extract / filtered PBF only when the stored
# identifier still equals the current one.
EXTRACTOR_VERSION = "osmium_transport_geojson_v2_ordered_relation_members"
TRANSPORT_FILTER_VERSION = "osmium_transport_filter_v1"
RAW_ACQUIRE_VERSION = "osm_raw_acquire_v1"
OSM_SIDECAR_INDEX_VERSION = "osm_sidecar_indexes_v1"
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _SourceRef:
|
|
id: int
|
|
name: str
|
|
kind: str
|
|
url: str
|
|
country: str | None = None
|
|
license: str | None = None
|
|
notes: str | None = None
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _DatasetRef:
|
|
id: int
|
|
source_id: int
|
|
kind: str
|
|
local_path: str
|
|
sha256: str
|
|
status: str
|
|
metadata: dict[str, Any]
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _PreparedRawFile:
|
|
path: Path
|
|
sha256: str
|
|
metadata: dict[str, Any]
|
|
replication_state: ReplicationState | None = None
|
|
diff_state_metadata: dict[str, Any] | None = None
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _PreparedTransportFile:
|
|
path: Path
|
|
sha256: str
|
|
metadata: dict[str, Any]
|
|
reused: bool
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _PreparedExtract:
|
|
path: Path
|
|
sha256: str
|
|
summary: dict[str, Any]
|
|
reused: bool
|
|
|
|
|
|
def run_osm_pbf_source(session: Session, source: Source, progress_callback=None) -> Dataset:
    """Import an OSM PBF source using the caller's session and return the derived dataset.

    Steps: prepare the raw dataset, optionally pre-filter it to a transport
    PBF, return an already-derived dataset when one exists for that input,
    otherwise extract GeoJSON, import it, and annotate the resulting dataset
    with provenance metadata.

    Args:
        session: Open session; it is flushed here but never committed.
        source: Source row to import.
        progress_callback: Forwarded to the raw-dataset preparation step only.

    Returns:
        The derived GeoJSON dataset (existing or newly imported).
    """
    raw_dataset = _prepare_raw_osm_dataset(session, source, progress_callback=progress_callback)

    # The extractor input is the raw file unless a transport pre-filter applies.
    extract_input = raw_dataset
    pbf_path = Path(raw_dataset.local_path)
    if _should_prefilter(pbf_path):
        extract_input = _prepare_transport_pbf(session, source, raw_dataset, pbf_path)
        pbf_path = Path(extract_input.local_path)

    already_derived = _find_existing_derived(session, source, extract_input)
    if already_derived is not None:
        return already_derived

    geojson_path = (
        settings.data_dir
        / "derived"
        / f"source_{source.id}"
        / f"extract_dataset_{extract_input.id}"
        / "transport.geojson"
    )
    extract_summary = extract_osm_transport_geojson(pbf_path, geojson_path)

    extract_input.status = "extracted"
    _update_dataset_metadata(extract_input, extractor=EXTRACTOR_VERSION, extract_summary=extract_summary)
    was_prefiltered = extract_input.id != raw_dataset.id
    if was_prefiltered:
        # Link the raw dataset to the filtered dataset derived from it.
        raw_dataset.status = "filtered"
        _update_dataset_metadata(raw_dataset, filtered_dataset_id=extract_input.id)
    session.flush()

    derived = import_osm_geojson(session=session, source=source, path=geojson_path)
    provenance = json.loads(derived.metadata_json or "{}")
    provenance.update(
        stage="derived_osm_transport_geojson",
        derived_from_dataset_id=extract_input.id,
        raw_dataset_id=raw_dataset.id,
        filtered_dataset_id=extract_input.id if was_prefiltered else None,
        extractor=EXTRACTOR_VERSION,
        extract_summary=extract_summary,
    )
    derived.metadata_json = json.dumps(provenance, indent=2)
    session.flush()
    return derived
|
|
|
|
|
|
def run_osm_pbf_source_staged(source_id: int, progress_callback=None) -> Dataset:
    """Run a large OSM PBF import while holding the DB write lock only briefly.

    File-level work (materialising the raw source, the optional osmium
    transport filter, GeoJSON extraction and storage/sidecar creation) runs
    outside the global SQLite write lock and can resume from cached files.
    Dataset rows are reserved and activated in short transactions.

    Args:
        source_id: Primary key of the ``osm_pbf`` source to import.
        progress_callback: Optional callback handed to ``_emit_progress``.

    Returns:
        The activated (or reused) derived dataset, loaded in a fresh session.
    """
    total_steps = 7

    def notify(event: str, message: str, step: int, details: dict[str, Any]) -> None:
        _emit_progress(progress_callback, event, message, step, total_steps, details)

    def describe(ref: _DatasetRef) -> dict[str, Any]:
        return {"dataset_id": ref.id, "path": ref.local_path, "sha256": ref.sha256}

    source_ref = _load_source_ref(source_id)
    _mark_source_running(source_ref.id)
    notify(
        "osm_staged_import_started",
        f"Preparing staged OSM import for {source_ref.name}.",
        0,
        {"source_id": source_ref.id},
    )

    # Stage 1-2: raw file on disk, then its dataset row.
    prepared_raw = _prepare_raw_file_staged(source_ref, progress_callback=progress_callback)
    raw_dataset = _reserve_raw_dataset(source_ref, prepared_raw)
    notify(
        "osm_raw_dataset_reserved",
        f"Reserved raw OSM dataset #{raw_dataset.id}.",
        2,
        describe(raw_dataset),
    )

    # Stage 3: optional transport pre-filter; the extractor reads its output when present.
    filtered_dataset: _DatasetRef | None = None
    input_dataset = raw_dataset
    input_path = Path(raw_dataset.local_path)
    if _should_prefilter(input_path):
        prepared_transport = _prepare_transport_file_staged(
            source_ref, raw_dataset, input_path, progress_callback=progress_callback
        )
        filtered_dataset = _reserve_transport_dataset(source_ref, raw_dataset, prepared_transport)
        input_dataset = filtered_dataset
        input_path = Path(filtered_dataset.local_path)
        notify(
            "osm_transport_dataset_reserved",
            f"Reserved filtered OSM transport dataset #{filtered_dataset.id}.",
            3,
            {**describe(filtered_dataset), "reused": prepared_transport.reused},
        )

    # Short-circuit when a derived dataset already exists for this input.
    reusable = _existing_active_derived_ref(source_ref.id, input_dataset.id)
    if reusable is not None:
        _activate_existing_derived(source_ref.id, reusable.id)
        notify(
            "osm_staged_import_reused",
            f"Reused active OSM transport dataset #{reusable.id}.",
            7,
            {"dataset_id": reusable.id},
        )
        return _load_dataset(reusable.id)

    # Stage 4-5: GeoJSON extraction and the derived dataset row.
    extract = _extract_transport_geojson_staged(
        source_ref, input_dataset, input_path, progress_callback=progress_callback
    )
    derived_dataset = _reserve_derived_dataset(
        source_ref=source_ref,
        raw_dataset=raw_dataset,
        input_dataset=input_dataset,
        filtered_dataset=filtered_dataset,
        extract=extract,
    )
    notify(
        "osm_derived_dataset_reserved",
        f"Reserved derived OSM dataset #{derived_dataset.id}.",
        5,
        {**describe(derived_dataset), "extract_reused": extract.reused},
    )

    # Stage 6-7: feature storage, then activation.
    sidecar_metadata = _prepare_derived_storage_staged(
        derived_dataset, extract, progress_callback=progress_callback
    )
    activated_id = _activate_staged_osm_import(
        source_ref=source_ref,
        raw_dataset=raw_dataset,
        filtered_dataset=filtered_dataset,
        input_dataset=input_dataset,
        derived_dataset=derived_dataset,
        extract=extract,
        sidecar_metadata=sidecar_metadata,
    )
    notify(
        "osm_staged_import_completed",
        f"Activated OSM dataset #{activated_id}.",
        7,
        {"dataset_id": activated_id},
    )
    return _load_dataset(activated_id)
|
|
|
|
|
|
def _load_source_ref(source_id: int) -> _SourceRef:
    """Read a source row and return its values as an immutable ``_SourceRef``.

    Raises:
        ValueError: If no source has this id, or its kind is not ``osm_pbf``.
    """
    copied_fields = ("id", "name", "kind", "url", "country", "license", "notes")
    with SessionLocal() as session:
        row = session.get(Source, source_id)
        if row is None:
            raise ValueError(f"source not found: {source_id}")
        if row.kind != "osm_pbf":
            raise ValueError(f"staged OSM import requires source kind osm_pbf, got {row.kind}")
        # Copy the values while the session is still open.
        return _SourceRef(**{field: getattr(row, field) for field in copied_fields})
|
|
|
|
|
|
def _load_dataset(dataset_id: int) -> Dataset:
    """Fetch a dataset row by id in a short-lived session.

    Raises:
        ValueError: If the row does not exist.
    """
    with SessionLocal() as session:
        row = session.get(Dataset, dataset_id)
        if row is not None:
            return row
        raise ValueError(f"dataset not found after staged import: {dataset_id}")
|
|
|
|
|
|
def _mark_source_running(source_id: int) -> None:
    """Set the source to ``running``, clear its last error and stamp the run time (UTC).

    Runs under the database write lock with a 30 second timeout.

    Raises:
        ValueError: If the source row does not exist.
    """
    lock_name = f"osm_staged_import:{source_id}:start"
    with database_write_lock(lock_name, timeout=30), SessionLocal() as session:
        row = session.get(Source, source_id)
        if row is None:
            raise ValueError(f"source not found: {source_id}")
        row.status = "running"
        row.last_error = None
        row.last_run_at = datetime.now(timezone.utc)
        session.commit()
|
|
|
|
|
|
def _prepare_raw_file_staged(source: _SourceRef, progress_callback=None) -> _PreparedRawFile:
    """Produce the raw OSM file for a source, preferring replication diffs over a full snapshot.

    When the diff path yields a file it is returned as-is. Otherwise the full
    snapshot is materialised and hashed, and the current replication state is
    attached to the metadata when it can be fetched.
    """
    from_diffs = _try_prepare_raw_file_from_diffs_staged(source, progress_callback=progress_callback)
    if from_diffs is not None:
        return from_diffs

    _emit_progress(
        progress_callback,
        "osm_full_snapshot_started",
        f"Downloading/copying full OSM snapshot for {source.name}.",
        1,
        7,
        {"source_id": source.id},
    )
    with measure_pipeline_phase("osm_full_snapshot", source_id=source.id, metadata={"url": source.url}) as metric:
        snapshot_path = materialize_source(source)  # type: ignore[arg-type]
        snapshot_hash = sha256_file(snapshot_path)
        size_bytes = snapshot_path.stat().st_size if snapshot_path.exists() else None
        metric.update({"path": str(snapshot_path), "sha256": snapshot_hash, "bytes": size_bytes})

    metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(snapshot_path),
        "source_url": source.url,
        "import_mode": "staged_short_lock",
    }
    replication_state = _fetch_current_replication_state_for_snapshot(source, progress_callback=progress_callback)
    diff_state_metadata = None
    if replication_state is not None:
        # Record the replication position the snapshot is paired with.
        metadata["replication_state"] = {
            "updates_url": _source_updates_url(source),  # type: ignore[arg-type]
            "sequence_number": replication_state.sequence_number,
            "timestamp": replication_state.timestamp,
        }
        diff_state_metadata = {"source": "full_snapshot"}

    _emit_progress(
        progress_callback,
        "osm_full_snapshot_completed",
        "Prepared raw OSM snapshot file.",
        1,
        7,
        {"path": str(snapshot_path), "sha256": snapshot_hash},
    )
    return _PreparedRawFile(
        path=snapshot_path,
        sha256=snapshot_hash,
        metadata=metadata,
        replication_state=replication_state,
        diff_state_metadata=diff_state_metadata,
    )
|
|
|
|
|
|
def _try_prepare_raw_file_from_diffs_staged(source: _SourceRef, progress_callback=None) -> _PreparedRawFile | None:
    """Try to bring the local raw extract up to date with replication diffs.

    Returns:
        A prepared raw file when the local base is already current or the
        diffs were applied; ``None`` when the caller should fall back to a
        full snapshot. Every fallback except "no updates URL" emits an
        ``osm_diff_fallback`` progress event.
    """
    updates_url = _source_updates_url(source)  # type: ignore[arg-type]
    if not updates_url:
        return None

    def fall_back(message: str, details: dict[str, Any], current=None, total=None) -> None:
        # Report why diffs are not used; the None result signals "use full snapshot".
        _emit_progress(progress_callback, "osm_diff_fallback", message, current, total, details)
        return None

    url_details = {"updates_url": updates_url}

    # Read the local replication position and its base dataset in one short session.
    with SessionLocal() as session:
        state_row = _latest_diff_state(session, source.id)
        if state_row is None or state_row.raw_dataset_id is None:
            return fall_back("No local OSM replication state yet; using full snapshot.", url_details)
        base_row = session.get(Dataset, state_row.raw_dataset_id)
        if base_row is None or not Path(base_row.local_path).exists():
            return fall_back("Local raw OSM base is missing; using full snapshot.", url_details)
        base_ref = _dataset_ref(base_row)
        local_sequence = state_row.sequence_number

    try:
        remote_state = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - correctness fallback
        return fall_back(f"Could not read OSM replication state; using full snapshot: {exc}", url_details)

    remote_sequence = remote_state.sequence_number
    if remote_sequence <= local_sequence:
        # Nothing newer upstream: hand back the existing base file unchanged.
        _emit_progress(
            progress_callback,
            "osm_diff_up_to_date",
            "Local raw OSM extract is already at the latest known replication sequence.",
            remote_sequence,
            remote_sequence,
            {"updates_url": updates_url, "sequence_number": remote_sequence},
        )
        return _PreparedRawFile(
            path=Path(base_ref.local_path),
            sha256=base_ref.sha256,
            metadata=base_ref.metadata,
            replication_state=remote_state,
            diff_state_metadata={"source": "existing_raw_dataset", "raw_dataset_id": base_ref.id},
        )

    gap = remote_sequence - local_sequence
    if gap > settings.osm_diff_max_sequence_gap:
        return fall_back(
            "OSM replication gap is too large; using full snapshot.",
            {"gap": gap, "max_gap": settings.osm_diff_max_sequence_gap, "updates_url": updates_url},
            current=local_sequence,
            total=remote_sequence,
        )

    host_tool = _host_tool_path()
    if not host_tool.exists():
        return fall_back("host_tool.sh is missing; using full snapshot.", {"host_tool": str(host_tool)})

    try:
        return _apply_diff_range_files_staged(
            source=source,
            base_dataset=base_ref,
            updates_url=updates_url,
            local_sequence=local_sequence,
            remote_state=remote_state,
            host_tool=host_tool,
            progress_callback=progress_callback,
        )
    except Exception as exc:  # noqa: BLE001 - fall back to full snapshot rather than risk a bad base
        return fall_back(f"OSM diff application failed; using full snapshot: {exc}", url_details)
|
|
|
|
|
|
def _apply_diff_range_files_staged(
    *,
    source: _SourceRef,
    base_dataset: _DatasetRef,
    updates_url: str,
    local_sequence: int,
    remote_state: ReplicationState,
    host_tool: Path,
    progress_callback=None,
) -> _PreparedRawFile:
    """Download and apply replication diffs ``local_sequence + 1 .. remote`` in batches.

    Each batch is applied to the output of the previous one (starting from the
    base dataset file) and the result is stored via ``_store_updated_raw_pbf``.
    The final file is hashed and described in the returned ``_PreparedRawFile``.
    Exceptions from downloading or applying propagate to the caller.
    """
    source_root = settings.data_dir / "sources" / f"source_{source.id}"
    update_root = source_root / "updates"
    work_root = source_root / "diff_work"
    work_root.mkdir(parents=True, exist_ok=True)

    target_sequence = remote_state.sequence_number
    first_sequence = local_sequence + 1
    pending = list(range(first_sequence, target_sequence + 1))
    # A configured batch size below 1 is treated as 1.
    batch_size = max(1, int(settings.osm_diff_apply_batch_size))
    batches = [pending[start : start + batch_size] for start in range(0, len(pending), batch_size)]

    current_path = Path(base_dataset.local_path)
    applied_sequences: list[int] = []
    _emit_progress(
        progress_callback,
        "osm_diff_started",
        f"Applying {len(pending)} OSM replication diffs.",
        local_sequence,
        target_sequence,
        {"updates_url": updates_url, "from_sequence": first_sequence, "to_sequence": target_sequence},
    )
    with measure_pipeline_phase(
        "osm_diff_apply",
        source_id=source.id,
        metadata={"from_sequence": first_sequence, "to_sequence": target_sequence},
    ) as metric:
        for batch in batches:
            batch_first, batch_last = batch[0], batch[-1]

            downloaded = []
            for sequence in batch:
                diff_path = download_diff(updates_url, sequence, update_root)
                downloaded.append(diff_path)
                _emit_progress(
                    progress_callback,
                    "osm_diff_downloaded",
                    f"Downloaded OSM diff sequence {sequence}.",
                    sequence,
                    target_sequence,
                    {"path": str(diff_path), "sequence_number": sequence},
                )

            scratch_output = work_root / f"source_{source.id}_{batch_first}_{batch_last}.tmp.osm.pbf"
            completed = apply_osm_changes(current_path, downloaded, scratch_output, host_tool)
            # The stored file becomes the base for the next batch.
            current_path = _store_updated_raw_pbf(source, scratch_output)  # type: ignore[arg-type]
            applied_sequences.extend(batch)
            _emit_progress(
                progress_callback,
                "osm_diff_applied",
                f"Applied OSM diff sequences {batch_first}-{batch_last}.",
                batch_last,
                target_sequence,
                {
                    "output_path": str(current_path),
                    "stdout": completed.stdout.strip(),
                    "stderr": completed.stderr.strip(),
                    "batch_start": batch_first,
                    "batch_end": batch_last,
                },
            )

        raw_hash = sha256_file(current_path)
        metric.update(
            {
                "applied_sequences": applied_sequences,
                "path": str(current_path),
                "sha256": raw_hash,
                "bytes": current_path.stat().st_size if current_path.exists() else None,
            }
        )

    metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(current_path),
        "source_url": source.url,
        "import_mode": "staged_short_lock",
        "replication_state": {
            "updates_url": updates_url,
            "sequence_number": target_sequence,
            "timestamp": remote_state.timestamp,
        },
        "diff_update": {
            "base_dataset_id": base_dataset.id,
            "base_sequence_number": local_sequence,
            "applied_sequences": applied_sequences,
        },
    }
    return _PreparedRawFile(
        path=current_path,
        sha256=raw_hash,
        metadata=metadata,
        replication_state=remote_state,
        diff_state_metadata={"base_dataset_id": base_dataset.id, "applied_sequences": applied_sequences},
    )
|
|
|
|
|
|
def _fetch_current_replication_state_for_snapshot(source: _SourceRef, progress_callback=None) -> ReplicationState | None:
    """Best-effort lookup of the current replication state for a full snapshot.

    Returns ``None`` when the source has no updates URL or the state cannot be
    fetched; the latter also emits ``osm_diff_state_unavailable``.
    """
    updates_url = _source_updates_url(source)  # type: ignore[arg-type]
    if not updates_url:
        return None

    state: ReplicationState | None = None
    try:
        state = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - full snapshot is still usable without diff state
        _emit_progress(
            progress_callback,
            "osm_diff_state_unavailable",
            f"Could not record OSM replication state: {exc}",
            None,
            None,
            {"updates_url": updates_url},
        )
    return state
|
|
|
|
|
|
def _reserve_raw_dataset(source_ref: _SourceRef, prepared: _PreparedRawFile) -> _DatasetRef:
    """Create or refresh the raw dataset row for a prepared raw file.

    Under the database write lock (60 second timeout) this finds the raw
    dataset matching the file hash or inserts a new inactive one, records the
    replication state when one is attached, records the acquire pipeline stage,
    keeps the source in ``running`` state and commits.

    Raises:
        ValueError: If the source row no longer exists.
    """
    lock_name = f"osm_staged_import:{source_ref.id}:reserve_raw"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_ref.id)
        if source is None:
            raise ValueError(f"source not found: {source_ref.id}")

        raw_path = str(prepared.path)
        row = _find_raw_dataset(session, source, prepared.sha256)
        if row is not None:
            # Existing row: repoint it and layer the new metadata over the stored metadata.
            row.local_path = raw_path
            row.status = "committed"
            row.metadata_json = json.dumps({**_metadata(row), **prepared.metadata}, indent=2)
        else:
            row = Dataset(
                source_id=source.id,
                kind="osm_pbf_raw",
                local_path=raw_path,
                sha256=prepared.sha256,
                is_active=False,
                status="committed",
                metadata_json=json.dumps(prepared.metadata, indent=2),
            )
            session.add(row)
            session.flush()

        if prepared.replication_state is not None:
            # Prefer the URL stored with the metadata, then the source's URL, then "".
            recorded_url = prepared.metadata.get("replication_state", {}).get("updates_url")
            _record_diff_state(
                session,
                source=source,
                raw_dataset=row,
                updates_url=str(recorded_url or _source_updates_url(source) or ""),
                state=prepared.replication_state,
                metadata=prepared.diff_state_metadata,
            )

        stage_inputs = {
            "source_url": source.url,
            "source_kind": source.kind,
            "remote": prepared.metadata.get("replication_state") or prepared.metadata.get("source_url"),
        }
        stage_outputs = {
            "path": raw_path,
            "sha256": prepared.sha256,
            "raw_format": prepared.metadata.get("raw_format"),
            "diff_update": prepared.metadata.get("diff_update"),
        }
        _record_pipeline_stage(
            session,
            stage=STAGE_ACQUIRE_RAW,
            version=RAW_ACQUIRE_VERSION,
            source_id=source.id,
            dataset=row,
            inputs=stage_inputs,
            outputs=stage_outputs,
        )

        source.status = "running"
        source.last_error = None
        session.commit()
        return _dataset_ref(row)
|
|
|
|
|
|
def _prepare_transport_file_staged(source: _SourceRef, raw_dataset: _DatasetRef, raw_path: Path, progress_callback=None) -> _PreparedTransportFile:
    """Produce the transport-only PBF for a raw dataset, reusing a cached one when valid.

    The cached file is reused when it exists and its ``.metadata.json``
    companion names the same input hash and filter version. Otherwise the
    filter script is run as a subprocess and a new companion file is written.

    Raises:
        FileNotFoundError: If the filter script does not exist.
        RuntimeError: If the filter script exits with a non-zero status.
    """
    output_path = _transport_filter_path_for_raw_id(source.id, raw_dataset.id, raw_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    metadata_path = output_path.with_suffix(output_path.suffix + ".metadata.json")

    cached_metadata = _read_json_file(metadata_path)
    cache_is_current = (
        output_path.exists()
        and cached_metadata.get("input_sha256") == raw_dataset.sha256
        and cached_metadata.get("filter") == TRANSPORT_FILTER_VERSION
    )
    if cache_is_current:
        cached_hash = sha256_file(output_path)
        _emit_progress(
            progress_callback,
            "osm_transport_filter_reused",
            "Reusing existing filtered OSM transport extract.",
            3,
            7,
            {"path": str(output_path), "sha256": cached_hash},
        )
        return _PreparedTransportFile(path=output_path, sha256=cached_hash, metadata=cached_metadata, reused=True)

    script_path = _prefilter_script_path()
    if not script_path.exists():
        raise FileNotFoundError(f"OSM transport filter script not found: {script_path}")

    io_paths = {"input_path": str(raw_path), "output_path": str(output_path)}
    _emit_progress(
        progress_callback,
        "osm_transport_filter_started",
        "Filtering OSM PBF to public-transport objects.",
        2,
        7,
        dict(io_paths),
    )
    with measure_pipeline_phase(
        "osm_transport_filter",
        source_id=source.id,
        dataset_id=raw_dataset.id,
        metadata=dict(io_paths),
    ) as metric:
        command = [str(script_path), str(raw_path), str(output_path)]
        try:
            completed = subprocess.run(command, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as exc:
            # Surface the most informative output available: stderr, then stdout, then the exit code.
            details = (exc.stderr or "").strip() or (exc.stdout or "").strip() or f"exit code {exc.returncode}"
            raise RuntimeError(f"OSM transport filter failed for {raw_path}: {details}") from exc
        filtered_hash = sha256_file(output_path)
        metric.update({"sha256": filtered_hash, "bytes": output_path.stat().st_size if output_path.exists() else None})

    metadata = {
        "stage": "filtered_osm_transport_pbf",
        "raw_format": _raw_format(output_path),
        "derived_from_dataset_id": raw_dataset.id,
        "source_url": source.url,
        "filter": TRANSPORT_FILTER_VERSION,
        "filter_script": str(script_path),
        "input_path": str(raw_path),
        "input_sha256": raw_dataset.sha256,
        "output_path": str(output_path),
        "stdout": completed.stdout.strip(),
        "stderr": completed.stderr.strip(),
        "import_mode": "staged_short_lock",
    }
    # The companion file is what lets the next run recognise this output as reusable.
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    _emit_progress(
        progress_callback,
        "osm_transport_filter_completed",
        "Filtered OSM transport extract.",
        3,
        7,
        {"path": str(output_path), "sha256": filtered_hash},
    )
    return _PreparedTransportFile(path=output_path, sha256=filtered_hash, metadata=metadata, reused=False)
|
|
|
|
|
|
def _reserve_transport_dataset(source_ref: _SourceRef, raw_dataset: _DatasetRef, prepared: _PreparedTransportFile) -> _DatasetRef:
    """Create or refresh the filtered-transport dataset row for a raw dataset.

    Under the database write lock (60 second timeout) this inserts or updates
    the ``osm_pbf_transport`` row, marks the raw dataset as ``filtered`` with a
    link to it, records the filter pipeline stage and commits.

    Raises:
        ValueError: If the source or raw dataset row no longer exists.
    """
    lock_name = f"osm_staged_import:{source_ref.id}:reserve_transport"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_ref.id)
        raw_row = session.get(Dataset, raw_dataset.id)
        if source is None or raw_row is None:
            raise ValueError("source or raw dataset disappeared during staged import")

        filtered_path = str(prepared.path)
        serialized_metadata = json.dumps(prepared.metadata, indent=2)
        row = _find_transport_dataset_by_raw_id(session, source.id, raw_dataset.id)
        if row is not None:
            row.local_path = filtered_path
            row.sha256 = prepared.sha256
            row.status = "filtered"
            row.metadata_json = serialized_metadata
        else:
            row = Dataset(
                source_id=source.id,
                kind="osm_pbf_transport",
                local_path=filtered_path,
                sha256=prepared.sha256,
                is_active=False,
                status="filtered",
                metadata_json=serialized_metadata,
            )
            session.add(row)
            # Flush so the new row has an id for the link written below.
            session.flush()

        raw_row.status = "filtered"
        raw_row.metadata_json = json.dumps({**_metadata(raw_row), "filtered_dataset_id": row.id}, indent=2)

        _record_pipeline_stage(
            session,
            stage=STAGE_FILTER_TRANSPORT,
            version=TRANSPORT_FILTER_VERSION,
            source_id=source.id,
            dataset=row,
            inputs={
                "raw_dataset_id": raw_dataset.id,
                "raw_sha256": raw_dataset.sha256,
                "filter_script": prepared.metadata.get("filter_script"),
            },
            outputs={"path": filtered_path, "sha256": prepared.sha256, "reused": prepared.reused},
        )
        session.commit()
        return _dataset_ref(row)
|
|
|
|
|
|
def _extract_transport_geojson_staged(source: _SourceRef, input_dataset: _DatasetRef, input_path: Path, progress_callback=None) -> _PreparedExtract:
    """Extract transport GeoJSON from a PBF, reusing a cached extract when valid.

    The cached ``transport.geojson`` is reused when it exists and its
    ``transport.summary.json`` companion names the same input hash and
    extractor version and carries a dict ``extract_summary``. A companion file
    that lacks a usable ``extract_summary`` is treated as a cache miss and the
    extract is rebuilt, instead of raising ``KeyError``.

    Args:
        source: Source the extract belongs to.
        input_dataset: Dataset whose file is being extracted (raw or filtered).
        input_path: Path of the PBF to read.
        progress_callback: Optional callback handed to ``_emit_progress``.

    Returns:
        The extract path, its SHA-256, the extract summary and whether it was reused.
    """
    output_dir = settings.data_dir / "derived" / f"source_{source.id}" / f"extract_dataset_{input_dataset.id}"
    output_path = output_dir / "transport.geojson"
    summary_path = output_path.with_suffix(".summary.json")

    existing_summary = _read_json_file(summary_path)
    cached_extract_summary = existing_summary.get("extract_summary")
    cache_is_current = (
        output_path.exists()
        and existing_summary.get("input_sha256") == input_dataset.sha256
        and existing_summary.get("extractor") == EXTRACTOR_VERSION
        # An incomplete or hand-edited summary must trigger re-extraction, not a crash.
        and isinstance(cached_extract_summary, dict)
    )
    if cache_is_current:
        output_hash = sha256_file(output_path)
        _emit_progress(
            progress_callback,
            "osm_extract_reused",
            "Reusing existing extracted OSM transport GeoJSON.",
            4,
            7,
            {"path": str(output_path), "sha256": output_hash},
        )
        return _PreparedExtract(path=output_path, sha256=output_hash, summary=cached_extract_summary, reused=True)

    _emit_progress(
        progress_callback,
        "osm_extract_started",
        "Extracting route, stop, and infrastructure geometry from OSM.",
        4,
        7,
        {"input_path": str(input_path), "output_path": str(output_path)},
    )
    with measure_pipeline_phase(
        "osm_transport_extract",
        source_id=source.id,
        dataset_id=input_dataset.id,
        metadata={"input_path": str(input_path), "output_path": str(output_path)},
    ) as metric:
        extract_summary = extract_osm_transport_geojson(input_path, output_path)
        output_hash = sha256_file(output_path)
        metric.update({**extract_summary, "sha256": output_hash, "bytes": output_path.stat().st_size if output_path.exists() else None})

    summary = {
        "input_dataset_id": input_dataset.id,
        "input_sha256": input_dataset.sha256,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract_summary,
    }
    # The companion file is what lets the next run recognise this extract as reusable.
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    _emit_progress(
        progress_callback,
        "osm_extract_completed",
        "Extracted OSM transport GeoJSON.",
        4,
        7,
        {"path": str(output_path), "sha256": output_hash, **extract_summary},
    )
    return _PreparedExtract(path=output_path, sha256=output_hash, summary=extract_summary, reused=False)
|
|
|
|
|
|
def _existing_active_derived_ref(source_id: int, input_dataset_id: int) -> _DatasetRef | None:
    """Return a ref to the derived dataset already built from an input dataset, if any.

    Returns ``None`` when the source row is missing or ``_find_existing_derived``
    finds nothing.
    """
    with SessionLocal() as session:
        source = session.get(Source, source_id)
        if source is None:
            return None
        # _find_existing_derived takes a Dataset; a transient instance carrying only
        # the id is passed and never added to the session.
        probe = Dataset(id=input_dataset_id)
        match = _find_existing_derived(session, source, probe)
        return None if match is None else _dataset_ref(match)
|
|
|
|
|
|
def _activate_existing_derived(source_id: int, derived_dataset_id: int) -> None:
    """Make one derived dataset the only active dataset of its source and mark the source ``ok``.

    Runs under the database write lock (60 second timeout). Returns without
    changing anything when the source or the dataset row is missing.
    """
    lock_name = f"osm_staged_import:{source_id}:reuse_existing"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_id)
        target = session.get(Dataset, derived_dataset_id)
        if source is None or target is None:
            return

        active_id = target.id
        for candidate in source.datasets:
            # Exactly the target stays active; every other dataset of the source is deactivated.
            candidate.is_active = candidate.id == active_id

        source.status = "ok"
        source.last_error = None
        source.last_run_at = datetime.now(timezone.utc)
        session.commit()
|
|
|
|
|
|
def _reserve_derived_dataset(
    *,
    source_ref: _SourceRef,
    raw_dataset: _DatasetRef,
    input_dataset: _DatasetRef,
    filtered_dataset: _DatasetRef | None,
    extract: _PreparedExtract,
) -> _DatasetRef:
    """Create or refresh the derived GeoJSON dataset row for an extract.

    Under the database write lock (60 second timeout) this inserts or updates
    the ``osm_geojson`` row in ``sidecar_staging`` status, records the extract
    pipeline stage and commits.

    When an existing row is refreshed and its stored hash equals the extract
    hash, a stored ``sidecar_status`` of ``"ready"`` is kept instead of being
    reset to ``"pending"``. ``_prepare_derived_storage_staged`` only reuses an
    existing sidecar when the ref it receives still says ``"ready"``, so
    resetting it unconditionally made that reuse path unreachable and forced a
    sidecar rebuild on every run.

    Args:
        source_ref: Source the dataset belongs to.
        raw_dataset: Raw dataset the import started from.
        input_dataset: Dataset the extract was produced from (raw or filtered).
        filtered_dataset: Filtered dataset, or ``None`` when no pre-filter ran.
        extract: The prepared GeoJSON extract.

    Returns:
        A ref to the reserved dataset row.

    Raises:
        ValueError: If the source row no longer exists.
    """
    metadata = {
        "stage": "derived_osm_transport_geojson",
        "derived_from_dataset_id": input_dataset.id,
        "raw_dataset_id": raw_dataset.id,
        "filtered_dataset_id": None if filtered_dataset is None else filtered_dataset.id,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract.summary,
        "import_mode": "staged_short_lock",
        "sidecar_status": "pending",
    }
    with database_write_lock(f"osm_staged_import:{source_ref.id}:reserve_derived", timeout=60):
        with SessionLocal() as session:
            source = session.get(Source, source_ref.id)
            if source is None:
                raise ValueError(f"source not found: {source_ref.id}")
            dataset = _find_staged_derived_dataset(session, source.id, input_dataset.id, extract.sha256)
            if dataset is None:
                dataset = Dataset(
                    source_id=source.id,
                    kind="osm_geojson",
                    local_path=str(extract.path),
                    sha256=extract.sha256,
                    is_active=False,
                    status="sidecar_staging",
                    metadata_json=json.dumps(metadata, indent=2),
                )
                session.add(dataset)
                session.flush()
            else:
                stored_metadata = _metadata(dataset)
                merged_metadata = {**stored_metadata, **metadata}
                # Compare against the stored hash BEFORE overwriting it: a sidecar built
                # from byte-identical extract content is still valid and stays "ready".
                extract_unchanged = dataset.sha256 == extract.sha256
                if extract_unchanged and stored_metadata.get("sidecar_status") == "ready":
                    merged_metadata["sidecar_status"] = "ready"
                dataset.local_path = str(extract.path)
                dataset.sha256 = extract.sha256
                dataset.status = "sidecar_staging"
                dataset.metadata_json = json.dumps(merged_metadata, indent=2)
            _record_pipeline_stage(
                session,
                stage=STAGE_EXTRACT_GEOMETRY,
                version=EXTRACTOR_VERSION,
                source_id=source.id,
                dataset=dataset,
                inputs={
                    "input_dataset_id": input_dataset.id,
                    "input_sha256": input_dataset.sha256,
                    "extractor": EXTRACTOR_VERSION,
                },
                outputs={"path": str(extract.path), "sha256": extract.sha256, "summary": extract.summary, "reused": extract.reused},
            )
            session.commit()
            return _dataset_ref(dataset)
|
|
|
|
|
|
def _prepare_derived_storage_staged(derived_dataset: _DatasetRef, extract: _PreparedExtract, progress_callback=None) -> dict[str, object]:
    """Build (or reuse) the feature storage for a derived dataset and return its metadata.

    Existing storage is reused when the dataset metadata describes main-table
    storage marked ready, or a sidecar file that exists on disk and is marked
    ready; in both cases the dataset's metadata is returned unchanged.
    Otherwise storage is built in the currently effective mode and the
    returned metadata combines the dataset metadata, the builder's result and
    the resulting status flags.
    """
    known_metadata = derived_dataset.metadata
    dataset_id = derived_dataset.id

    storage = known_metadata.get("osm_storage")
    if isinstance(storage, dict):
        main_table_ready = storage.get("mode") == OSM_STORAGE_MAIN and known_metadata.get("storage_status") == "ready"
        if main_table_ready:
            _emit_progress(
                progress_callback,
                "osm_storage_reused",
                "Reusing existing OSM main-table storage.",
                6,
                7,
                {"dataset_id": dataset_id},
            )
            return known_metadata

        sidecar = storage.get("sidecar_path")
        sidecar_ready = sidecar and Path(str(sidecar)).exists() and known_metadata.get("sidecar_status") == "ready"
        if sidecar_ready:
            _emit_progress(
                progress_callback,
                "osm_sidecar_reused",
                "Reusing existing OSM feature sidecar.",
                6,
                7,
                {"dataset_id": dataset_id, "sidecar_path": str(sidecar)},
            )
            return known_metadata

    storage_mode = effective_osm_feature_storage()
    if storage_mode == OSM_STORAGE_MAIN:
        storage_label = "main-table OSM feature storage"
        started_event, completed_event = "osm_storage_started", "osm_storage_completed"
    else:
        storage_label = "OSM feature sidecar"
        started_event, completed_event = "osm_sidecar_started", "osm_sidecar_completed"

    _emit_progress(
        progress_callback,
        started_event,
        f"Building {storage_label}.",
        5,
        7,
        {"dataset_id": dataset_id, "path": str(extract.path), "storage_mode": storage_mode},
    )

    # Transient ORM object mirroring the reserved row; it is handed to the storage
    # builder and is never added to the session here.
    transient_dataset = Dataset(
        id=dataset_id,
        source_id=derived_dataset.source_id,
        kind=derived_dataset.kind,
        local_path=derived_dataset.local_path,
        sha256=derived_dataset.sha256,
        is_active=False,
        status=derived_dataset.status,
        metadata_json=json.dumps(known_metadata, indent=2),
    )
    with measure_pipeline_phase(
        "osm_sidecar_build",
        source_id=derived_dataset.source_id,
        dataset_id=dataset_id,
        metadata={"path": str(extract.path)},
    ) as metric:
        with SessionLocal() as session:
            sidecar_metadata = prepare_osm_geojson_storage(
                session=session,
                dataset=transient_dataset,
                path=extract.path,
                source_hash=derived_dataset.sha256,
                storage_mode=storage_mode,
            )
            session.commit()
        metric.update(sidecar_metadata)

    sidecar_status = "ready" if storage_mode == OSM_STORAGE_SIDECAR_FEATURES else "not_used"
    metadata = {
        **known_metadata,
        **sidecar_metadata,
        "sidecar_status": sidecar_status,
        "storage_status": "ready",
    }
    _emit_progress(
        progress_callback,
        completed_event,
        f"Built {storage_label}.",
        6,
        7,
        {"dataset_id": dataset_id, **sidecar_metadata},
    )
    return metadata
|
|
|
|
|
|
def _activate_staged_osm_import(
    *,
    source_ref: _SourceRef,
    raw_dataset: _DatasetRef,
    filtered_dataset: _DatasetRef | None,
    input_dataset: _DatasetRef,
    derived_dataset: _DatasetRef,
    extract: _PreparedExtract,
    sidecar_metadata: dict[str, object],
) -> int:
    """Mark the staged derived dataset as the active import for its source.

    All row updates happen in one session while the database write lock is
    held: every dataset of the source is deactivated, the raw (and optional
    filtered) datasets get their final status and extractor metadata, the
    derived dataset is activated, and a build-indexes pipeline stage is
    recorded.

    Returns the id of the activated derived dataset.

    Raises:
        ValueError: if the source, raw, or derived row can no longer be loaded.
    """
    filtered_id = None if filtered_dataset is None else filtered_dataset.id
    # Fixed keys are applied after the sidecar metadata so they win on conflict.
    derived_metadata = dict(sidecar_metadata)
    derived_metadata.update(
        {
            "stage": "derived_osm_transport_geojson",
            "derived_from_dataset_id": input_dataset.id,
            "raw_dataset_id": raw_dataset.id,
            "filtered_dataset_id": filtered_id,
            "extractor": EXTRACTOR_VERSION,
            "extract_summary": extract.summary,
            "import_mode": "staged_short_lock",
            "sidecar_status": "ready",
        }
    )

    # Prefer the nested storage description as stage output when it is a dict.
    stage_outputs = sidecar_metadata.get("osm_storage")
    if not isinstance(stage_outputs, dict):
        stage_outputs = sidecar_metadata

    with database_write_lock(f"osm_staged_import:{source_ref.id}:activate", timeout=60):
        with SessionLocal() as session:
            source = session.get(Source, source_ref.id)
            raw = session.get(Dataset, raw_dataset.id)
            filtered = None
            if filtered_dataset is not None:
                filtered = session.get(Dataset, filtered_dataset.id)
            derived = session.get(Dataset, derived_dataset.id)
            if source is None or raw is None or derived is None:
                raise ValueError("staged OSM activation lost source or dataset rows")

            for sibling in source.datasets:
                sibling.is_active = False

            raw.status = "extracted" if filtered is None else "filtered"
            raw.is_active = False
            raw.metadata_json = json.dumps(
                {**_metadata(raw), "extractor": EXTRACTOR_VERSION, "extract_summary": extract.summary},
                indent=2,
            )

            if filtered is not None:
                filtered.status = "extracted"
                filtered.is_active = False
                filtered.metadata_json = json.dumps(
                    {**_metadata(filtered), "extractor": EXTRACTOR_VERSION, "extract_summary": extract.summary},
                    indent=2,
                )

            derived.status = "imported"
            derived.is_active = True
            derived.local_path = str(extract.path)
            derived.sha256 = extract.sha256
            derived.metadata_json = json.dumps(derived_metadata, indent=2)

            _record_pipeline_stage(
                session,
                stage=STAGE_BUILD_INDEXES,
                version=OSM_SIDECAR_INDEX_VERSION,
                source_id=source.id,
                dataset=derived,
                inputs={
                    "dataset_id": derived.id,
                    "dataset_sha256": derived.sha256,
                    "sidecar_schema": "osm_features_v1",
                    "indexed_columns": ["kind", "mode", "route_scope", "bbox", "route_key", "ref", "identity"],
                },
                outputs=stage_outputs,
            )

            source.status = "ok"
            source.last_error = None
            source.last_run_at = datetime.now(timezone.utc)
            session.commit()
            return derived.id
|
|
|
|
|
|
def _find_transport_dataset_by_raw_id(session: Session, source_id: int, raw_dataset_id: int) -> Dataset | None:
    """Return the newest transport-filtered dataset derived from a raw dataset.

    Candidates are the source's ``osm_pbf_transport`` datasets, newest id
    first. A candidate matches when its metadata names ``raw_dataset_id`` as
    its origin and was produced by the current filter version. Returns
    ``None`` when nothing matches.
    """
    query = (
        select(Dataset)
        .where(Dataset.source_id == source_id, Dataset.kind == "osm_pbf_transport")
        .order_by(Dataset.id.desc())
    )

    def _is_current_filter_output(candidate: Dataset) -> bool:
        details = _metadata(candidate)
        if details.get("derived_from_dataset_id") != raw_dataset_id:
            return False
        return details.get("filter") == TRANSPORT_FILTER_VERSION

    return next(filter(_is_current_filter_output, session.scalars(query).all()), None)
|
|
|
|
|
|
def _find_staged_derived_dataset(session: Session, source_id: int, input_dataset_id: int, extract_hash: str) -> Dataset | None:
    """Return the newest staged GeoJSON dataset matching an extract, if any.

    Only ``osm_geojson`` datasets of the source whose status is
    ``sidecar_staging`` or ``importing`` and whose hash equals
    ``extract_hash`` are considered, newest id first. A candidate matches when
    its metadata points at ``input_dataset_id`` and the current extractor
    version.
    """
    query = (
        select(Dataset)
        .where(
            Dataset.source_id == source_id,
            Dataset.kind == "osm_geojson",
            Dataset.status.in_(["sidecar_staging", "importing"]),
            Dataset.sha256 == extract_hash,
        )
        .order_by(Dataset.id.desc())
    )

    def _is_current_extract(candidate: Dataset) -> bool:
        details = _metadata(candidate)
        if details.get("derived_from_dataset_id") != input_dataset_id:
            return False
        return details.get("extractor") == EXTRACTOR_VERSION

    return next(filter(_is_current_extract, session.scalars(query).all()), None)
|
|
|
|
|
|
def _dataset_ref(dataset: Dataset) -> _DatasetRef:
    """Copy the fields of a ``Dataset`` row into a ``_DatasetRef`` value.

    The ids are coerced to ``int`` and the metadata JSON is parsed into a
    dict via ``_metadata``.
    """
    fields = {
        "id": int(dataset.id),
        "source_id": int(dataset.source_id),
        "kind": dataset.kind,
        "local_path": dataset.local_path,
        "sha256": dataset.sha256,
        "status": dataset.status,
        "metadata": _metadata(dataset),
    }
    return _DatasetRef(**fields)
|
|
|
|
|
|
def _transport_filter_path_for_raw_id(source_id: int, raw_dataset_id: int, raw_path: Path) -> Path:
    """Return the output path of the transport filter for a raw dataset.

    The file lives under ``<data_dir>/derived/source_<id>/raw_dataset_<id>/``
    and is named ``transport.osm.pbf`` when the raw input is PBF, otherwise
    ``transport.osm``.
    """
    extension = ".osm.pbf" if _raw_format(raw_path) == "osm_pbf" else ".osm"
    target_dir = settings.data_dir / "derived" / f"source_{source_id}" / f"raw_dataset_{raw_dataset_id}"
    return target_dir / f"transport{extension}"
|
|
|
|
|
|
def _read_json_file(path: Path) -> dict[str, Any]:
|
|
if not path.exists():
|
|
return {}
|
|
try:
|
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
except (OSError, json.JSONDecodeError):
|
|
return {}
|
|
return data if isinstance(data, dict) else {}
|
|
|
|
|
|
def _prepare_raw_osm_dataset(session: Session, source: Source, progress_callback=None) -> Dataset:
    """Return the raw OSM dataset to process for ``source``.

    The replication-diff path is tried first. When it yields nothing, the
    full snapshot is materialized and hashed, an existing raw dataset with
    that hash is reused or a new one is committed, and the current
    replication state is recorded against it.
    """
    updated_by_diffs = _try_prepare_raw_from_diffs(session, source, progress_callback=progress_callback)
    if updated_by_diffs is not None:
        return updated_by_diffs

    _emit_progress(
        progress_callback,
        "osm_full_snapshot_started",
        f"Downloading/copying full OSM snapshot for {source.name}.",
        None,
        None,
        {"source_id": source.id},
    )
    snapshot_path = materialize_source(source)
    snapshot_hash = sha256_file(snapshot_path)
    raw_dataset = _find_raw_dataset(session, source, snapshot_hash) or _commit_raw_dataset(
        session, source, snapshot_path, snapshot_hash
    )
    _record_current_replication_state_for_snapshot(session, source, raw_dataset, progress_callback=progress_callback)
    _emit_progress(
        progress_callback,
        "osm_full_snapshot_completed",
        f"Prepared raw OSM dataset #{raw_dataset.id}.",
        None,
        None,
        {"dataset_id": raw_dataset.id},
    )
    return raw_dataset
|
|
|
|
|
|
def extract_osm_transport_geojson(input_path: Path, output_path: Path) -> dict[str, Any]:
    """Extract transport features from an OSM file into a GeoJSON file.

    The input is read twice: a scan pass collects route relations and their
    member way ids, then a geometry pass (with node locations enabled) builds
    route, infrastructure, and stop/station features. The features are
    written to ``output_path`` as one ``FeatureCollection``; parent
    directories are created as needed.

    Returns a summary dict with the input/output paths and feature counts.
    """
    scan = _TransportScanHandler()
    scan.apply_file(str(input_path))

    geometry = _TransportGeometryHandler(scan.route_relations, scan.route_way_ids)
    geometry.apply_file(str(input_path), locations=True)

    features = geometry.features()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Stream the document to disk instead of building one large string first.
    with output_path.open("w", encoding="utf-8") as handle:
        json.dump({"type": "FeatureCollection", "features": features}, handle)

    # Count by the handler's own collections. Feature properties copy raw OSM
    # tags, so a stray ``type``/``kind`` tag would otherwise skew the
    # per-category numbers. features() returns routes + infrastructure + stops.
    infra_features = len(geometry.infrastructure_features)
    stop_features = len(geometry.stop_features)
    route_features = len(features) - infra_features - stop_features
    return {
        "input_path": str(input_path),
        "output_path": str(output_path),
        "route_relations_seen": len(scan.route_relations),
        "route_relation_member_ways": len(scan.route_way_ids),
        "features": len(features),
        "route_features": route_features,
        "infrastructure_features": infra_features,
        "stop_station_features": stop_features,
        "route_relations_without_geometry": geometry.route_relations_without_geometry,
    }
|
|
|
|
|
|
def _commit_raw_dataset(session: Session, source: Source, path: Path, source_hash: str) -> Dataset:
    """Create and flush a new raw OSM dataset row for ``source``.

    Every dataset already attached to the source is marked inactive first.
    The new row has kind ``osm_pbf_raw`` and status ``committed``, and is
    itself inactive.
    """
    for existing in source.datasets:
        existing.is_active = False

    raw_metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(path),
        "source_url": source.url,
    }
    record = Dataset(
        source_id=source.id,
        kind="osm_pbf_raw",
        local_path=str(path),
        sha256=source_hash,
        is_active=False,
        status="committed",
        metadata_json=json.dumps(raw_metadata, indent=2),
    )
    session.add(record)
    session.flush()
    return record
|
|
|
|
|
|
def _try_prepare_raw_from_diffs(session: Session, source: Source, progress_callback=None) -> Dataset | None:
    """Try to bring the local raw OSM extract up to date via replication diffs.

    Returns the raw dataset to use: the existing one when it is already at
    the remote sequence, or the one produced by applying the diffs. Returns
    ``None`` whenever the caller should fall back to a full snapshot: no
    updates URL, no local replication state, missing base file, unreadable
    remote state, a sequence gap above the configured maximum, a missing host
    tool, or a failure while applying the diffs.
    """
    updates_url = _source_updates_url(source)
    if not updates_url:
        return None

    def fall_back(message: str, details: dict[str, Any], current: int | None = None, total: int | None = None) -> None:
        # Every fallback emits the same event type; the caller then returns None.
        _emit_progress(progress_callback, "osm_diff_fallback", message, current, total, details)

    current_state = _latest_diff_state(session, source.id)
    if current_state is None or current_state.raw_dataset_id is None:
        fall_back("No local OSM replication state yet; using full snapshot.", {"updates_url": updates_url})
        return None

    raw_dataset = session.get(Dataset, current_state.raw_dataset_id)
    if raw_dataset is None or not Path(raw_dataset.local_path).exists():
        fall_back("Local raw OSM base is missing; using full snapshot.", {"updates_url": updates_url})
        return None

    try:
        remote_state = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - correctness fallback
        fall_back(f"Could not read OSM replication state; using full snapshot: {exc}", {"updates_url": updates_url})
        return None

    if remote_state.sequence_number <= current_state.sequence_number:
        _emit_progress(
            progress_callback,
            "osm_diff_up_to_date",
            "Local raw OSM extract is already at the latest known replication sequence.",
            remote_state.sequence_number,
            remote_state.sequence_number,
            {"updates_url": updates_url, "sequence_number": remote_state.sequence_number},
        )
        return raw_dataset

    gap = remote_state.sequence_number - current_state.sequence_number
    if gap > settings.osm_diff_max_sequence_gap:
        fall_back(
            "OSM replication gap is too large; using full snapshot.",
            {"gap": gap, "max_gap": settings.osm_diff_max_sequence_gap, "updates_url": updates_url},
            current_state.sequence_number,
            remote_state.sequence_number,
        )
        return None

    host_tool = _host_tool_path()
    if not host_tool.exists():
        fall_back("host_tool.sh is missing; using full snapshot.", {"host_tool": str(host_tool)})
        return None

    try:
        return _apply_diff_range(
            session=session,
            source=source,
            base_dataset=raw_dataset,
            updates_url=updates_url,
            local_sequence=current_state.sequence_number,
            remote_state=remote_state,
            host_tool=host_tool,
            progress_callback=progress_callback,
        )
    except Exception as exc:  # noqa: BLE001 - fall back to full snapshot rather than risk a bad base
        fall_back(f"OSM diff application failed; using full snapshot: {exc}", {"updates_url": updates_url})
        return None
|
|
|
|
|
|
def _apply_diff_range(
    session: Session,
    source: Source,
    base_dataset: Dataset,
    updates_url: str,
    local_sequence: int,
    remote_state: ReplicationState,
    host_tool: Path,
    progress_callback=None,
) -> Dataset:
    """Apply replication diffs ``local_sequence + 1 .. remote sequence`` to a raw extract.

    Diffs are downloaded and applied in batches of the configured size. Each
    batch is applied to the output of the previous one, starting from
    ``base_dataset``. The final file is hashed and either matched to an
    existing raw dataset or committed as a new one. That dataset's metadata
    and a new diff-state row record the sequence reached.

    Returns the raw dataset that represents the updated extract.
    """
    source_root = settings.data_dir / "sources" / f"source_{source.id}"
    update_root = source_root / "updates"
    work_root = source_root / "diff_work"
    work_root.mkdir(parents=True, exist_ok=True)

    latest_path = Path(base_dataset.local_path)
    chunk_size = max(1, int(settings.osm_diff_apply_batch_size))
    pending = list(range(local_sequence + 1, remote_state.sequence_number + 1))
    applied_sequences: list[int] = []

    _emit_progress(
        progress_callback,
        "osm_diff_started",
        f"Applying {len(pending)} OSM replication diffs.",
        local_sequence,
        remote_state.sequence_number,
        {"updates_url": updates_url, "from_sequence": local_sequence + 1, "to_sequence": remote_state.sequence_number},
    )

    for offset in range(0, len(pending), chunk_size):
        batch = pending[offset : offset + chunk_size]
        first_in_batch, last_in_batch = batch[0], batch[-1]

        downloaded = []
        for sequence in batch:
            diff_path = download_diff(updates_url, sequence, update_root)
            downloaded.append(diff_path)
            _emit_progress(
                progress_callback,
                "osm_diff_downloaded",
                f"Downloaded OSM diff sequence {sequence}.",
                sequence,
                remote_state.sequence_number,
                {"path": str(diff_path), "sequence_number": sequence},
            )

        temp_output = work_root / f"source_{source.id}_{first_in_batch}_{last_in_batch}.tmp.osm.pbf"
        completed = apply_osm_changes(latest_path, downloaded, temp_output, host_tool)
        # The stored result becomes the base for the next batch.
        latest_path = _store_updated_raw_pbf(source, temp_output)
        applied_sequences.extend(batch)
        _emit_progress(
            progress_callback,
            "osm_diff_applied",
            f"Applied OSM diff sequences {first_in_batch}-{last_in_batch}.",
            last_in_batch,
            remote_state.sequence_number,
            {
                "output_path": str(latest_path),
                "stdout": completed.stdout.strip(),
                "stderr": completed.stderr.strip(),
                "batch_start": first_in_batch,
                "batch_end": last_in_batch,
            },
        )

    final_hash = sha256_file(latest_path)
    dataset = _find_raw_dataset(session, source, final_hash) or _commit_raw_dataset(
        session, source, latest_path, final_hash
    )
    _update_dataset_metadata(
        dataset,
        replication_state={
            "updates_url": updates_url,
            "sequence_number": remote_state.sequence_number,
            "timestamp": remote_state.timestamp,
        },
        diff_update={
            "base_dataset_id": base_dataset.id,
            "base_sequence_number": local_sequence,
            "applied_sequences": applied_sequences,
        },
    )
    _record_diff_state(
        session,
        source=source,
        raw_dataset=dataset,
        updates_url=updates_url,
        state=remote_state,
        metadata={"base_dataset_id": base_dataset.id, "applied_sequences": applied_sequences},
    )
    return dataset
|
|
|
|
|
|
def _record_current_replication_state_for_snapshot(session: Session, source: Source, raw_dataset: Dataset, progress_callback=None) -> None:
    """Attach the current remote replication state to a full-snapshot dataset.

    Does nothing when the source has no updates URL. When the remote state
    cannot be fetched, a progress event is emitted and the snapshot is left
    without diff state. Otherwise the state is written into the dataset
    metadata and recorded as the new diff-state row.
    """
    updates_url = _source_updates_url(source)
    if not updates_url:
        return

    try:
        remote = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - full snapshot is still usable without diff state
        _emit_progress(
            progress_callback,
            "osm_diff_state_unavailable",
            f"Could not record OSM replication state: {exc}",
            None,
            None,
            {"updates_url": updates_url},
        )
        return

    state_summary = {
        "updates_url": updates_url,
        "sequence_number": remote.sequence_number,
        "timestamp": remote.timestamp,
    }
    _update_dataset_metadata(raw_dataset, replication_state=state_summary)
    _record_diff_state(
        session,
        source=source,
        raw_dataset=raw_dataset,
        updates_url=updates_url,
        state=remote,
        metadata={"source": "full_snapshot"},
    )
|
|
|
|
|
|
def _record_diff_state(
    session: Session,
    source: Source,
    raw_dataset: Dataset,
    updates_url: str,
    state: ReplicationState,
    metadata: dict[str, Any] | None = None,
) -> OsmDiffState:
    """Record ``state`` as the active replication state of ``source``.

    Existing active rows for the source are marked ``superseded``. A new
    ``active`` row linked to ``raw_dataset`` is then added and flushed. Its
    metadata JSON holds the raw remote state under ``state`` plus any extra
    ``metadata`` keys.
    """
    active_rows = session.scalars(
        select(OsmDiffState).where(OsmDiffState.source_id == source.id, OsmDiffState.status == "active")
    ).all()
    for stale in active_rows:
        stale.status = "superseded"

    payload = {"state": state.raw}
    payload.update(metadata or {})
    row = OsmDiffState(
        source_id=source.id,
        raw_dataset_id=raw_dataset.id,
        updates_url=updates_url,
        sequence_number=state.sequence_number,
        timestamp=state.timestamp,
        status="active",
        metadata_json=json.dumps(payload, separators=(",", ":")),
    )
    session.add(row)
    session.flush()
    return row
|
|
|
|
|
|
def _latest_diff_state(session: Session, source_id: int) -> OsmDiffState | None:
    """Return the active diff state with the highest sequence for a source.

    Ties on sequence number are broken by the highest row id. Returns
    ``None`` when the source has no active diff state.
    """
    query = (
        select(OsmDiffState)
        .where(OsmDiffState.source_id == source_id, OsmDiffState.status == "active")
        .order_by(OsmDiffState.sequence_number.desc(), OsmDiffState.id.desc())
        # Only the first row is consumed, so ask the database for just that one.
        .limit(1)
    )
    return session.scalar(query)
|
|
|
|
|
|
def _store_updated_raw_pbf(source: Source, temp_path: Path) -> Path:
    """Move a freshly built PBF into the source directory under a hash-based name.

    The target name is the first 16 hex characters of the file's SHA-256 plus
    ``.osm.pbf``. When a file with the same full hash already sits at that
    path, the temporary file is deleted instead of moved.

    Returns the final path of the stored file.
    """
    source_dir = settings.data_dir / "sources" / f"source_{source.id}"
    source_dir.mkdir(parents=True, exist_ok=True)

    digest = sha256_file(temp_path)
    destination = source_dir / f"{digest[:16]}.osm.pbf"
    already_stored = destination.exists() and sha256_file(destination) == digest
    if already_stored:
        temp_path.unlink(missing_ok=True)
    else:
        shutil.move(str(temp_path), str(destination))
    return destination
|
|
|
|
|
|
def _source_updates_url(source: Source) -> str | None:
|
|
notes = source.notes or ""
|
|
for part in notes.split(";"):
|
|
if "=" not in part:
|
|
continue
|
|
key, value = part.strip().split("=", 1)
|
|
if key.strip() == "updates_url" and value.strip():
|
|
return value.strip()
|
|
if source.kind == "osm_diff" and source.url:
|
|
return source.url
|
|
return None
|
|
|
|
|
|
def _host_tool_path() -> Path:
|
|
return Path(__file__).resolve().parents[2] / "scripts" / "host_tool.sh"
|
|
|
|
|
|
def _find_raw_dataset(session: Session, source: Source, raw_hash: str) -> Dataset | None:
    """Return the newest raw OSM dataset of ``source`` with the given hash.

    Only datasets of kind ``osm_pbf_raw`` are considered. Returns ``None``
    when no such dataset exists.
    """
    query = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "osm_pbf_raw",
            Dataset.sha256 == raw_hash,
        )
        .order_by(Dataset.id.desc())
        # Only the newest row is consumed, so do not fetch the rest.
        .limit(1)
    )
    return session.scalar(query)
|
|
|
|
|
|
def _prepare_transport_pbf(session: Session, source: Source, raw_dataset: Dataset, raw_path: Path) -> Dataset:
    """Return the transport-filtered dataset for ``raw_dataset``, building it if needed.

    An existing filtered dataset whose file is still on disk is reused as is.
    Otherwise the configured prefilter script is run as
    ``script <raw_path> <output_path>``, the output is hashed, and the
    dataset row is created, or refreshed when a row existed but its file was
    gone. The raw dataset's status is set to ``filtered``.

    Raises:
        FileNotFoundError: if the prefilter script does not exist.
        RuntimeError: if the script exits with a non-zero status.
    """
    cached = _find_transport_dataset(session, source, raw_dataset)
    if cached is not None and Path(cached.local_path).exists():
        return cached

    filtered_path = _transport_filter_path(source, raw_dataset, raw_path)
    filtered_path.parent.mkdir(parents=True, exist_ok=True)
    filter_script = _prefilter_script_path()
    if not filter_script.exists():
        raise FileNotFoundError(f"OSM transport filter script not found: {filter_script}")

    try:
        completed = subprocess.run(
            [str(filter_script), str(raw_path), str(filtered_path)],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        # Prefer stderr, then stdout, then the bare exit code.
        details = (exc.stderr or "").strip() or (exc.stdout or "").strip() or f"exit code {exc.returncode}"
        raise RuntimeError(f"OSM transport filter failed for {raw_path}: {details}") from exc

    filtered_hash = sha256_file(filtered_path)
    metadata_json = json.dumps(
        {
            "stage": "filtered_osm_transport_pbf",
            "raw_format": _raw_format(filtered_path),
            "derived_from_dataset_id": raw_dataset.id,
            "source_url": source.url,
            "filter": TRANSPORT_FILTER_VERSION,
            "filter_script": str(filter_script),
            "input_path": str(raw_path),
            "input_sha256": raw_dataset.sha256,
            "output_path": str(filtered_path),
            "stdout": completed.stdout.strip(),
            "stderr": completed.stderr.strip(),
        },
        indent=2,
    )

    dataset = cached
    if dataset is None:
        dataset = Dataset(
            source_id=source.id,
            kind="osm_pbf_transport",
            local_path=str(filtered_path),
            sha256=filtered_hash,
            is_active=False,
            status="filtered",
            metadata_json=metadata_json,
        )
        session.add(dataset)
    else:
        # The row survived but its file was missing; point it at the rebuilt output.
        dataset.local_path = str(filtered_path)
        dataset.sha256 = filtered_hash
        dataset.status = "filtered"
        dataset.metadata_json = metadata_json

    raw_dataset.status = "filtered"
    session.flush()
    return dataset
|
|
|
|
|
|
def _find_transport_dataset(session: Session, source: Source, raw_dataset: Dataset) -> Dataset | None:
    """Return the newest transport-filtered dataset derived from ``raw_dataset``.

    Thin ORM-object wrapper around ``_find_transport_dataset_by_raw_id``,
    which holds the single copy of the query and metadata matching.
    """
    return _find_transport_dataset_by_raw_id(session, source.id, raw_dataset.id)
|
|
|
|
|
|
def _find_existing_derived(session: Session, source: Source, input_dataset: Dataset) -> Dataset | None:
    """Return the active imported GeoJSON dataset derived from ``input_dataset``.

    Candidates are the source's ``osm_geojson`` datasets that are active with
    status ``imported``, newest id first. A candidate matches when its
    metadata names ``input_dataset`` as its origin and the current extractor
    version. Returns ``None`` when nothing matches.
    """
    query = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "osm_geojson",
            Dataset.status == "imported",
            Dataset.is_active.is_(True),
        )
        .order_by(Dataset.id.desc())
    )

    def _is_current_import(candidate: Dataset) -> bool:
        details = _metadata(candidate)
        if details.get("derived_from_dataset_id") != input_dataset.id:
            return False
        return details.get("extractor") == EXTRACTOR_VERSION

    return next(filter(_is_current_import, session.scalars(query).all()), None)
|
|
|
|
|
|
def _metadata(dataset: Dataset) -> dict[str, Any]:
|
|
try:
|
|
return json.loads(dataset.metadata_json or "{}")
|
|
except json.JSONDecodeError:
|
|
return {}
|
|
|
|
|
|
def _update_dataset_metadata(dataset: Dataset, **values: Any) -> None:
    """Merge ``values`` into the dataset's metadata JSON.

    Existing keys are overwritten by ``values``. The result is written back
    to ``metadata_json`` with two-space indentation.
    """
    merged = {**_metadata(dataset), **values}
    dataset.metadata_json = json.dumps(merged, indent=2)
|
|
|
|
|
|
def _emit_progress(progress_callback, event_type: str, message: str, progress_current=None, progress_total=None, metadata: dict[str, Any] | None = None) -> None:
|
|
if progress_callback is not None:
|
|
progress_callback(event_type, message, progress_current, progress_total, metadata)
|
|
|
|
|
|
def _should_prefilter(path: Path) -> bool:
    """Return whether ``path`` should go through the transport prefilter.

    This requires prefiltering to be enabled in the settings and the file's
    detected raw format to be one of the configured prefilter formats.
    """
    if settings.osm_pbf_prefilter_enabled:
        return _raw_format(path) in _prefilter_formats()
    return False
|
|
|
|
|
|
def _prefilter_formats() -> set[str]:
    """Return the configured prefilter formats as a set.

    The setting is read as a comma-separated string. Entries are stripped and
    empty entries are dropped.
    """
    configured = str(settings.osm_pbf_prefilter_formats or "")
    trimmed = (entry.strip() for entry in configured.split(","))
    return {entry for entry in trimmed if entry}
|
|
|
|
|
|
def _prefilter_script_path() -> Path:
    """Return the prefilter script path, resolving a relative setting against the current working directory."""
    script = settings.osm_pbf_prefilter_script
    return script if script.is_absolute() else Path.cwd() / script
|
|
|
|
|
|
def _transport_filter_path(source: Source, raw_dataset: Dataset, raw_path: Path) -> Path:
    """Return the output path of the transport filter for ``raw_dataset``.

    Thin ORM-object wrapper around ``_transport_filter_path_for_raw_id``, so
    the directory layout and suffix choice are defined in one place.
    """
    return _transport_filter_path_for_raw_id(source.id, raw_dataset.id, raw_path)
|
|
|
|
|
|
class _TransportScanHandler(osmium.SimpleHandler):
    """First-pass handler that collects transport route relations.

    After ``apply_file``:
      * ``route_relations`` maps relation id to its tags and the ordered list
        of member way ids,
      * ``route_way_ids`` holds every way id referenced by a collected relation.
    """

    def __init__(self) -> None:
        super().__init__()
        self.route_relations: dict[int, dict[str, Any]] = {}
        self.route_way_ids: set[int] = set()

    def relation(self, relation: osmium.osm.Relation) -> None:
        """Record a relation when it is a ``type=route`` relation of a known mode with way members."""
        tags = _tags_dict(relation.tags)
        is_transport_route = tags.get("type") == "route" and _route_mode(tags) is not None
        if not is_transport_route:
            return

        # Keep member order; only way members contribute geometry.
        member_way_ids = [member.ref for member in relation.members if member.type == "w"]
        if not member_way_ids:
            return

        self.route_relations[relation.id] = {"tags": tags, "way_refs": member_way_ids}
        self.route_way_ids.update(member_way_ids)
|
|
|
|
|
|
class _TransportGeometryHandler(osmium.SimpleHandler):
    """Second-pass handler that turns nodes and ways into GeoJSON features.

    It is constructed from the scan pass results (route relations and the ids
    of their member ways). After ``apply_file`` it holds:
      * ``route_way_lines``: coordinates of every route member way seen,
      * ``infrastructure_features``: LineString features for transport ways,
      * ``stop_features``: Point/LineString/Polygon features for stops and stations.

    ``features()`` assembles the route features and returns all features.
    """

    def __init__(self, route_relations: dict[int, dict[str, Any]], route_way_ids: set[int]) -> None:
        super().__init__()
        self.route_relations = route_relations
        self.route_way_ids = route_way_ids
        self.route_way_lines: dict[int, list[list[float]]] = {}
        self.infrastructure_features: list[dict[str, Any]] = []
        self.stop_features: list[dict[str, Any]] = []
        # Recomputed on every features() call.
        self.route_relations_without_geometry = 0

    def node(self, node: osmium.osm.Node) -> None:
        """Emit a Point feature for a stop/station node that has a valid location."""
        tags = _tags_dict(node.tags)
        if not _is_stop_or_station(tags):
            return
        coords = _node_coords(node)
        if coords is None:
            return
        props = {
            **tags,
            "osm_type": "node",
            "osm_id": str(node.id),
        }
        self.stop_features.append({"type": "Feature", "geometry": {"type": "Point", "coordinates": coords}, "properties": props})

    def way(self, way: osmium.osm.Way) -> None:
        """Store route member geometry and emit infrastructure and stop features for a way."""
        tags = _tags_dict(way.tags)
        is_route_member = way.id in self.route_way_ids
        infra_mode = _infrastructure_mode(tags)
        is_stop = _is_stop_or_station(tags)
        # Building coordinates walks every node of the way, so skip ways that
        # cannot contribute to any output.
        if not is_route_member and infra_mode is None and not is_stop:
            return

        coords = _way_coords(way)
        if coords is None:
            # Without usable geometry none of the outputs below can be produced.
            return

        if is_route_member:
            self.route_way_lines[way.id] = coords

        if infra_mode is not None:
            props = {
                **tags,
                "osm_type": "way",
                "osm_id": str(way.id),
                "kind": "infra",
            }
            if infra_mode:
                # An explicit ``mode`` tag on the way takes precedence.
                props.setdefault("mode", infra_mode)
            self.infrastructure_features.append(
                {"type": "Feature", "geometry": {"type": "LineString", "coordinates": coords}, "properties": props}
            )

        if is_stop:
            feature = _way_area_or_line_feature(way, tags, coords)
            if feature is not None:
                self.stop_features.append(feature)

    def features(self) -> list[dict[str, Any]]:
        """Return route features followed by infrastructure and stop features.

        Each route relation becomes one feature whose geometry is built from
        its member ways in relation order: a ``LineString`` when they join
        into a single part, otherwise a ``MultiLineString``. Relations with
        no member geometry are skipped and counted in
        ``route_relations_without_geometry``. The counter is reset on every
        call, so calling this method again does not inflate it.
        """
        self.route_relations_without_geometry = 0
        route_features = []
        for relation_id, route in self.route_relations.items():
            ordered_lines = _ordered_route_lines(route["way_refs"], self.route_way_lines)
            if not ordered_lines:
                self.route_relations_without_geometry += 1
                continue

            geometry: dict[str, Any]
            if len(ordered_lines) == 1:
                geometry = {"type": "LineString", "coordinates": ordered_lines[0]}
            else:
                geometry = {"type": "MultiLineString", "coordinates": ordered_lines}

            props = {
                **route["tags"],
                "osm_type": "relation",
                "osm_id": str(relation_id),
                "member_way_count": len(route["way_refs"]),
                "geometry_source": "ordered_route_relation_member_ways",
                "geometry_part_count": len(ordered_lines),
            }
            route_features.append({"type": "Feature", "geometry": geometry, "properties": props})
        return route_features + self.infrastructure_features + self.stop_features
|
|
|
|
|
|
def _ordered_route_lines(way_refs: list[int], route_way_lines: dict[int, list[list[float]]]) -> list[list[list[float]]]:
    """Join member way lines into as few connected parts as possible.

    Ways are processed in ``way_refs`` order. Each way's coordinates are
    copied and attached to an existing part that shares an endpoint with
    them: the most recently created part is tried first, then earlier parts
    from newest to oldest. A way that connects to nothing starts a new part.
    Ways with no stored line, or fewer than two coordinates, are skipped.
    """
    chains: list[list[list[float]]] = []
    for way_ref in way_refs:
        stored = route_way_lines.get(way_ref)
        if not stored or len(stored) < 2:
            continue
        # Copy so that joining never mutates the stored way geometry.
        segment = [list(point) for point in stored]
        # reversed(chains) yields the newest part first, then the older ones.
        joined = any(_append_connected(chain, segment) for chain in reversed(chains))
        if not joined:
            chains.append(segment)
    return chains
|
|
|
|
|
|
def _append_connected(part: list[list[float]], coords: list[list[float]]) -> bool:
    """Attach ``coords`` to ``part`` in place when they share an endpoint.

    The end of ``part`` is tried before its start, and ``coords`` is reversed
    when needed so the joined line stays continuous. The shared coordinate is
    not duplicated. Returns ``True`` when ``part`` was extended and ``False``
    when no endpoints match.
    """
    part_start, part_end = part[0], part[-1]
    line_start, line_end = coords[0], coords[-1]

    if _same_coord(part_end, line_start):
        # part ... -> coords as given
        part.extend(coords[1:])
        return True
    if _same_coord(part_end, line_end):
        # part ... -> coords reversed
        part.extend(coords[:-1][::-1])
        return True
    if _same_coord(part_start, line_end):
        # coords as given -> ... part
        part[:0] = coords[:-1]
        return True
    if _same_coord(part_start, line_start):
        # coords reversed -> ... part
        part[:0] = coords[1:][::-1]
        return True
    return False
|
|
|
|
|
|
def _same_coord(left: list[float], right: list[float]) -> bool:
|
|
return len(left) >= 2 and len(right) >= 2 and abs(left[0] - right[0]) < 1e-9 and abs(left[1] - right[1]) < 1e-9
|
|
|
|
|
|
def _tags_dict(tags: osmium.osm.TagList) -> dict[str, str]:
|
|
return {tag.k: tag.v for tag in tags}
|
|
|
|
|
|
def _route_mode(tags: dict[str, str]) -> str | None:
    """Return the transport mode named by the ``route`` tag, or ``None``.

    Only values listed in ``ROUTE_MODES`` are recognised. ``railway`` is
    reported as ``train``.
    """
    route_value = tags.get("route")
    if route_value not in ROUTE_MODES:
        return None
    if route_value == "railway":
        return "train"
    return route_value
|
|
|
|
|
|
def _is_transport_infrastructure(tags: dict[str, str]) -> bool:
    """Return whether the tags map to any infrastructure mode."""
    detected_mode = _infrastructure_mode(tags)
    return detected_mode is not None
|
|
|
|
|
|
def _infrastructure_mode(tags: dict[str, str]) -> str | None:
    """Return the transport mode of an infrastructure element, or ``None``.

    Checked in order: a ``railway`` value listed in ``RAILWAY_MODE_BY_TAG``,
    then ``route=ferry``, then any ``aerialway`` value other than
    ``station``.
    """
    railway_value = tags.get("railway")
    if railway_value in RAILWAY_MODE_BY_TAG:
        return RAILWAY_MODE_BY_TAG[railway_value]

    if tags.get("route") == "ferry":
        return "ferry"

    aerialway_value = tags.get("aerialway")
    is_aerialway_line = bool(aerialway_value) and aerialway_value != "station"
    return "aerialway" if is_aerialway_line else None
|
|
|
|
|
|
def _is_stop_or_station(tags: dict[str, str]) -> bool:
|
|
if tags.get("public_transport") in {"platform", "stop_position", "station"}:
|
|
return True
|
|
if tags.get("railway") in {"station", "halt", "tram_stop", "subway_entrance", "platform"}:
|
|
return True
|
|
if tags.get("highway") == "bus_stop":
|
|
return True
|
|
if tags.get("amenity") in {"bus_station", "ferry_terminal"}:
|
|
return True
|
|
if tags.get("aerialway") == "station":
|
|
return True
|
|
return False
|
|
|
|
|
|
def _node_coords(node: osmium.osm.Node) -> list[float] | None:
|
|
try:
|
|
if not node.location.valid():
|
|
return None
|
|
return [float(node.location.lon), float(node.location.lat)]
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _way_coords(way: osmium.osm.Way) -> list[list[float]] | None:
|
|
coords = []
|
|
try:
|
|
for node in way.nodes:
|
|
if not node.location.valid():
|
|
return None
|
|
coords.append([float(node.location.lon), float(node.location.lat)])
|
|
except Exception:
|
|
return None
|
|
return coords if len(coords) >= 2 else None
|
|
|
|
|
|
def _way_area_or_line_feature(way: osmium.osm.Way, tags: dict[str, str], coords: list[list[float]] | None) -> dict[str, Any] | None:
|
|
if coords is None:
|
|
return None
|
|
props = {
|
|
**tags,
|
|
"osm_type": "way",
|
|
"osm_id": str(way.id),
|
|
}
|
|
if len(coords) >= 4 and coords[0] == coords[-1]:
|
|
return {"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [coords]}, "properties": props}
|
|
return {"type": "Feature", "geometry": {"type": "LineString", "coordinates": coords}, "properties": props}
|
|
|
|
|
|
def _record_pipeline_stage(
    session: Session,
    *,
    stage: str,
    version: str,
    source_id: int,
    dataset: Dataset,
    inputs: dict[str, Any],
    outputs: dict[str, Any] | None,
) -> None:
    """Record a pipeline run for ``stage`` by starting it and finishing it at once.

    The dependency hash is computed from ``inputs``. A missing or empty
    ``outputs`` is recorded as an empty dict.
    """
    run = start_pipeline_run(
        session,
        stage=stage,
        version=version,
        dependency_hash_value=dependency_hash(inputs),
        source_id=source_id,
        dataset_id=dataset.id,
        inputs=inputs,
    )
    recorded_outputs = outputs or {}
    finish_pipeline_run(session, run, outputs=recorded_outputs)
|
|
|
|
|
|
def _raw_format(path: Path) -> str:
|
|
name = path.name.lower()
|
|
if name.endswith(".osm.pbf") or name.endswith(".pbf"):
|
|
return "osm_pbf"
|
|
if name.endswith(".osm") or name.endswith(".osm.xml") or name.endswith(".xml"):
|
|
return "osm_xml"
|
|
if name.endswith(".osc") or name.endswith(".osc.gz"):
|
|
return "osm_change"
|
|
return "osm"
|