# meubility-workbench/app/pipeline/osm_pbf.py

from __future__ import annotations

import json
import shutil
import subprocess
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import osmium
from sqlalchemy import select
from sqlalchemy.orm import Session

from app.config import settings
from app.db import SessionLocal
from app.db_lock import database_write_lock
from app.models import Dataset, OsmDiffState, Source
from app.osm_storage import OSM_STORAGE_MAIN, OSM_STORAGE_SIDECAR_FEATURES, effective_osm_feature_storage
from app.performance import measure_pipeline_phase
from app.pipeline.download import materialize_source
from app.pipeline.osm_geojson import import_osm_geojson, prepare_osm_geojson_storage
from app.pipeline.osm_replication import ReplicationState, apply_osm_changes, download_diff, fetch_replication_state
from app.pipeline.state import (
    STAGE_ACQUIRE_RAW,
    STAGE_BUILD_INDEXES,
    STAGE_EXTRACT_GEOMETRY,
    STAGE_FILTER_TRANSPORT,
    dependency_hash,
    finish_pipeline_run,
    start_pipeline_run,
)
from app.pipeline.utils import sha256_file

# Transport mode names recognised by this module.
# NOTE(review): the consumers are outside this section — presumably the
# extractor uses this set to decide which OSM route values to keep; confirm.
ROUTE_MODES = {
    "train",
    "railway",
    "light_rail",
    "subway",
    "tram",
    "bus",
    "trolleybus",
    "coach",
    "ferry",
    "monorail",
    "funicular",
    "aerialway",
}

# Lookup from a rail-type tag value to a mode name from ROUTE_MODES. Only
# "rail" is renamed (to "train"); every other entry maps to itself.
# NOTE(review): presumably keyed by the OSM `railway=*` tag value — confirm
# against the extractor.
RAILWAY_MODE_BY_TAG = {
    "rail": "train",
    "light_rail": "light_rail",
    "subway": "subway",
    "tram": "tram",
    "monorail": "monorail",
    "funicular": "funicular",
}

# Version identifiers stored in dataset metadata and pipeline-stage records.
# EXTRACTOR_VERSION and TRANSPORT_FILTER_VERSION are also compared against
# cached sidecar JSON files to decide whether an existing output can be reused,
# so bump them whenever the corresponding output format or logic changes.
EXTRACTOR_VERSION = "osmium_transport_geojson_v2_ordered_relation_members"
TRANSPORT_FILTER_VERSION = "osmium_transport_filter_v1"
RAW_ACQUIRE_VERSION = "osm_raw_acquire_v1"
OSM_SIDECAR_INDEX_VERSION = "osm_sidecar_indexes_v1"


@dataclass(frozen=True)
class _SourceRef:
    """Immutable, session-independent copy of the ``Source`` columns used by the staged import.

    Built while a database session is open and then passed around freely, so
    the long-running file work never has to hold on to an ORM object.
    """

    id: int  # primary key of the Source row
    name: str
    kind: str  # the staged import only accepts "osm_pbf"
    url: str
    country: str | None = None
    license: str | None = None
    notes: str | None = None


@dataclass(frozen=True)
class _DatasetRef:
    """Immutable, session-independent copy of a ``Dataset`` row.

    NOTE(review): instances are produced by ``_dataset_ref`` (defined outside
    this section); ``metadata`` is presumably the decoded ``metadata_json``
    column — confirm there.
    """

    id: int
    source_id: int
    kind: str
    local_path: str  # filesystem path of the dataset file, stored as a string
    sha256: str
    status: str
    metadata: dict[str, Any]


@dataclass(frozen=True)
class _PreparedRawFile:
    """A raw OSM file that is ready on disk but not yet registered in the database."""

    path: Path
    sha256: str
    metadata: dict[str, Any]  # serialized into the raw dataset's metadata_json
    # Replication state to record alongside the raw dataset; None when no
    # replication state is known for this file.
    replication_state: ReplicationState | None = None
    # Extra metadata handed to the diff-state record (how the file was obtained).
    diff_state_metadata: dict[str, Any] | None = None


@dataclass(frozen=True)
class _PreparedTransportFile:
    """A filtered public-transport PBF that is ready on disk but not yet registered."""

    path: Path
    sha256: str
    metadata: dict[str, Any]  # serialized into the transport dataset's metadata_json
    reused: bool  # True when an existing filtered file was reused instead of regenerated


@dataclass(frozen=True)
class _PreparedExtract:
    """An extracted transport GeoJSON file that is ready on disk but not yet registered."""

    path: Path
    sha256: str
    summary: dict[str, Any]  # summary returned by the GeoJSON extractor
    reused: bool  # True when an existing extract was reused instead of regenerated


def run_osm_pbf_source(session: Session, source: Source, progress_callback=None) -> Dataset:
    """Import an OSM PBF source inside the caller's session and return the derived dataset.

    The raw dataset is prepared first. When ``_should_prefilter`` accepts the
    raw file, a filtered transport PBF replaces it as the extraction input. If
    a derived dataset already exists for that input it is returned untouched;
    otherwise the transport GeoJSON is extracted, imported, and annotated with
    provenance metadata. This function itself only flushes the session.
    """
    raw_dataset = _prepare_raw_osm_dataset(session, source, progress_callback=progress_callback)
    raw_path = Path(raw_dataset.local_path)

    # Choose the extraction input: the filtered transport PBF or the raw file.
    if _should_prefilter(raw_path):
        extract_input = _prepare_transport_pbf(session, source, raw_dataset, raw_path)
        extract_path = Path(extract_input.local_path)
    else:
        extract_input = raw_dataset
        extract_path = raw_path

    reusable = _find_existing_derived(session, source, extract_input)
    if reusable is not None:
        return reusable

    geojson_path = (
        settings.data_dir
        / "derived"
        / f"source_{source.id}"
        / f"extract_dataset_{extract_input.id}"
        / "transport.geojson"
    )
    summary = extract_osm_transport_geojson(extract_path, geojson_path)

    # Record the extraction on the input dataset (and on the raw one if a
    # separate filtered dataset was used).
    extract_input.status = "extracted"
    _update_dataset_metadata(extract_input, extractor=EXTRACTOR_VERSION, extract_summary=summary)
    if extract_input.id != raw_dataset.id:
        raw_dataset.status = "filtered"
        _update_dataset_metadata(raw_dataset, filtered_dataset_id=extract_input.id)
    session.flush()

    derived = import_osm_geojson(session=session, source=source, path=geojson_path)
    if extract_input.id != raw_dataset.id:
        filtered_id = extract_input.id
    else:
        filtered_id = None
    provenance = {
        "stage": "derived_osm_transport_geojson",
        "derived_from_dataset_id": extract_input.id,
        "raw_dataset_id": raw_dataset.id,
        "filtered_dataset_id": filtered_id,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": summary,
    }
    merged = json.loads(derived.metadata_json or "{}")
    merged.update(provenance)
    derived.metadata_json = json.dumps(merged, indent=2)
    session.flush()
    return derived


def run_osm_pbf_source_staged(source_id: int, progress_callback=None) -> Dataset:
    """Run large OSM PBF imports with only short DB write-lock sections.

    The expensive file work is deterministic and resumable from cached files:
    raw source materialization, optional osmium transport filtering, GeoJSON
    extraction, and sidecar creation all happen outside the global SQLite write
    lock. Dataset rows are reserved/activated in short transactions.

    Args:
        source_id: Primary key of the ``Source`` row; it must be of kind
            ``osm_pbf``.
        progress_callback: Optional callback forwarded to ``_emit_progress``;
            progress is reported as steps out of 7.

    Returns:
        The activated (or reused) derived ``Dataset``, loaded in a fresh
        session that is already closed when the object is returned.

    Raises:
        ValueError: If the source does not exist or is not an ``osm_pbf`` source.
    """
    source_ref = _load_source_ref(source_id)
    _mark_source_running(source_ref.id)
    _emit_progress(progress_callback, "osm_staged_import_started", f"Preparing staged OSM import for {source_ref.name}.", 0, 7, {"source_id": source_ref.id})

    # Step 1-2: get the raw file on disk (diffs or full snapshot), then register it.
    prepared_raw = _prepare_raw_file_staged(source_ref, progress_callback=progress_callback)
    raw_dataset = _reserve_raw_dataset(source_ref, prepared_raw)
    _emit_progress(
        progress_callback,
        "osm_raw_dataset_reserved",
        f"Reserved raw OSM dataset #{raw_dataset.id}.",
        2,
        7,
        {"dataset_id": raw_dataset.id, "path": raw_dataset.local_path, "sha256": raw_dataset.sha256},
    )

    # Step 3 (optional): pre-filter the raw file; the filtered dataset then
    # becomes the extraction input instead of the raw one.
    input_dataset = raw_dataset
    input_path = Path(raw_dataset.local_path)
    filtered_dataset: _DatasetRef | None = None
    if _should_prefilter(input_path):
        prepared_transport = _prepare_transport_file_staged(source_ref, raw_dataset, input_path, progress_callback=progress_callback)
        filtered_dataset = _reserve_transport_dataset(source_ref, raw_dataset, prepared_transport)
        input_dataset = filtered_dataset
        input_path = Path(filtered_dataset.local_path)
        _emit_progress(
            progress_callback,
            "osm_transport_dataset_reserved",
            f"Reserved filtered OSM transport dataset #{filtered_dataset.id}.",
            3,
            7,
            {"dataset_id": filtered_dataset.id, "path": filtered_dataset.local_path, "sha256": filtered_dataset.sha256, "reused": prepared_transport.reused},
        )

    # Short-circuit: a derived dataset for this exact input already exists, so
    # just (re)activate it and skip extraction and storage preparation.
    existing = _existing_active_derived_ref(source_ref.id, input_dataset.id)
    if existing is not None:
        _activate_existing_derived(source_ref.id, existing.id)
        _emit_progress(progress_callback, "osm_staged_import_reused", f"Reused active OSM transport dataset #{existing.id}.", 7, 7, {"dataset_id": existing.id})
        return _load_dataset(existing.id)

    # Step 4-5: extract the transport GeoJSON and register the derived dataset.
    extract = _extract_transport_geojson_staged(source_ref, input_dataset, input_path, progress_callback=progress_callback)
    derived_dataset = _reserve_derived_dataset(
        source_ref=source_ref,
        raw_dataset=raw_dataset,
        input_dataset=input_dataset,
        filtered_dataset=filtered_dataset,
        extract=extract,
    )
    _emit_progress(
        progress_callback,
        "osm_derived_dataset_reserved",
        f"Reserved derived OSM dataset #{derived_dataset.id}.",
        5,
        7,
        {"dataset_id": derived_dataset.id, "path": derived_dataset.local_path, "sha256": derived_dataset.sha256, "extract_reused": extract.reused},
    )

    # Step 6-7: build the feature storage, then activate the import.
    sidecar_metadata = _prepare_derived_storage_staged(derived_dataset, extract, progress_callback=progress_callback)
    activated_id = _activate_staged_osm_import(
        source_ref=source_ref,
        raw_dataset=raw_dataset,
        filtered_dataset=filtered_dataset,
        input_dataset=input_dataset,
        derived_dataset=derived_dataset,
        extract=extract,
        sidecar_metadata=sidecar_metadata,
    )
    _emit_progress(progress_callback, "osm_staged_import_completed", f"Activated OSM dataset #{activated_id}.", 7, 7, {"dataset_id": activated_id})
    return _load_dataset(activated_id)


def _load_source_ref(source_id: int) -> _SourceRef:
    """Read the source row and return a detached, immutable copy of its fields.

    Raises:
        ValueError: If the row does not exist or its kind is not ``osm_pbf``.
    """
    copied_fields = ("id", "name", "kind", "url", "country", "license", "notes")
    with SessionLocal() as session:
        row = session.get(Source, source_id)
        if row is None:
            raise ValueError(f"source not found: {source_id}")
        if row.kind != "osm_pbf":
            raise ValueError(f"staged OSM import requires source kind osm_pbf, got {row.kind}")
        # Copy the values while the session is still open.
        values = {field: getattr(row, field) for field in copied_fields}
    return _SourceRef(**values)


def _load_dataset(dataset_id: int) -> Dataset:
    """Fetch a dataset row in a short-lived session and return it.

    Raises:
        ValueError: If no dataset with ``dataset_id`` exists.
    """
    with SessionLocal() as session:
        row = session.get(Dataset, dataset_id)
    if row is None:
        raise ValueError(f"dataset not found after staged import: {dataset_id}")
    return row


def _mark_source_running(source_id: int) -> None:
    """Flag the source as running, clear its last error, and stamp the run time (UTC).

    Raises:
        ValueError: If the source row does not exist.
    """
    lock_name = f"osm_staged_import:{source_id}:start"
    with database_write_lock(lock_name, timeout=30), SessionLocal() as session:
        row = session.get(Source, source_id)
        if row is None:
            raise ValueError(f"source not found: {source_id}")
        row.status = "running"
        row.last_error = None
        row.last_run_at = datetime.now(timezone.utc)
        session.commit()


def _prepare_raw_file_staged(source: _SourceRef, progress_callback=None) -> _PreparedRawFile:
    """Produce the raw OSM file for a staged import, preferring replication diffs.

    The diff route is tried first; when it yields nothing, the full snapshot
    is materialized and hashed.

    Args:
        source: Detached source description.
        progress_callback: Optional callback forwarded to ``_emit_progress``.

    Returns:
        The prepared raw file. For a full snapshot, the replication state (when
        it could be fetched) is attached so later runs can continue from it
        with diffs.
    """
    diff_raw = _try_prepare_raw_file_from_diffs_staged(source, progress_callback=progress_callback)
    if diff_raw is not None:
        return diff_raw

    _emit_progress(progress_callback, "osm_full_snapshot_started", f"Downloading/copying full OSM snapshot for {source.name}.", 1, 7, {"source_id": source.id})
    # Capture the replication state BEFORE fetching the snapshot. If it were
    # read afterwards, a diff published while the snapshot was downloading
    # would be recorded as already applied and never replayed on later runs.
    # Recording a slightly older sequence only means a diff may be applied
    # again later.
    # NOTE(review): assumes re-applying a diff already contained in the
    # snapshot is harmless for apply_osm_changes — confirm.
    replication_state = _fetch_current_replication_state_for_snapshot(source, progress_callback=progress_callback)
    with measure_pipeline_phase("osm_full_snapshot", source_id=source.id, metadata={"url": source.url}) as metric:
        raw_path = materialize_source(source)  # type: ignore[arg-type]
        raw_hash = sha256_file(raw_path)
        metric.update({"path": str(raw_path), "sha256": raw_hash, "bytes": raw_path.stat().st_size if raw_path.exists() else None})
    metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(raw_path),
        "source_url": source.url,
        "import_mode": "staged_short_lock",
    }
    if replication_state is not None:
        metadata["replication_state"] = {
            "updates_url": _source_updates_url(source),  # type: ignore[arg-type]
            "sequence_number": replication_state.sequence_number,
            "timestamp": replication_state.timestamp,
        }
    _emit_progress(progress_callback, "osm_full_snapshot_completed", "Prepared raw OSM snapshot file.", 1, 7, {"path": str(raw_path), "sha256": raw_hash})
    return _PreparedRawFile(
        path=raw_path,
        sha256=raw_hash,
        metadata=metadata,
        replication_state=replication_state,
        # Diff-state metadata is only meaningful when a state was captured.
        diff_state_metadata={"source": "full_snapshot"} if replication_state is not None else None,
    )


def _try_prepare_raw_file_from_diffs_staged(source: _SourceRef, progress_callback=None) -> _PreparedRawFile | None:
    """Try to bring the stored raw extract up to date via replication diffs.

    Returns ``None`` whenever the diff route cannot be used, so the caller
    falls back to a full snapshot: no updates URL, no recorded local state,
    missing base file, unreadable remote state, too large a sequence gap,
    missing host tool, or a failure while applying diffs. When the local
    sequence is already at or beyond the remote one, the existing base file is
    returned unchanged.
    """
    updates_url = _source_updates_url(source)  # type: ignore[arg-type]
    if not updates_url:
        return None

    def report_fallback(message: str, current, total, details: dict[str, Any]) -> None:
        """Emit the shared fallback progress event."""
        _emit_progress(progress_callback, "osm_diff_fallback", message, current, total, details)

    # Read the recorded state and base dataset in a short read-only session.
    with SessionLocal() as session:
        recorded_state = _latest_diff_state(session, source.id)
        if recorded_state is None or recorded_state.raw_dataset_id is None:
            report_fallback("No local OSM replication state yet; using full snapshot.", None, None, {"updates_url": updates_url})
            return None
        base_row = session.get(Dataset, recorded_state.raw_dataset_id)
        if base_row is None or not Path(base_row.local_path).exists():
            report_fallback("Local raw OSM base is missing; using full snapshot.", None, None, {"updates_url": updates_url})
            return None
        base_ref = _dataset_ref(base_row)
        local_sequence = recorded_state.sequence_number

    try:
        remote_state = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - correctness fallback
        report_fallback(f"Could not read OSM replication state; using full snapshot: {exc}", None, None, {"updates_url": updates_url})
        return None

    if remote_state.sequence_number <= local_sequence:
        # Nothing new upstream: hand back the existing base file as-is.
        _emit_progress(
            progress_callback,
            "osm_diff_up_to_date",
            "Local raw OSM extract is already at the latest known replication sequence.",
            remote_state.sequence_number,
            remote_state.sequence_number,
            {"updates_url": updates_url, "sequence_number": remote_state.sequence_number},
        )
        return _PreparedRawFile(
            path=Path(base_ref.local_path),
            sha256=base_ref.sha256,
            metadata=base_ref.metadata,
            replication_state=remote_state,
            diff_state_metadata={"source": "existing_raw_dataset", "raw_dataset_id": base_ref.id},
        )

    sequence_gap = remote_state.sequence_number - local_sequence
    if sequence_gap > settings.osm_diff_max_sequence_gap:
        report_fallback(
            "OSM replication gap is too large; using full snapshot.",
            local_sequence,
            remote_state.sequence_number,
            {"gap": sequence_gap, "max_gap": settings.osm_diff_max_sequence_gap, "updates_url": updates_url},
        )
        return None

    tool_path = _host_tool_path()
    if not tool_path.exists():
        report_fallback("host_tool.sh is missing; using full snapshot.", None, None, {"host_tool": str(tool_path)})
        return None

    try:
        prepared = _apply_diff_range_files_staged(
            source=source,
            base_dataset=base_ref,
            updates_url=updates_url,
            local_sequence=local_sequence,
            remote_state=remote_state,
            host_tool=tool_path,
            progress_callback=progress_callback,
        )
    except Exception as exc:  # noqa: BLE001 - fall back to full snapshot rather than risk a bad base
        report_fallback(f"OSM diff application failed; using full snapshot: {exc}", None, None, {"updates_url": updates_url})
        return None
    return prepared


def _apply_diff_range_files_staged(
    *,
    source: _SourceRef,
    base_dataset: _DatasetRef,
    updates_url: str,
    local_sequence: int,
    remote_state: ReplicationState,
    host_tool: Path,
    progress_callback=None,
) -> _PreparedRawFile:
    """Download and apply replication diffs ``local_sequence + 1 .. remote`` in batches.

    Diffs are applied in batches of ``settings.osm_diff_apply_batch_size`` (at
    least 1). Each batch's output file becomes the input of the next batch.
    The final file is hashed once, after all batches have been applied.

    Args:
        source: Detached source description.
        base_dataset: Raw dataset whose file is the starting point.
        updates_url: Replication endpoint the diffs are downloaded from.
        local_sequence: Last sequence number already contained in the base.
        remote_state: Remote replication state to catch up to.
        host_tool: Path handed to ``apply_osm_changes``.
        progress_callback: Optional callback forwarded to ``_emit_progress``.

    Returns:
        The updated raw file with metadata describing the applied sequences.
        Any exception from downloading or applying diffs propagates to the caller.
    """
    # update_root is handed to download_diff; work_root holds per-batch temp outputs.
    update_root = settings.data_dir / "sources" / f"source_{source.id}" / "updates"
    work_root = settings.data_dir / "sources" / f"source_{source.id}" / "diff_work"
    work_root.mkdir(parents=True, exist_ok=True)
    current_path = Path(base_dataset.local_path)
    batch_size = max(1, int(settings.osm_diff_apply_batch_size))
    sequences = list(range(local_sequence + 1, remote_state.sequence_number + 1))
    applied_sequences: list[int] = []
    _emit_progress(
        progress_callback,
        "osm_diff_started",
        f"Applying {len(sequences)} OSM replication diffs.",
        local_sequence,
        remote_state.sequence_number,
        {"updates_url": updates_url, "from_sequence": local_sequence + 1, "to_sequence": remote_state.sequence_number},
    )
    with measure_pipeline_phase("osm_diff_apply", source_id=source.id, metadata={"from_sequence": local_sequence + 1, "to_sequence": remote_state.sequence_number}) as metric:
        for batch_start in range(0, len(sequences), batch_size):
            batch = sequences[batch_start : batch_start + batch_size]
            diff_paths = []
            for sequence in batch:
                diff_path = download_diff(updates_url, sequence, update_root)
                diff_paths.append(diff_path)
                _emit_progress(
                    progress_callback,
                    "osm_diff_downloaded",
                    f"Downloaded OSM diff sequence {sequence}.",
                    sequence,
                    remote_state.sequence_number,
                    {"path": str(diff_path), "sequence_number": sequence},
                )
            temp_output = work_root / f"source_{source.id}_{batch[0]}_{batch[-1]}.tmp.osm.pbf"
            completed = apply_osm_changes(current_path, diff_paths, temp_output, host_tool)
            # The stored result of this batch is the base for the next one.
            current_path = _store_updated_raw_pbf(source, temp_output)  # type: ignore[arg-type]
            applied_sequences.extend(batch)
            _emit_progress(
                progress_callback,
                "osm_diff_applied",
                f"Applied OSM diff sequences {batch[0]}-{batch[-1]}.",
                batch[-1],
                remote_state.sequence_number,
                {
                    "output_path": str(current_path),
                    "stdout": completed.stdout.strip(),
                    "stderr": completed.stderr.strip(),
                    "batch_start": batch[0],
                    "batch_end": batch[-1],
                },
            )
        # Hash only the final file; intermediate batch outputs are not hashed.
        raw_hash = sha256_file(current_path)
        metric.update({"applied_sequences": applied_sequences, "path": str(current_path), "sha256": raw_hash, "bytes": current_path.stat().st_size if current_path.exists() else None})
    metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(current_path),
        "source_url": source.url,
        "import_mode": "staged_short_lock",
        "replication_state": {
            "updates_url": updates_url,
            "sequence_number": remote_state.sequence_number,
            "timestamp": remote_state.timestamp,
        },
        "diff_update": {
            "base_dataset_id": base_dataset.id,
            "base_sequence_number": local_sequence,
            "applied_sequences": applied_sequences,
        },
    }
    return _PreparedRawFile(
        path=current_path,
        sha256=raw_hash,
        metadata=metadata,
        replication_state=remote_state,
        diff_state_metadata={"base_dataset_id": base_dataset.id, "applied_sequences": applied_sequences},
    )


def _fetch_current_replication_state_for_snapshot(source: _SourceRef, progress_callback=None) -> ReplicationState | None:
    """Best-effort lookup of the remote replication state for a full snapshot.

    Returns ``None`` when the source has no updates URL or the state cannot be
    fetched; a fetch failure is reported through the progress callback rather
    than raised.
    """
    updates_url = _source_updates_url(source)  # type: ignore[arg-type]
    if updates_url:
        try:
            return fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
        except Exception as exc:  # noqa: BLE001 - full snapshot is still usable without diff state
            _emit_progress(
                progress_callback,
                "osm_diff_state_unavailable",
                f"Could not record OSM replication state: {exc}",
                None,
                None,
                {"updates_url": updates_url},
            )
    return None


def _reserve_raw_dataset(source_ref: _SourceRef, prepared: _PreparedRawFile) -> _DatasetRef:
    """Register (or refresh) the raw dataset row for a prepared raw file.

    Runs under the database write lock in one short transaction: the row is
    created or updated, the replication state is recorded when one is
    attached, the raw-acquire pipeline stage is logged, and the source is kept
    in the ``running`` state.

    Raises:
        ValueError: If the source row no longer exists.
    """
    lock_name = f"osm_staged_import:{source_ref.id}:reserve_raw"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_ref.id)
        if source is None:
            raise ValueError(f"source not found: {source_ref.id}")

        raw_row = _find_raw_dataset(session, source, prepared.sha256)
        if raw_row is not None:
            # Existing row for this hash: refresh it, keeping earlier metadata keys.
            raw_row.local_path = str(prepared.path)
            raw_row.status = "committed"
            merged_metadata = {**_metadata(raw_row), **prepared.metadata}
            raw_row.metadata_json = json.dumps(merged_metadata, indent=2)
        else:
            raw_row = Dataset(
                source_id=source.id,
                kind="osm_pbf_raw",
                local_path=str(prepared.path),
                sha256=prepared.sha256,
                is_active=False,
                status="committed",
                metadata_json=json.dumps(prepared.metadata, indent=2),
            )
            session.add(raw_row)
            # Flush so the new row has an id for the records below.
            session.flush()

        if prepared.replication_state is not None:
            recorded_url = prepared.metadata.get("replication_state", {}).get("updates_url")
            _record_diff_state(
                session,
                source=source,
                raw_dataset=raw_row,
                updates_url=str(recorded_url or _source_updates_url(source) or ""),
                state=prepared.replication_state,
                metadata=prepared.diff_state_metadata,
            )

        stage_inputs = {
            "source_url": source.url,
            "source_kind": source.kind,
            "remote": prepared.metadata.get("replication_state") or prepared.metadata.get("source_url"),
        }
        stage_outputs = {
            "path": str(prepared.path),
            "sha256": prepared.sha256,
            "raw_format": prepared.metadata.get("raw_format"),
            "diff_update": prepared.metadata.get("diff_update"),
        }
        _record_pipeline_stage(
            session,
            stage=STAGE_ACQUIRE_RAW,
            version=RAW_ACQUIRE_VERSION,
            source_id=source.id,
            dataset=raw_row,
            inputs=stage_inputs,
            outputs=stage_outputs,
        )
        source.status = "running"
        source.last_error = None
        session.commit()
        return _dataset_ref(raw_row)


def _prepare_transport_file_staged(source: _SourceRef, raw_dataset: _DatasetRef, raw_path: Path, progress_callback=None) -> _PreparedTransportFile:
    """Produce the filtered public-transport PBF for a raw dataset, reusing a cached one when valid.

    A cached output is reused only when the output file exists and its
    ``.metadata.json`` sidecar records the same input hash and filter version.
    Otherwise the external filter script is run.

    Args:
        source: Detached source description.
        raw_dataset: Raw dataset the filter output is derived from.
        raw_path: Path of the raw file passed to the filter script.
        progress_callback: Optional callback forwarded to ``_emit_progress``.

    Returns:
        The prepared transport file, with ``reused`` telling whether the
        filter actually ran.

    Raises:
        FileNotFoundError: If the filter script is missing.
        RuntimeError: If the filter script exits with a non-zero status.
    """
    output_path = _transport_filter_path_for_raw_id(source.id, raw_dataset.id, raw_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    metadata_path = output_path.with_suffix(output_path.suffix + ".metadata.json")
    existing_metadata = _read_json_file(metadata_path)
    if output_path.exists() and existing_metadata.get("input_sha256") == raw_dataset.sha256 and existing_metadata.get("filter") == TRANSPORT_FILTER_VERSION:
        filtered_hash = sha256_file(output_path)
        _emit_progress(progress_callback, "osm_transport_filter_reused", "Reusing existing filtered OSM transport extract.", 3, 7, {"path": str(output_path), "sha256": filtered_hash})
        return _PreparedTransportFile(path=output_path, sha256=filtered_hash, metadata=existing_metadata, reused=True)

    script_path = _prefilter_script_path()
    if not script_path.exists():
        raise FileNotFoundError(f"OSM transport filter script not found: {script_path}")
    # Invalidate the sidecar before regenerating. Otherwise a leftover sidecar
    # that still matches this input could vouch for a partial output file if
    # the filter fails or is interrupted, and the next run would reuse it.
    metadata_path.unlink(missing_ok=True)
    _emit_progress(progress_callback, "osm_transport_filter_started", "Filtering OSM PBF to public-transport objects.", 2, 7, {"input_path": str(raw_path), "output_path": str(output_path)})
    with measure_pipeline_phase("osm_transport_filter", source_id=source.id, dataset_id=raw_dataset.id, metadata={"input_path": str(raw_path), "output_path": str(output_path)}) as metric:
        command = [str(script_path), str(raw_path), str(output_path)]
        try:
            completed = subprocess.run(command, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as exc:
            # Surface the most informative output the script produced.
            stderr = (exc.stderr or "").strip()
            stdout = (exc.stdout or "").strip()
            details = stderr or stdout or f"exit code {exc.returncode}"
            raise RuntimeError(f"OSM transport filter failed for {raw_path}: {details}") from exc
        filtered_hash = sha256_file(output_path)
        metric.update({"sha256": filtered_hash, "bytes": output_path.stat().st_size if output_path.exists() else None})
    metadata = {
        "stage": "filtered_osm_transport_pbf",
        "raw_format": _raw_format(output_path),
        "derived_from_dataset_id": raw_dataset.id,
        "source_url": source.url,
        "filter": TRANSPORT_FILTER_VERSION,
        "filter_script": str(script_path),
        "input_path": str(raw_path),
        "input_sha256": raw_dataset.sha256,
        "output_path": str(output_path),
        "stdout": completed.stdout.strip(),
        "stderr": completed.stderr.strip(),
        "import_mode": "staged_short_lock",
    }
    # The sidecar is written last, so it only exists for a completed output.
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    _emit_progress(progress_callback, "osm_transport_filter_completed", "Filtered OSM transport extract.", 3, 7, {"path": str(output_path), "sha256": filtered_hash})
    return _PreparedTransportFile(path=output_path, sha256=filtered_hash, metadata=metadata, reused=False)


def _reserve_transport_dataset(source_ref: _SourceRef, raw_dataset: _DatasetRef, prepared: _PreparedTransportFile) -> _DatasetRef:
    """Register (or overwrite) the filtered transport dataset row for a raw dataset.

    Runs under the database write lock in one short transaction. The raw row
    is marked ``filtered`` and linked to the transport row, and the
    filter-transport pipeline stage is logged.

    Raises:
        ValueError: If the source or the raw dataset row no longer exists.
    """
    lock_name = f"osm_staged_import:{source_ref.id}:reserve_transport"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_ref.id)
        raw_row = session.get(Dataset, raw_dataset.id)
        if source is None or raw_row is None:
            raise ValueError("source or raw dataset disappeared during staged import")

        transport_row = _find_transport_dataset_by_raw_id(session, source.id, raw_dataset.id)
        if transport_row is not None:
            # An existing row is overwritten wholesale, metadata included.
            transport_row.local_path = str(prepared.path)
            transport_row.sha256 = prepared.sha256
            transport_row.status = "filtered"
            transport_row.metadata_json = json.dumps(prepared.metadata, indent=2)
        else:
            transport_row = Dataset(
                source_id=source.id,
                kind="osm_pbf_transport",
                local_path=str(prepared.path),
                sha256=prepared.sha256,
                is_active=False,
                status="filtered",
                metadata_json=json.dumps(prepared.metadata, indent=2),
            )
            session.add(transport_row)
            # Flush so the new row's id can be written into the raw metadata.
            session.flush()

        raw_row.status = "filtered"
        raw_metadata = {**_metadata(raw_row), "filtered_dataset_id": transport_row.id}
        raw_row.metadata_json = json.dumps(raw_metadata, indent=2)

        stage_inputs = {
            "raw_dataset_id": raw_dataset.id,
            "raw_sha256": raw_dataset.sha256,
            "filter_script": prepared.metadata.get("filter_script"),
        }
        stage_outputs = {"path": str(prepared.path), "sha256": prepared.sha256, "reused": prepared.reused}
        _record_pipeline_stage(
            session,
            stage=STAGE_FILTER_TRANSPORT,
            version=TRANSPORT_FILTER_VERSION,
            source_id=source.id,
            dataset=transport_row,
            inputs=stage_inputs,
            outputs=stage_outputs,
        )
        session.commit()
        return _dataset_ref(transport_row)


def _extract_transport_geojson_staged(source: _SourceRef, input_dataset: _DatasetRef, input_path: Path, progress_callback=None) -> _PreparedExtract:
    """Produce the transport GeoJSON for an input dataset, reusing a cached extract when valid.

    A cached extract is reused only when the GeoJSON exists and its
    ``.summary.json`` sidecar records the same input hash and extractor
    version and carries an ``extract_summary`` object. Anything else is
    treated as a cache miss and the extraction runs again.

    Args:
        source: Detached source description.
        input_dataset: Dataset (raw or filtered) the extract is derived from.
        input_path: Path of the PBF file to extract from.
        progress_callback: Optional callback forwarded to ``_emit_progress``.

    Returns:
        The prepared extract, with ``reused`` telling whether extraction ran.
    """
    output_dir = settings.data_dir / "derived" / f"source_{source.id}" / f"extract_dataset_{input_dataset.id}"
    output_path = output_dir / "transport.geojson"
    summary_path = output_path.with_suffix(".summary.json")
    existing_summary = _read_json_file(summary_path)
    cached_extract_summary = existing_summary.get("extract_summary")
    if (
        output_path.exists()
        and existing_summary.get("input_sha256") == input_dataset.sha256
        and existing_summary.get("extractor") == EXTRACTOR_VERSION
        # A sidecar without a usable summary is a cache miss, not a KeyError.
        and isinstance(cached_extract_summary, dict)
    ):
        output_hash = sha256_file(output_path)
        _emit_progress(progress_callback, "osm_extract_reused", "Reusing existing extracted OSM transport GeoJSON.", 4, 7, {"path": str(output_path), "sha256": output_hash})
        return _PreparedExtract(path=output_path, sha256=output_hash, summary=cached_extract_summary, reused=True)

    # Invalidate the sidecar before regenerating. Otherwise a leftover sidecar
    # that still matches this input could vouch for a partial GeoJSON if the
    # extraction fails or is interrupted, and the next run would reuse it.
    summary_path.unlink(missing_ok=True)
    _emit_progress(progress_callback, "osm_extract_started", "Extracting route, stop, and infrastructure geometry from OSM.", 4, 7, {"input_path": str(input_path), "output_path": str(output_path)})
    with measure_pipeline_phase("osm_transport_extract", source_id=source.id, dataset_id=input_dataset.id, metadata={"input_path": str(input_path), "output_path": str(output_path)}) as metric:
        extract_summary = extract_osm_transport_geojson(input_path, output_path)
        output_hash = sha256_file(output_path)
        metric.update({**extract_summary, "sha256": output_hash, "bytes": output_path.stat().st_size if output_path.exists() else None})
    summary = {
        "input_dataset_id": input_dataset.id,
        "input_sha256": input_dataset.sha256,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract_summary,
    }
    # The sidecar is written last, so it only exists for a completed extract.
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    _emit_progress(progress_callback, "osm_extract_completed", "Extracted OSM transport GeoJSON.", 4, 7, {"path": str(output_path), "sha256": output_hash, **extract_summary})
    return _PreparedExtract(path=output_path, sha256=output_hash, summary=extract_summary, reused=False)


def _existing_active_derived_ref(source_id: int, input_dataset_id: int) -> _DatasetRef | None:
    """Return a detached reference to the derived dataset for this input, if any.

    ``None`` is returned when the source does not exist or when
    ``_find_existing_derived`` finds nothing.
    """
    with SessionLocal() as session:
        source = session.get(Source, source_id)
        if source is None:
            return None
        # A transient, never-persisted Dataset carries the id into the lookup.
        probe = Dataset(id=input_dataset_id)
        found = _find_existing_derived(session, source, probe)
        return None if found is None else _dataset_ref(found)


def _activate_existing_derived(source_id: int, derived_dataset_id: int) -> None:
    """Make the given derived dataset the source's only active dataset and mark the source ``ok``.

    Does nothing when either the source or the dataset row is missing.
    """
    lock_name = f"osm_staged_import:{source_id}:reuse_existing"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_id)
        target = session.get(Dataset, derived_dataset_id)
        if source is None or target is None:
            return
        # Exactly the target stays active; every other dataset of the source is deactivated.
        for row in source.datasets:
            row.is_active = row.id == target.id
        source.status = "ok"
        source.last_error = None
        source.last_run_at = datetime.now(timezone.utc)
        session.commit()


def _reserve_derived_dataset(
    *,
    source_ref: _SourceRef,
    raw_dataset: _DatasetRef,
    input_dataset: _DatasetRef,
    filtered_dataset: _DatasetRef | None,
    extract: _PreparedExtract,
) -> _DatasetRef:
    """Register (or refresh) the derived GeoJSON dataset row in ``sidecar_staging`` state.

    Runs under the database write lock in one short transaction. The row stays
    inactive, and the extract-geometry pipeline stage is logged.

    Raises:
        ValueError: If the source row no longer exists.
    """
    filtered_id = filtered_dataset.id if filtered_dataset is not None else None
    provenance = {
        "stage": "derived_osm_transport_geojson",
        "derived_from_dataset_id": input_dataset.id,
        "raw_dataset_id": raw_dataset.id,
        "filtered_dataset_id": filtered_id,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract.summary,
        "import_mode": "staged_short_lock",
        "sidecar_status": "pending",
    }
    lock_name = f"osm_staged_import:{source_ref.id}:reserve_derived"
    with database_write_lock(lock_name, timeout=60), SessionLocal() as session:
        source = session.get(Source, source_ref.id)
        if source is None:
            raise ValueError(f"source not found: {source_ref.id}")

        derived_row = _find_staged_derived_dataset(session, source.id, input_dataset.id, extract.sha256)
        if derived_row is not None:
            # Refresh the existing row; provenance keys override older metadata.
            derived_row.local_path = str(extract.path)
            derived_row.sha256 = extract.sha256
            derived_row.status = "sidecar_staging"
            derived_row.metadata_json = json.dumps({**_metadata(derived_row), **provenance}, indent=2)
        else:
            derived_row = Dataset(
                source_id=source.id,
                kind="osm_geojson",
                local_path=str(extract.path),
                sha256=extract.sha256,
                is_active=False,
                status="sidecar_staging",
                metadata_json=json.dumps(provenance, indent=2),
            )
            session.add(derived_row)
            session.flush()

        stage_inputs = {
            "input_dataset_id": input_dataset.id,
            "input_sha256": input_dataset.sha256,
            "extractor": EXTRACTOR_VERSION,
        }
        stage_outputs = {
            "path": str(extract.path),
            "sha256": extract.sha256,
            "summary": extract.summary,
            "reused": extract.reused,
        }
        _record_pipeline_stage(
            session,
            stage=STAGE_EXTRACT_GEOMETRY,
            version=EXTRACTOR_VERSION,
            source_id=source.id,
            dataset=derived_row,
            inputs=stage_inputs,
            outputs=stage_outputs,
        )
        session.commit()
        return _dataset_ref(derived_row)


def _prepare_derived_storage_staged(derived_dataset: _DatasetRef, extract: _PreparedExtract, progress_callback=None) -> dict[str, object]:
    """Build, or reuse, the OSM feature storage for a staged derived dataset.

    Args:
        derived_dataset: Detached reference to the staged ``osm_geojson`` dataset row.
        extract: The prepared GeoJSON extract; its ``path`` is handed to the storage builder.
        progress_callback: Optional callable forwarded to ``_emit_progress``.

    Returns:
        The dataset's existing metadata when its storage is already marked ready,
        otherwise the existing metadata merged with what
        ``prepare_osm_geojson_storage`` returned plus ``sidecar_status`` and
        ``storage_status`` markers.
    """
    storage = derived_dataset.metadata.get("osm_storage")
    if isinstance(storage, dict):
        # Reuse path 1: main-table storage that was already flagged ready.
        if storage.get("mode") == OSM_STORAGE_MAIN and derived_dataset.metadata.get("storage_status") == "ready":
            _emit_progress(progress_callback, "osm_storage_reused", "Reusing existing OSM main-table storage.", 6, 7, {"dataset_id": derived_dataset.id})
            return derived_dataset.metadata
        # Reuse path 2: a sidecar file that still exists on disk and was flagged ready.
        sidecar = storage.get("sidecar_path")
        if sidecar and Path(str(sidecar)).exists() and derived_dataset.metadata.get("sidecar_status") == "ready":
            _emit_progress(progress_callback, "osm_sidecar_reused", "Reusing existing OSM feature sidecar.", 6, 7, {"dataset_id": derived_dataset.id, "sidecar_path": str(sidecar)})
            return derived_dataset.metadata

    # Nothing reusable: pick the storage mode and the matching progress labels/events.
    storage_mode = effective_osm_feature_storage()
    storage_label = "main-table OSM feature storage" if storage_mode == OSM_STORAGE_MAIN else "OSM feature sidecar"
    started_event = "osm_storage_started" if storage_mode == OSM_STORAGE_MAIN else "osm_sidecar_started"
    completed_event = "osm_storage_completed" if storage_mode == OSM_STORAGE_MAIN else "osm_sidecar_completed"
    _emit_progress(progress_callback, started_event, f"Building {storage_label}.", 5, 7, {"dataset_id": derived_dataset.id, "path": str(extract.path), "storage_mode": storage_mode})
    # A Dataset object rebuilt from the detached reference; this function never
    # adds it to the session, it is only passed to the storage builder.
    transient_dataset = Dataset(
        id=derived_dataset.id,
        source_id=derived_dataset.source_id,
        kind=derived_dataset.kind,
        local_path=derived_dataset.local_path,
        sha256=derived_dataset.sha256,
        is_active=False,
        status=derived_dataset.status,
        metadata_json=json.dumps(derived_dataset.metadata, indent=2),
    )
    # The phase is recorded as "osm_sidecar_build" for both storage modes.
    with measure_pipeline_phase("osm_sidecar_build", source_id=derived_dataset.source_id, dataset_id=derived_dataset.id, metadata={"path": str(extract.path)}) as metric:
        with SessionLocal() as session:
            sidecar_metadata = prepare_osm_geojson_storage(
                session=session,
                dataset=transient_dataset,
                path=extract.path,
                source_hash=derived_dataset.sha256,
                storage_mode=storage_mode,
            )
            session.commit()
        metric.update(sidecar_metadata)
    # Builder output wins over existing keys; sidecar_status is "not_used" unless the sidecar mode was built.
    metadata = {**derived_dataset.metadata, **sidecar_metadata, "sidecar_status": "ready" if storage_mode == OSM_STORAGE_SIDECAR_FEATURES else "not_used", "storage_status": "ready"}
    _emit_progress(progress_callback, completed_event, f"Built {storage_label}.", 6, 7, {"dataset_id": derived_dataset.id, **sidecar_metadata})
    return metadata


def _activate_staged_osm_import(
    *,
    source_ref: _SourceRef,
    raw_dataset: _DatasetRef,
    filtered_dataset: _DatasetRef | None,
    input_dataset: _DatasetRef,
    derived_dataset: _DatasetRef,
    extract: _PreparedExtract,
    sidecar_metadata: dict[str, object],
) -> int:
    """Promote a staged derived OSM dataset to the source's active dataset.

    Under the database write lock, re-loads the source and dataset rows, marks
    every dataset of the source inactive, updates the raw/filtered rows' status
    and metadata, marks the derived row ``imported`` and active, records the
    index-build pipeline stage, flags the source as ``ok`` and commits.

    Args:
        source_ref: Reference to the owning source.
        raw_dataset: Reference to the raw OSM dataset.
        filtered_dataset: Reference to the transport-filtered dataset, or ``None``.
        input_dataset: Dataset the extract was derived from (stored as ``derived_from_dataset_id``).
        derived_dataset: The staged derived dataset to activate.
        extract: Prepared extract supplying path, sha256 and summary.
        sidecar_metadata: Storage metadata merged into the derived dataset's metadata.

    Returns:
        The id of the activated derived dataset.

    Raises:
        ValueError: If the source, raw or derived row can no longer be loaded.
    """
    # NOTE(review): the trailing "sidecar_status": "ready" overrides any
    # sidecar_status carried in sidecar_metadata (for example the "not_used"
    # value _prepare_derived_storage_staged sets for main-table storage) --
    # confirm this is intended.
    metadata = {
        **sidecar_metadata,
        "stage": "derived_osm_transport_geojson",
        "derived_from_dataset_id": input_dataset.id,
        "raw_dataset_id": raw_dataset.id,
        "filtered_dataset_id": None if filtered_dataset is None else filtered_dataset.id,
        "extractor": EXTRACTOR_VERSION,
        "extract_summary": extract.summary,
        "import_mode": "staged_short_lock",
        "sidecar_status": "ready",
    }
    with database_write_lock(f"osm_staged_import:{source_ref.id}:activate", timeout=60):
        with SessionLocal() as session:
            source = session.get(Source, source_ref.id)
            raw = session.get(Dataset, raw_dataset.id)
            filtered = session.get(Dataset, filtered_dataset.id) if filtered_dataset is not None else None
            derived = session.get(Dataset, derived_dataset.id)
            # NOTE(review): a filtered row that was requested but could not be
            # loaded is not treated as an error; it is handled like "no filter".
            if source is None or raw is None or derived is None:
                raise ValueError("staged OSM activation lost source or dataset rows")
            # Deactivate everything first; only the derived row is re-activated below.
            for dataset in source.datasets:
                dataset.is_active = False
            raw.status = "filtered" if filtered is not None else "extracted"
            raw.is_active = False
            raw.metadata_json = json.dumps({**_metadata(raw), "extractor": EXTRACTOR_VERSION, "extract_summary": extract.summary}, indent=2)
            if filtered is not None:
                filtered.status = "extracted"
                filtered.is_active = False
                filtered.metadata_json = json.dumps({**_metadata(filtered), "extractor": EXTRACTOR_VERSION, "extract_summary": extract.summary}, indent=2)
            derived.status = "imported"
            derived.is_active = True
            derived.local_path = str(extract.path)
            derived.sha256 = extract.sha256
            # Replaces (does not merge with) whatever metadata the derived row had.
            derived.metadata_json = json.dumps(metadata, indent=2)
            _record_pipeline_stage(
                session,
                stage=STAGE_BUILD_INDEXES,
                version=OSM_SIDECAR_INDEX_VERSION,
                source_id=source.id,
                dataset=derived,
                inputs={
                    "dataset_id": derived.id,
                    "dataset_sha256": derived.sha256,
                    "sidecar_schema": "osm_features_v1",
                    "indexed_columns": ["kind", "mode", "route_scope", "bbox", "route_key", "ref", "identity"],
                },
                # Prefer the nested "osm_storage" dict as stage output; fall back to the whole metadata.
                outputs=sidecar_metadata.get("osm_storage") if isinstance(sidecar_metadata.get("osm_storage"), dict) else sidecar_metadata,
            )
            source.status = "ok"
            source.last_error = None
            source.last_run_at = datetime.now(timezone.utc)
            session.commit()
            return derived.id


def _find_transport_dataset_by_raw_id(session: Session, source_id: int, raw_dataset_id: int) -> Dataset | None:
    """Return the newest transport-filtered dataset derived from the given raw dataset.

    Candidates are the source's ``osm_pbf_transport`` rows, newest id first. A
    candidate matches when its metadata names ``raw_dataset_id`` as its origin
    and was produced by the current ``TRANSPORT_FILTER_VERSION``. Returns
    ``None`` when nothing matches.
    """
    statement = (
        select(Dataset)
        .where(Dataset.source_id == source_id, Dataset.kind == "osm_pbf_transport")
        .order_by(Dataset.id.desc())
    )
    for candidate in session.scalars(statement).all():
        info = _metadata(candidate)
        if info.get("derived_from_dataset_id") != raw_dataset_id:
            continue
        if info.get("filter") == TRANSPORT_FILTER_VERSION:
            return candidate
    return None


def _find_staged_derived_dataset(session: Session, source_id: int, input_dataset_id: int, extract_hash: str) -> Dataset | None:
    """Return the newest in-progress derived dataset matching this extract, if any.

    Only ``osm_geojson`` rows of the source whose status is ``sidecar_staging``
    or ``importing`` and whose sha256 equals ``extract_hash`` are considered.
    Among those, the first (newest id) whose metadata references
    ``input_dataset_id`` and the current ``EXTRACTOR_VERSION`` is returned.
    """
    statement = (
        select(Dataset)
        .where(
            Dataset.source_id == source_id,
            Dataset.kind == "osm_geojson",
            Dataset.status.in_(["sidecar_staging", "importing"]),
            Dataset.sha256 == extract_hash,
        )
        .order_by(Dataset.id.desc())
    )
    for candidate in session.scalars(statement).all():
        info = _metadata(candidate)
        same_input = info.get("derived_from_dataset_id") == input_dataset_id
        if same_input and info.get("extractor") == EXTRACTOR_VERSION:
            return candidate
    return None


def _dataset_ref(dataset: Dataset) -> _DatasetRef:
    """Snapshot an ORM ``Dataset`` row into a ``_DatasetRef`` value object.

    The ids are coerced to ``int`` and the JSON metadata is decoded through
    ``_metadata``.
    """
    fields = {
        "id": int(dataset.id),
        "source_id": int(dataset.source_id),
        "kind": dataset.kind,
        "local_path": dataset.local_path,
        "sha256": dataset.sha256,
        "status": dataset.status,
        "metadata": _metadata(dataset),
    }
    return _DatasetRef(**fields)


def _transport_filter_path_for_raw_id(source_id: int, raw_dataset_id: int, raw_path: Path) -> Path:
    """Return the output path of the transport-filtered file for a raw dataset.

    The file lives under ``<data_dir>/derived/source_<id>/raw_dataset_<id>/`` and
    is named ``transport.osm.pbf`` when the raw file's format is ``osm_pbf``,
    otherwise ``transport.osm``.
    """
    extension = ".osm.pbf" if _raw_format(raw_path) == "osm_pbf" else ".osm"
    derived_dir = settings.data_dir / "derived" / f"source_{source_id}" / f"raw_dataset_{raw_dataset_id}"
    return derived_dir / f"transport{extension}"


def _read_json_file(path: Path) -> dict[str, Any]:
    """Best-effort read of a JSON object from ``path``.

    Args:
        path: File expected to contain a UTF-8 encoded JSON object.

    Returns:
        The decoded object, or ``{}`` when the file is missing or unreadable,
        is not valid UTF-8, is not valid JSON, or holds a JSON value that is
        not an object.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file (no separate exists() check,
        # which would race with deletion). ValueError covers both
        # json.JSONDecodeError and the UnicodeDecodeError raised for bytes that
        # are not valid UTF-8.
        return {}
    return data if isinstance(data, dict) else {}


def _prepare_raw_osm_dataset(session: Session, source: Source, progress_callback=None) -> Dataset:
    """Return the raw OSM dataset to import, preferring a diff-updated base.

    First tries to bring an existing local base up to date through
    ``_try_prepare_raw_from_diffs``. When that yields nothing, the full snapshot
    is materialized and hashed, a raw dataset with that hash is reused or newly
    committed, and the current replication state is recorded for it.
    """
    from_diffs = _try_prepare_raw_from_diffs(session, source, progress_callback=progress_callback)
    if from_diffs is not None:
        return from_diffs

    _emit_progress(progress_callback, "osm_full_snapshot_started", f"Downloading/copying full OSM snapshot for {source.name}.", None, None, {"source_id": source.id})
    snapshot_path = materialize_source(source)
    snapshot_hash = sha256_file(snapshot_path)
    snapshot_dataset = _find_raw_dataset(session, source, snapshot_hash)
    if not snapshot_dataset:
        snapshot_dataset = _commit_raw_dataset(session, source, snapshot_path, snapshot_hash)
    _record_current_replication_state_for_snapshot(session, source, snapshot_dataset, progress_callback=progress_callback)
    _emit_progress(progress_callback, "osm_full_snapshot_completed", f"Prepared raw OSM dataset #{snapshot_dataset.id}.", None, None, {"dataset_id": snapshot_dataset.id})
    return snapshot_dataset


def extract_osm_transport_geojson(input_path: Path, output_path: Path) -> dict[str, Any]:
    """Extract transport features from an OSM file into a GeoJSON FeatureCollection.

    The input is read twice: a scan pass collects route relations and their
    member way ids, then a geometry pass (with node locations) builds the
    features. The collection is written to ``output_path`` (parent directories
    are created) and a summary dict with counts is returned.
    """
    source_file = str(input_path)

    scan = _TransportScanHandler()
    scan.apply_file(source_file)

    geometry = _TransportGeometryHandler(scan.route_relations, scan.route_way_ids)
    geometry.apply_file(source_file, locations=True)

    features = geometry.features()
    collection = {"type": "FeatureCollection", "features": features}
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(collection), encoding="utf-8")

    # Routes are recognised by their "type" property, infrastructure by "kind";
    # everything else is reported as a stop/station feature.
    route_count = 0
    infra_count = 0
    for feature in features:
        properties = feature["properties"]
        if properties.get("type") == "route":
            route_count += 1
        if properties.get("kind") == "infra":
            infra_count += 1
    total = len(features)

    return {
        "input_path": str(input_path),
        "output_path": str(output_path),
        "route_relations_seen": len(scan.route_relations),
        "route_relation_member_ways": len(scan.route_way_ids),
        "features": total,
        "route_features": route_count,
        "infrastructure_features": infra_count,
        "stop_station_features": total - route_count - infra_count,
        "route_relations_without_geometry": geometry.route_relations_without_geometry,
    }


def _commit_raw_dataset(session: Session, source: Source, path: Path, source_hash: str) -> Dataset:
    """Create and flush a new ``osm_pbf_raw`` dataset row for ``path``.

    All datasets currently attached to ``source`` are marked inactive first. The
    new row is itself inactive, has status ``committed`` and is flushed (not
    committed) so that it receives an id.
    """
    for existing in source.datasets:
        existing.is_active = False

    raw_metadata = {
        "stage": "raw_osm",
        "raw_format": _raw_format(path),
        "source_url": source.url,
    }
    new_dataset = Dataset(
        source_id=source.id,
        kind="osm_pbf_raw",
        local_path=str(path),
        sha256=source_hash,
        is_active=False,
        status="committed",
        metadata_json=json.dumps(raw_metadata, indent=2),
    )
    session.add(new_dataset)
    session.flush()
    return new_dataset


def _try_prepare_raw_from_diffs(session: Session, source: Source, progress_callback=None) -> Dataset | None:
    """Try to obtain an up-to-date raw dataset by applying replication diffs.

    Returns ``None`` whenever the caller should fall back to a full snapshot:
    the source has no updates URL, there is no usable local replication state
    or base file, the remote state cannot be fetched, the sequence gap exceeds
    ``settings.osm_diff_max_sequence_gap``, ``host_tool.sh`` is missing, or
    applying the diffs raised. Each fallback (except the missing updates URL)
    emits an ``osm_diff_fallback`` progress event.

    Returns the existing base dataset unchanged when the remote sequence is not
    ahead of the local one, otherwise the dataset produced by
    ``_apply_diff_range``.
    """
    updates_url = _source_updates_url(source)
    if not updates_url:
        return None

    current_state = _latest_diff_state(session, source.id)
    if current_state is None or current_state.raw_dataset_id is None:
        _emit_progress(progress_callback, "osm_diff_fallback", "No local OSM replication state yet; using full snapshot.", None, None, {"updates_url": updates_url})
        return None
    # The diff base must still exist both as a row and as a file on disk.
    raw_dataset = session.get(Dataset, current_state.raw_dataset_id)
    if raw_dataset is None or not Path(raw_dataset.local_path).exists():
        _emit_progress(progress_callback, "osm_diff_fallback", "Local raw OSM base is missing; using full snapshot.", None, None, {"updates_url": updates_url})
        return None

    try:
        remote_state = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - correctness fallback
        _emit_progress(progress_callback, "osm_diff_fallback", f"Could not read OSM replication state; using full snapshot: {exc}", None, None, {"updates_url": updates_url})
        return None

    # Nothing newer upstream: the local base is reused as-is.
    if remote_state.sequence_number <= current_state.sequence_number:
        _emit_progress(
            progress_callback,
            "osm_diff_up_to_date",
            "Local raw OSM extract is already at the latest known replication sequence.",
            remote_state.sequence_number,
            remote_state.sequence_number,
            {"updates_url": updates_url, "sequence_number": remote_state.sequence_number},
        )
        return raw_dataset

    # Too many pending diffs: a full snapshot is used instead of replaying them.
    gap = remote_state.sequence_number - current_state.sequence_number
    if gap > settings.osm_diff_max_sequence_gap:
        _emit_progress(
            progress_callback,
            "osm_diff_fallback",
            "OSM replication gap is too large; using full snapshot.",
            current_state.sequence_number,
            remote_state.sequence_number,
            {"gap": gap, "max_gap": settings.osm_diff_max_sequence_gap, "updates_url": updates_url},
        )
        return None

    host_tool = _host_tool_path()
    if not host_tool.exists():
        _emit_progress(progress_callback, "osm_diff_fallback", "host_tool.sh is missing; using full snapshot.", None, None, {"host_tool": str(host_tool)})
        return None

    try:
        return _apply_diff_range(
            session=session,
            source=source,
            base_dataset=raw_dataset,
            updates_url=updates_url,
            local_sequence=current_state.sequence_number,
            remote_state=remote_state,
            host_tool=host_tool,
            progress_callback=progress_callback,
        )
    except Exception as exc:  # noqa: BLE001 - fall back to full snapshot rather than risk a bad base
        _emit_progress(progress_callback, "osm_diff_fallback", f"OSM diff application failed; using full snapshot: {exc}", None, None, {"updates_url": updates_url})
        return None


def _apply_diff_range(
    session: Session,
    source: Source,
    base_dataset: Dataset,
    updates_url: str,
    local_sequence: int,
    remote_state: ReplicationState,
    host_tool: Path,
    progress_callback=None,
) -> Dataset:
    """Apply replication diffs ``local_sequence + 1 .. remote_state.sequence_number`` to a raw base.

    Diffs are downloaded and applied in batches of
    ``settings.osm_diff_apply_batch_size`` (at least 1). Each batch's output is
    stored under a content-hash file name and becomes the input of the next
    batch. After the last batch, a raw dataset for the final file is reused or
    committed, its metadata is annotated with the replication state and diff
    provenance, and a new diff-state row is recorded.

    Args:
        session: Open SQLAlchemy session used for dataset and diff-state rows.
        source: Source the diffs belong to.
        base_dataset: Raw dataset whose ``local_path`` is the starting file.
        updates_url: Replication base URL passed to ``download_diff``.
        local_sequence: Sequence number the base file is currently at.
        remote_state: Target replication state.
        host_tool: Path handed to ``apply_osm_changes``.
        progress_callback: Optional callable forwarded to ``_emit_progress``.

    Returns:
        The raw dataset representing the fully updated file.
    """
    update_root = settings.data_dir / "sources" / f"source_{source.id}" / "updates"
    work_root = settings.data_dir / "sources" / f"source_{source.id}" / "diff_work"
    work_root.mkdir(parents=True, exist_ok=True)
    current_path = Path(base_dataset.local_path)
    batch_size = max(1, int(settings.osm_diff_apply_batch_size))
    sequences = list(range(local_sequence + 1, remote_state.sequence_number + 1))
    applied_sequences: list[int] = []
    _emit_progress(
        progress_callback,
        "osm_diff_started",
        f"Applying {len(sequences)} OSM replication diffs.",
        local_sequence,
        remote_state.sequence_number,
        {"updates_url": updates_url, "from_sequence": local_sequence + 1, "to_sequence": remote_state.sequence_number},
    )
    for batch_start in range(0, len(sequences), batch_size):
        batch = sequences[batch_start : batch_start + batch_size]
        diff_paths = []
        for sequence in batch:
            diff_path = download_diff(updates_url, sequence, update_root)
            diff_paths.append(diff_path)
            _emit_progress(
                progress_callback,
                "osm_diff_downloaded",
                f"Downloaded OSM diff sequence {sequence}.",
                sequence,
                remote_state.sequence_number,
                {"path": str(diff_path), "sequence_number": sequence},
            )
        # Apply the whole batch onto the current file, then promote the temp
        # output to a hash-named file that feeds the next batch.
        temp_output = work_root / f"source_{source.id}_{batch[0]}_{batch[-1]}.tmp.osm.pbf"
        completed = apply_osm_changes(current_path, diff_paths, temp_output, host_tool)
        current_path = _store_updated_raw_pbf(source, temp_output)
        applied_sequences.extend(batch)
        _emit_progress(
            progress_callback,
            "osm_diff_applied",
            f"Applied OSM diff sequences {batch[0]}-{batch[-1]}.",
            batch[-1],
            remote_state.sequence_number,
            {
                "output_path": str(current_path),
                "stdout": completed.stdout.strip(),
                "stderr": completed.stderr.strip(),
                "batch_start": batch[0],
                "batch_end": batch[-1],
            },
        )
    # Only the final file gets a dataset row; it is looked up by content hash first.
    raw_hash = sha256_file(current_path)
    dataset = _find_raw_dataset(session, source, raw_hash) or _commit_raw_dataset(session, source, current_path, raw_hash)
    _update_dataset_metadata(
        dataset,
        replication_state={
            "updates_url": updates_url,
            "sequence_number": remote_state.sequence_number,
            "timestamp": remote_state.timestamp,
        },
        diff_update={
            "base_dataset_id": base_dataset.id,
            "base_sequence_number": local_sequence,
            "applied_sequences": applied_sequences,
        },
    )
    _record_diff_state(
        session,
        source=source,
        raw_dataset=dataset,
        updates_url=updates_url,
        state=remote_state,
        metadata={"base_dataset_id": base_dataset.id, "applied_sequences": applied_sequences},
    )
    return dataset


def _record_current_replication_state_for_snapshot(session: Session, source: Source, raw_dataset: Dataset, progress_callback=None) -> None:
    """Remember the upstream replication state that a full snapshot corresponds to.

    Does nothing when the source has no updates URL. A failure to fetch the
    state is reported through an ``osm_diff_state_unavailable`` progress event
    and otherwise ignored, because the snapshot is usable without it.
    """
    updates_url = _source_updates_url(source)
    if not updates_url:
        return

    try:
        fetched = fetch_replication_state(updates_url, timeout=settings.osm_diff_state_timeout_seconds)
    except Exception as exc:  # noqa: BLE001 - full snapshot is still usable without diff state
        _emit_progress(progress_callback, "osm_diff_state_unavailable", f"Could not record OSM replication state: {exc}", None, None, {"updates_url": updates_url})
        return

    state_summary = {
        "updates_url": updates_url,
        "sequence_number": fetched.sequence_number,
        "timestamp": fetched.timestamp,
    }
    _update_dataset_metadata(raw_dataset, replication_state=state_summary)
    _record_diff_state(
        session,
        source=source,
        raw_dataset=raw_dataset,
        updates_url=updates_url,
        state=fetched,
        metadata={"source": "full_snapshot"},
    )


def _record_diff_state(
    session: Session,
    source: Source,
    raw_dataset: Dataset,
    updates_url: str,
    state: ReplicationState,
    metadata: dict[str, Any] | None = None,
) -> OsmDiffState:
    """Insert a new active diff-state row, superseding the source's previous ones.

    Every existing ``active`` row of the source is switched to ``superseded``.
    The new row stores the raw replication state plus any extra ``metadata`` as
    compact JSON and is flushed before being returned.
    """
    active_rows = select(OsmDiffState).where(OsmDiffState.source_id == source.id, OsmDiffState.status == "active")
    for previous in session.scalars(active_rows).all():
        previous.status = "superseded"

    payload = {"state": state.raw, **(metadata or {})}
    new_state = OsmDiffState(
        source_id=source.id,
        raw_dataset_id=raw_dataset.id,
        updates_url=updates_url,
        sequence_number=state.sequence_number,
        timestamp=state.timestamp,
        status="active",
        metadata_json=json.dumps(payload, separators=(",", ":")),
    )
    session.add(new_state)
    session.flush()
    return new_state


def _latest_diff_state(session: Session, source_id: int) -> OsmDiffState | None:
    """Return the source's active diff state with the highest sequence number.

    Ties on the sequence number are broken by the highest row id. Returns
    ``None`` when the source has no active diff state.
    """
    newest_first = (
        select(OsmDiffState)
        .where(OsmDiffState.source_id == source_id, OsmDiffState.status == "active")
        .order_by(OsmDiffState.sequence_number.desc(), OsmDiffState.id.desc())
    )
    return session.scalar(newest_first)


def _store_updated_raw_pbf(source: Source, temp_path: Path) -> Path:
    """Move a freshly produced PBF into the source directory under a hash-based name.

    The target name is the first 16 hex characters of the file's sha256 plus
    ``.osm.pbf``. When an identical file is already stored there, the temporary
    file is deleted instead of moved. Returns the stored file's path.
    """
    storage_dir = settings.data_dir / "sources" / f"source_{source.id}"
    storage_dir.mkdir(parents=True, exist_ok=True)

    digest = sha256_file(temp_path)
    destination = storage_dir / f"{digest[:16]}.osm.pbf"

    already_stored = destination.exists() and sha256_file(destination) == digest
    if already_stored:
        temp_path.unlink(missing_ok=True)
    else:
        shutil.move(str(temp_path), str(destination))
    return destination


def _source_updates_url(source: Source) -> str | None:
    """Return the replication updates URL configured for ``source``, if any.

    The source notes are read as ``;``-separated ``key=value`` entries; the
    first non-empty ``updates_url`` entry wins. Otherwise a source of kind
    ``osm_diff`` falls back to its own URL. Returns ``None`` when neither
    applies.
    """
    for entry in (source.notes or "").split(";"):
        name, separator, raw_value = entry.strip().partition("=")
        if not separator:
            continue
        candidate = raw_value.strip()
        if name.strip() == "updates_url" and candidate:
            return candidate

    is_diff_source = source.kind == "osm_diff"
    if is_diff_source and source.url:
        return source.url
    return None


def _host_tool_path() -> Path:
    """Return the path of ``scripts/host_tool.sh``, two directory levels above this module's directory."""
    base_dir = Path(__file__).resolve().parents[2]
    return base_dir.joinpath("scripts", "host_tool.sh")


def _find_raw_dataset(session: Session, source: Source, raw_hash: str) -> Dataset | None:
    """Return the newest ``osm_pbf_raw`` dataset of ``source`` with the given sha256, or ``None``."""
    same_content = (
        Dataset.source_id == source.id,
        Dataset.kind == "osm_pbf_raw",
        Dataset.sha256 == raw_hash,
    )
    statement = select(Dataset).where(*same_content).order_by(Dataset.id.desc())
    return session.scalar(statement)


def _prepare_transport_pbf(session: Session, source: Source, raw_dataset: Dataset, raw_path: Path) -> Dataset:
    """Produce (or reuse) the transport-filtered OSM file for a raw dataset.

    An existing filtered dataset is returned untouched when its file is still
    on disk. Otherwise the configured prefilter script is run as
    ``<script> <raw_path> <output_path>``, the output is hashed, and a
    ``osm_pbf_transport`` dataset row is created or refreshed. The raw
    dataset's status is set to ``filtered`` and the session is flushed.

    Args:
        session: Open SQLAlchemy session.
        source: Source owning the datasets.
        raw_dataset: The raw dataset being filtered.
        raw_path: Path of the raw input file.

    Returns:
        The filtered dataset row.

    Raises:
        FileNotFoundError: If the prefilter script does not exist.
        RuntimeError: If the script exits with a non-zero status.
    """
    existing = _find_transport_dataset(session, source, raw_dataset)
    if existing is not None and Path(existing.local_path).exists():
        return existing

    output_path = _transport_filter_path(source, raw_dataset, raw_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    script_path = _prefilter_script_path()
    if not script_path.exists():
        raise FileNotFoundError(f"OSM transport filter script not found: {script_path}")
    command = [str(script_path), str(raw_path), str(output_path)]
    try:
        completed = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as exc:
        # Surface the most informative piece of output: stderr, then stdout, then the exit code.
        stderr = (exc.stderr or "").strip()
        stdout = (exc.stdout or "").strip()
        details = stderr or stdout or f"exit code {exc.returncode}"
        raise RuntimeError(f"OSM transport filter failed for {raw_path}: {details}") from exc
    filtered_hash = sha256_file(output_path)

    metadata = {
        "stage": "filtered_osm_transport_pbf",
        "raw_format": _raw_format(output_path),
        "derived_from_dataset_id": raw_dataset.id,
        "source_url": source.url,
        "filter": TRANSPORT_FILTER_VERSION,
        "filter_script": str(script_path),
        "input_path": str(raw_path),
        "input_sha256": raw_dataset.sha256,
        "output_path": str(output_path),
        "stdout": completed.stdout.strip(),
        "stderr": completed.stderr.strip(),
    }
    if existing is None:
        dataset = Dataset(
            source_id=source.id,
            kind="osm_pbf_transport",
            local_path=str(output_path),
            sha256=filtered_hash,
            is_active=False,
            status="filtered",
            metadata_json=json.dumps(metadata, indent=2),
        )
        session.add(dataset)
    else:
        # The row exists but its file was missing: refresh it in place.
        dataset = existing
        dataset.local_path = str(output_path)
        dataset.sha256 = filtered_hash
        dataset.status = "filtered"
        dataset.metadata_json = json.dumps(metadata, indent=2)
    raw_dataset.status = "filtered"
    session.flush()
    return dataset


def _find_transport_dataset(session: Session, source: Source, raw_dataset: Dataset) -> Dataset | None:
    """Return the newest transport-filtered dataset derived from ``raw_dataset``.

    Thin ORM-object wrapper around ``_find_transport_dataset_by_raw_id``, which
    holds the single copy of the lookup (same query, same metadata match on
    ``derived_from_dataset_id`` and ``TRANSPORT_FILTER_VERSION``).

    Args:
        session: Open SQLAlchemy session.
        source: Source whose datasets are searched.
        raw_dataset: Raw dataset the filtered dataset must be derived from.

    Returns:
        The matching dataset, or ``None`` when there is none.
    """
    return _find_transport_dataset_by_raw_id(session, source.id, raw_dataset.id)


def _find_existing_derived(session: Session, source: Source, input_dataset: Dataset) -> Dataset | None:
    """Return the active, imported GeoJSON dataset already derived from ``input_dataset``.

    Only active ``osm_geojson`` rows of the source with status ``imported`` are
    considered, newest id first. A row matches when its metadata references
    ``input_dataset`` and the current ``EXTRACTOR_VERSION``.
    """
    statement = (
        select(Dataset)
        .where(
            Dataset.source_id == source.id,
            Dataset.kind == "osm_geojson",
            Dataset.status == "imported",
            Dataset.is_active.is_(True),
        )
        .order_by(Dataset.id.desc())
    )
    for candidate in session.scalars(statement).all():
        info = _metadata(candidate)
        if info.get("derived_from_dataset_id") != input_dataset.id:
            continue
        if info.get("extractor") == EXTRACTOR_VERSION:
            return candidate
    return None


def _metadata(dataset: Dataset) -> dict[str, Any]:
    """Decode a dataset's ``metadata_json`` into a dict.

    Args:
        dataset: Object exposing a ``metadata_json`` attribute (JSON text or ``None``).

    Returns:
        The decoded JSON object. An empty dict is returned when the column is
        empty, is not valid JSON, or holds valid JSON that is not an object
        (for example ``[]`` or ``null``), so callers can always use ``.get``
        and ``**`` unpacking on the result.
    """
    try:
        decoded = json.loads(dataset.metadata_json or "{}")
    except json.JSONDecodeError:
        return {}
    return decoded if isinstance(decoded, dict) else {}


def _update_dataset_metadata(dataset: Dataset, **values: Any) -> None:
    """Merge ``values`` into the dataset's JSON metadata; given keys overwrite existing ones."""
    merged = {**_metadata(dataset), **values}
    dataset.metadata_json = json.dumps(merged, indent=2)


def _emit_progress(progress_callback, event_type: str, message: str, progress_current=None, progress_total=None, metadata: dict[str, Any] | None = None) -> None:
    """Forward a progress event to ``progress_callback``; a ``None`` callback makes this a no-op."""
    if progress_callback is None:
        return
    progress_callback(event_type, message, progress_current, progress_total, metadata)


def _should_prefilter(path: Path) -> bool:
    """Tell whether ``path`` should go through the transport prefilter.

    True only when prefiltering is enabled in settings and the file's raw
    format is one of the configured prefilter formats.
    """
    if settings.osm_pbf_prefilter_enabled:
        return _raw_format(path) in _prefilter_formats()
    return False


def _prefilter_formats() -> set[str]:
    """Return the configured prefilter formats as a set.

    The setting is read as a comma-separated string; entries are stripped and
    empty entries are dropped.
    """
    configured = str(settings.osm_pbf_prefilter_formats or "")
    formats: set[str] = set()
    for entry in configured.split(","):
        cleaned = entry.strip()
        if cleaned:
            formats.add(cleaned)
    return formats


def _prefilter_script_path() -> Path:
    """Return the configured prefilter script path; a relative path is resolved against the current working directory."""
    configured = settings.osm_pbf_prefilter_script
    return configured if configured.is_absolute() else Path.cwd() / configured


def _transport_filter_path(source: Source, raw_dataset: Dataset, raw_path: Path) -> Path:
    """Return the output path of the transport-filtered file for ``raw_dataset``.

    Thin ORM-object wrapper around ``_transport_filter_path_for_raw_id``, which
    holds the single copy of the path layout
    (``<data_dir>/derived/source_<id>/raw_dataset_<id>/transport<suffix>``).

    Args:
        source: Source owning the raw dataset.
        raw_dataset: Raw dataset being filtered.
        raw_path: Raw input file; its format decides the output suffix.

    Returns:
        The path the filtered file should be written to.
    """
    return _transport_filter_path_for_raw_id(source.id, raw_dataset.id, raw_path)


class _TransportScanHandler(osmium.SimpleHandler):
    """First-pass handler that collects transport route relations and their member ways."""

    def __init__(self) -> None:
        super().__init__()
        # relation id -> {"tags": ..., "way_refs": [...]} for every accepted route relation
        self.route_relations: dict[int, dict[str, Any]] = {}
        # union of all way ids referenced by accepted route relations
        self.route_way_ids: set[int] = set()

    def relation(self, relation: osmium.osm.Relation) -> None:
        """Record a relation when it is a ``type=route`` relation of a known mode with way members."""
        tags = _tags_dict(relation.tags)
        is_route = tags.get("type") == "route"
        if not is_route or _route_mode(tags) is None:
            return

        member_way_ids = []
        for member in relation.members:
            if member.type == "w":
                member_way_ids.append(member.ref)
        if not member_way_ids:
            return

        self.route_relations[relation.id] = {"tags": tags, "way_refs": member_way_ids}
        self.route_way_ids.update(member_way_ids)


class _TransportGeometryHandler(osmium.SimpleHandler):
    """Second-pass handler that turns nodes and ways into GeoJSON features.

    It collects stop/station features from nodes and ways, infrastructure
    features from ways, and the coordinates of every way referenced by a route
    relation found in the scan pass. ``features()`` then assembles the route
    features from those way lines.
    """

    def __init__(self, route_relations: dict[int, dict[str, Any]], route_way_ids: set[int]) -> None:
        super().__init__()
        self.route_relations = route_relations
        self.route_way_ids = route_way_ids
        # way id -> coordinate list, only for ways that belong to a route relation
        self.route_way_lines: dict[int, list[list[float]]] = {}
        self.infrastructure_features: list[dict[str, Any]] = []
        self.stop_features: list[dict[str, Any]] = []
        # Number of route relations skipped by the latest features() call.
        self.route_relations_without_geometry = 0

    def node(self, node: osmium.osm.Node) -> None:
        """Emit a Point feature for a stop/station node that has a usable location."""
        tags = _tags_dict(node.tags)
        if not _is_stop_or_station(tags):
            return
        coords = _node_coords(node)
        if coords is None:
            return
        props = {
            **tags,
            "osm_type": "node",
            "osm_id": str(node.id),
        }
        self.stop_features.append({"type": "Feature", "geometry": {"type": "Point", "coordinates": coords}, "properties": props})

    def way(self, way: osmium.osm.Way) -> None:
        """Store route-member geometry and emit infrastructure / stop features for a way.

        A single way can contribute to all three outputs.
        """
        tags = _tags_dict(way.tags)
        coords = _way_coords(way)

        if coords is not None and way.id in self.route_way_ids:
            self.route_way_lines[way.id] = coords

        if coords is not None and _is_transport_infrastructure(tags):
            props = {
                **tags,
                "osm_type": "way",
                "osm_id": str(way.id),
                "kind": "infra",
            }
            mode = _infrastructure_mode(tags)
            if mode:
                # An explicit "mode" tag on the way takes precedence over the derived one.
                props.setdefault("mode", mode)
            self.infrastructure_features.append(
                {"type": "Feature", "geometry": {"type": "LineString", "coordinates": coords}, "properties": props}
            )

        if _is_stop_or_station(tags):
            feature = _way_area_or_line_feature(way, tags, coords)
            if feature is not None:
                self.stop_features.append(feature)

    def features(self) -> list[dict[str, Any]]:
        """Return all features: routes first, then infrastructure, then stops.

        Route geometry is built from the relation's member ways in member
        order; a relation with no usable member geometry is skipped and
        counted. ``route_relations_without_geometry`` is recomputed on every
        call, so calling this method repeatedly does not inflate the count.
        """
        route_features = []
        without_geometry = 0
        for relation_id, route in self.route_relations.items():
            way_refs = route["way_refs"]
            # One ordering pass both decides whether geometry exists and builds it.
            ordered_lines = _ordered_route_lines(way_refs, self.route_way_lines)
            if not ordered_lines:
                without_geometry += 1
                continue

            geometry: dict[str, Any]
            if len(ordered_lines) == 1:
                geometry = {"type": "LineString", "coordinates": ordered_lines[0]}
            else:
                geometry = {"type": "MultiLineString", "coordinates": ordered_lines}

            props = {
                **route["tags"],
                "osm_type": "relation",
                "osm_id": str(relation_id),
                "member_way_count": len(way_refs),
                "geometry_source": "ordered_route_relation_member_ways",
                "geometry_part_count": len(ordered_lines),
            }
            route_features.append({"type": "Feature", "geometry": geometry, "properties": props})
        self.route_relations_without_geometry = without_geometry
        return route_features + self.infrastructure_features + self.stop_features


def _ordered_route_lines(way_refs: list[int], route_way_lines: dict[int, list[list[float]]]) -> list[list[list[float]]]:
    """Stitch a route's member ways into as few continuous lines as possible.

    Ways are processed in member order. Each way's coordinates are copied and
    joined to an existing part when they share an end point, trying the most
    recently created part first and then older parts from newest to oldest.
    A way that connects to nothing starts a new part. Ways without stored
    geometry or with fewer than two points are ignored.
    """
    parts: list[list[list[float]]] = []
    for way_ref in way_refs:
        stored = route_way_lines.get(way_ref)
        if not stored:
            continue
        segment = [list(point) for point in stored]
        if len(segment) < 2:
            continue
        for part in reversed(parts):
            if _append_connected(part, segment):
                break
        else:
            # Nothing to connect to (or no parts yet): the segment opens a new part.
            parts.append(segment)
    return parts


def _append_connected(part: list[list[float]], coords: list[list[float]]) -> bool:
    """Join ``coords`` onto ``part`` in place when they share an end point.

    The four end-to-end combinations are tried in a fixed order: append as-is,
    append reversed, prepend as-is, prepend reversed. The shared point is not
    duplicated. Returns ``True`` when ``part`` was extended, ``False`` when the
    two lines do not touch.
    """
    first, last = coords[0], coords[-1]
    part_end = part[-1]
    if _same_coord(part_end, first):
        part.extend(coords[1:])
    elif _same_coord(part_end, last):
        part.extend(coords[-2::-1])
    elif _same_coord(part[0], last):
        part[:0] = coords[:-1]
    elif _same_coord(part[0], first):
        part[:0] = coords[:0:-1]
    else:
        return False
    return True


def _same_coord(left: list[float], right: list[float]) -> bool:
    """Tell whether two coordinates coincide: both have at least two components and the first two differ by less than 1e-9."""
    if len(left) < 2 or len(right) < 2:
        return False
    first_delta = abs(left[0] - right[0])
    second_delta = abs(left[1] - right[1])
    return first_delta < 1e-9 and second_delta < 1e-9


def _tags_dict(tags: osmium.osm.TagList) -> dict[str, str]:
    """Copy a tag list into a plain dict keyed by each tag's ``k`` with its ``v`` as value."""
    result: dict[str, str] = {}
    for tag in tags:
        result[tag.k] = tag.v
    return result


def _route_mode(tags: dict[str, str]) -> str | None:
    """Return the transport mode of a route's ``route`` tag, or ``None`` if it is not a known mode.

    The value ``railway`` is reported as ``train``; every other known value is
    returned unchanged.
    """
    route_value = tags.get("route")
    if route_value not in ROUTE_MODES:
        return None
    if route_value == "railway":
        return "train"
    return route_value


def _is_transport_infrastructure(tags: dict[str, str]) -> bool:
    """Tell whether the tags describe transport infrastructure, i.e. an infrastructure mode can be derived."""
    derived_mode = _infrastructure_mode(tags)
    return derived_mode is not None


def _infrastructure_mode(tags: dict[str, str]) -> str | None:
    """Derive the transport mode of an infrastructure element from its tags.

    Checked in order: a ``railway`` value listed in ``RAILWAY_MODE_BY_TAG``,
    then ``route=ferry``, then any ``aerialway`` value other than ``station``.
    Returns ``None`` when none of them applies.
    """
    rail_mode = RAILWAY_MODE_BY_TAG.get(tags.get("railway"))
    if rail_mode is not None:
        return rail_mode
    if tags.get("route") == "ferry":
        return "ferry"
    aerialway_value = tags.get("aerialway")
    return "aerialway" if aerialway_value and aerialway_value != "station" else None


def _is_stop_or_station(tags: dict[str, str]) -> bool:
    if tags.get("public_transport") in {"platform", "stop_position", "station"}:
        return True
    if tags.get("railway") in {"station", "halt", "tram_stop", "subway_entrance", "platform"}:
        return True
    if tags.get("highway") == "bus_stop":
        return True
    if tags.get("amenity") in {"bus_station", "ferry_terminal"}:
        return True
    if tags.get("aerialway") == "station":
        return True
    return False


def _node_coords(node: osmium.osm.Node) -> list[float] | None:
    try:
        if not node.location.valid():
            return None
        return [float(node.location.lon), float(node.location.lat)]
    except Exception:
        return None


def _way_coords(way: osmium.osm.Way) -> list[list[float]] | None:
    coords = []
    try:
        for node in way.nodes:
            if not node.location.valid():
                return None
            coords.append([float(node.location.lon), float(node.location.lat)])
    except Exception:
        return None
    return coords if len(coords) >= 2 else None


def _way_area_or_line_feature(way: osmium.osm.Way, tags: dict[str, str], coords: list[list[float]] | None) -> dict[str, Any] | None:
    if coords is None:
        return None
    props = {
        **tags,
        "osm_type": "way",
        "osm_id": str(way.id),
    }
    if len(coords) >= 4 and coords[0] == coords[-1]:
        return {"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [coords]}, "properties": props}
    return {"type": "Feature", "geometry": {"type": "LineString", "coordinates": coords}, "properties": props}


def _record_pipeline_stage(
    session: Session,
    *,
    stage: str,
    version: str,
    source_id: int,
    dataset: Dataset,
    inputs: dict[str, Any],
    outputs: dict[str, Any] | None,
) -> None:
    """Start a pipeline run for ``stage`` and finish it straight away.

    The run is opened via ``start_pipeline_run`` with a dependency hash
    computed from ``inputs`` by ``dependency_hash``, and is then passed to
    ``finish_pipeline_run``.

    Args:
        session: Session handed through to both pipeline-run helpers.
        stage: Stage identifier for the run.
        version: Version string for the stage.
        source_id: Id of the source the run belongs to.
        dataset: Dataset whose ``id`` is attached to the run.
        inputs: Stage inputs; hashed and also stored on the run.
        outputs: Stage outputs; a falsy value is recorded as an empty dict.
    """
    stage_run = start_pipeline_run(
        session,
        stage=stage,
        version=version,
        dependency_hash_value=dependency_hash(inputs),
        source_id=source_id,
        dataset_id=dataset.id,
        inputs=inputs,
    )
    recorded_outputs = outputs if outputs else {}
    finish_pipeline_run(session, stage_run, outputs=recorded_outputs)


def _raw_format(path: Path) -> str:
    name = path.name.lower()
    if name.endswith(".osm.pbf") or name.endswith(".pbf"):
        return "osm_pbf"
    if name.endswith(".osm") or name.endswith(".osm.xml") or name.endswith(".xml"):
        return "osm_xml"
    if name.endswith(".osc") or name.endswith(".osc.gz"):
        return "osm_change"
    return "osm"