# meubility-workbench/app/harmonization.py

from __future__ import annotations

from datetime import date, datetime, timezone
from typing import Any

from sqlalchemy import and_, func, select
from sqlalchemy.orm import Session, aliased

from app.data_management import dataset_row_counts
from app.models import (
    CanonicalStopLink,
    Dataset,
    GtfsCalendar,
    GtfsCalendarDate,
    GtfsRoute,
    GtfsStop,
    GtfsStopTime,
    GtfsTrip,
    RouteMatch,
    Source,
)


# Marker that starts a QA-review line inside a source's free-text notes.
# `_qa_review_payload` looks for the first notes line beginning with this
# prefix and parses the `key=value;` pairs that follow it.
GTFS_QA_NOTE_PREFIX = "[GTFS QA]"


def gtfs_harmonization_inventory(session: Session) -> dict[str, Any]:
    """Build the QA inventory for every GTFS source.

    Returns a dict with two keys:

    * ``feeds``: one inventory item per GTFS source, in the order produced
      by ``_gtfs_sources``.
    * ``summary``: aggregate counters over those feeds (number of sources,
      sources with an active dataset, total datasets, and a tally per QA
      status).
    """
    feeds = []
    for source in _gtfs_sources(session):
        feeds.append(_feed_inventory_item(session, source))

    def with_status(status: str) -> int:
        # Number of feeds whose overall QA status equals ``status``.
        return sum(1 for feed in feeds if feed["qa_status"] == status)

    with_active_dataset = [feed for feed in feeds if feed["active_dataset"] is not None]
    return {
        "summary": {
            "sources": len(feeds),
            "active_sources": len(with_active_dataset),
            "datasets": sum(len(feed["datasets"]) for feed in feeds),
            "ready": with_status("ready"),
            "needs_review": with_status("needs_review"),
            "blocked": with_status("blocked"),
        },
        "feeds": feeds,
    }


def gtfs_harmonization_feed_detail(session: Session, source_id: int) -> dict[str, Any] | None:
    """Return the inventory item for one GTFS source, extended with UI sections.

    Returns ``None`` when no source has ``source_id`` or when the source's
    kind is not ``"gtfs"``. Otherwise the result is the feed inventory item
    with an extra ``"sections"`` key built by ``_feed_sections``.
    """
    source = session.get(Source, source_id)
    is_gtfs_source = source is not None and source.kind == "gtfs"
    if not is_gtfs_source:
        return None

    feed = _feed_inventory_item(session, source)
    detail = dict(feed)
    detail["sections"] = _feed_sections(feed)
    return detail


def _gtfs_sources(session: Session) -> list[Source]:
    """Load every source whose kind is ``"gtfs"``.

    Rows are ordered by country, then priority, then name, then id.
    """
    stmt = (
        select(Source)
        .where(Source.kind == "gtfs")
        .order_by(Source.country, Source.priority, Source.name, Source.id)
    )
    return session.scalars(stmt).all()


def _feed_inventory_item(session: Session, source: Source) -> dict[str, Any]:
    """Assemble the full QA inventory entry for one GTFS source.

    The source's GTFS datasets are ordered with active datasets first, then by
    creation time and id. The first active dataset (if any) is the one that is
    validated, checked for service horizon, and checked for overlaps.

    Returns a dict with the serialized source, the active dataset (or
    ``None``), all GTFS datasets with their row counts, the active dataset's
    row counts, the validation / service / overlap / license sub-reports, the
    combined issue list, and the overall ``qa_status``.
    """
    datasets = sorted(
        [dataset for dataset in source.datasets if dataset.kind == "gtfs"],
        key=lambda item: (not item.is_active, item.created_at, item.id),
    )
    active_dataset = next((dataset for dataset in datasets if dataset.is_active), None)
    counts = dataset_row_counts(session, active_dataset.id, active_dataset.kind) if active_dataset is not None else {}

    def counts_for(dataset: Dataset) -> dict[str, Any]:
        # The active dataset's row counts were already fetched above; reuse
        # them instead of running the same count queries a second time.
        if dataset is active_dataset:
            return counts
        return dataset_row_counts(session, dataset.id, dataset.kind)

    validation = _validate_gtfs_dataset(session, source, active_dataset, counts)
    overlap = _overlap_summary(session, active_dataset)
    service = _service_horizon(session, active_dataset)
    # Issue order is part of the payload: validation, service, overlap, license.
    issues = [*validation["issues"], *service["issues"], *overlap["issues"], *_license_issues(source)]
    qa_status = _qa_status(issues, active_dataset)
    return {
        "source": _source_payload(source),
        "active_dataset": None if active_dataset is None else _dataset_payload(active_dataset, counts),
        "datasets": [_dataset_payload(dataset, counts_for(dataset)) for dataset in datasets],
        "counts": counts,
        "validation": validation,
        "service": service,
        "overlap": overlap,
        "license": _license_payload(source),
        "issues": issues,
        "qa_status": qa_status,
    }


def _source_payload(source: Source) -> dict[str, Any]:
    """Serialize a source into a plain dict.

    Most keys are copied straight from attributes of the same name.
    ``last_run_at`` is converted with ``_iso`` and ``qa_review`` is parsed
    from the source's notes by ``_qa_review_payload``.
    """
    # Split into two groups only so "last_run_at" keeps its position in the
    # resulting key order.
    leading_fields = (
        "id",
        "name",
        "country",
        "license",
        "priority",
        "mode_scope",
        "source_basis",
        "status",
        "enabled",
        "last_error",
    )
    trailing_fields = ("url", "catalog_entry_id", "notes")

    payload: dict[str, Any] = {field: getattr(source, field) for field in leading_fields}
    payload["last_run_at"] = _iso(source.last_run_at)
    payload.update((field, getattr(source, field)) for field in trailing_fields)
    payload["qa_review"] = _qa_review_payload(source.notes)
    return payload


def _dataset_payload(dataset: Dataset, counts: dict[str, Any]) -> dict[str, Any]:
    """Serialize a dataset together with its precomputed row counts.

    ``counts`` is attached unchanged under the ``"counts"`` key;
    ``created_at`` is converted with ``_iso``.
    """
    copied_fields = ("id", "kind", "is_active", "status", "sha256", "local_path")
    payload: dict[str, Any] = {field: getattr(dataset, field) for field in copied_fields}
    payload["created_at"] = _iso(dataset.created_at)
    payload["counts"] = counts
    return payload


def _validate_gtfs_dataset(session: Session, source: Source, dataset: Dataset | None, counts: dict[str, Any]) -> dict[str, Any]:
    """Run the structural GTFS checks for one dataset.

    Args:
        session: Database session used for the per-dataset count queries.
        source: Owning source. Accepted for signature symmetry; no check in
            this function reads it.
        dataset: Dataset to validate, or ``None`` when the source has no
            active dataset.
        counts: Row-count dict for ``dataset``; missing keys are treated as 0.

    Returns:
        A dict with ``status`` (derived from the issues via ``_qa_status``),
        ``items`` (metrics for display) and ``issues`` (problems found).
        Without a dataset the result is a single blocking issue and no items.
    """
    if dataset is None:
        blocker = _issue("missing_active_dataset", "bad", "No active GTFS dataset", "Import this source before harmonization.")
        return {"status": "blocked", "items": [], "issues": [blocker]}

    def rows(key: str) -> Any:
        # Row count reported for one table, defaulting to 0.
        return counts.get(key, 0)

    def tone_when(flagged: Any, tone: str) -> str:
        # ``tone`` if the condition holds, otherwise "good".
        return tone if flagged else "good"

    # (label, counts key, tone used when the table is empty)
    table_metrics = (
        ("Agencies", "agencies", "bad"),
        ("Stops", "stops", "bad"),
        ("Routes", "routes", "bad"),
        ("Trips", "trips", "bad"),
        ("Stop times", "stop_times", "bad"),
        ("Shapes", "shapes", "warn"),
    )
    items = [_metric(label, rows(key), tone_when(not rows(key), empty_tone)) for label, key, empty_tone in table_metrics]

    coords_missing = GtfsStop.lat.is_(None) | GtfsStop.lon.is_(None)
    coords_out_of_range = (GtfsStop.lat < -90) | (GtfsStop.lat > 90) | (GtfsStop.lon < -180) | (GtfsStop.lon > 180)
    missing_coords = _count(session, GtfsStop, dataset.id, coords_missing)
    invalid_coords = _count(session, GtfsStop, dataset.id, coords_out_of_range)
    routes_without_trips = _routes_without_trips(session, dataset.id)
    trips_without_stop_times = _trips_without_stop_times(session, dataset.id)
    stop_times_without_seconds = _stop_times_without_seconds(session, dataset.id)
    route_geometry_missing = _count(session, GtfsRoute, dataset.id, GtfsRoute.geometry_geojson.is_(None))
    canonical_links = _count(session, CanonicalStopLink, dataset.id, CanonicalStopLink.object_type == "gtfs_stop")

    raw_match_counts = counts.get("match_counts")
    match_counts = raw_match_counts if isinstance(raw_match_counts, dict) else {}

    # NOTE(review): "Routes without trips" is shown with tone "bad" here, but
    # the matching issue below is only severity "warn" -- confirm which one
    # is intended.
    items += [
        _metric("Stops missing coordinates", missing_coords, tone_when(missing_coords, "bad")),
        _metric("Stops with invalid coordinates", invalid_coords, tone_when(invalid_coords, "bad")),
        _metric("Routes without trips", routes_without_trips, tone_when(routes_without_trips, "bad")),
        _metric("Trips without stop_times", trips_without_stop_times, tone_when(trips_without_stop_times, "bad")),
        _metric("Stop times without parsed seconds", stop_times_without_seconds, tone_when(stop_times_without_seconds, "warn")),
        _metric("Routes without geometry", route_geometry_missing, tone_when(route_geometry_missing, "warn")),
        _metric("Canonical stop links", canonical_links, tone_when(rows("stops") and canonical_links == 0, "warn")),
        _metric("Route matches", rows("matches"), tone_when(rows("routes") and not rows("matches"), "warn")),
    ]

    required_tables = (
        ("agencies", "No agencies imported"),
        ("stops", "No stops imported"),
        ("routes", "No routes imported"),
        ("trips", "No trips imported"),
        ("stop_times", "No stop_times imported"),
    )
    # Each entry: (condition, issue id, severity, title, detail). The list
    # order is the order in which issues are reported.
    checks = [
        (counts.get("missing_sidecar"), "missing_sidecar", "bad", "GTFS sidecar is missing", "Queue a recovery import for this dataset."),
        *(
            (not rows(key), f"missing_{key}", "bad", label, "Required GTFS content is absent or failed to import.")
            for key, label in required_tables
        ),
        (missing_coords, "missing_stop_coordinates", "bad", f"{missing_coords:,} stops have no coordinates", "Stop coordinates are required for deduplication and routing access."),
        (invalid_coords, "invalid_stop_coordinates", "bad", f"{invalid_coords:,} stops have invalid coordinates", "Fix or exclude invalid stop coordinates before publication."),
        (routes_without_trips, "routes_without_trips", "warn", f"{routes_without_trips:,} routes have no trips", "These routes cannot contribute timetable service."),
        (trips_without_stop_times, "trips_without_stop_times", "bad", f"{trips_without_stop_times:,} trips have no stop_times", "These trips cannot be routed."),
        (route_geometry_missing, "route_geometry_missing", "warn", f"{route_geometry_missing:,} routes have no geometry", "Use GTFS shapes, route-layer matching, or stop-by-stop fallback."),
        (rows("routes") and not rows("shapes"), "missing_shapes", "warn", "No GTFS shapes imported", "OSM route matching or generated geometry will be needed."),
        (rows("routes") and not match_counts, "no_route_matching", "warn", "No route-match rows", "Run route matching before route-layer publication QA."),
    ]
    issues: list[dict[str, str]] = [
        _issue(issue_id, severity, title, detail)
        for triggered, issue_id, severity, title, detail in checks
        if triggered
    ]

    return {
        "status": _qa_status(issues, dataset),
        "items": items,
        "issues": issues,
    }


def _service_horizon(session: Session, dataset: Dataset | None) -> dict[str, Any]:
    """Report the first and last service day covered by a dataset's calendars.

    The horizon spans the earliest calendar start date or calendar-date entry
    to the latest calendar end date or calendar-date entry. ``days_until_end``
    is measured from today's UTC date.

    Issues raised: no usable end date (bad), end date in the past (bad), end
    date fewer than 30 days away (warn). With ``dataset`` set to ``None`` an
    empty report is returned and nothing is queried.
    """
    if dataset is None:
        empty: dict[str, Any] = dict.fromkeys(("start_date", "end_date", "days_until_end"))
        empty.update(items=[], issues=[])
        return empty

    calendar_stmt = select(func.min(GtfsCalendar.start_date), func.max(GtfsCalendar.end_date)).where(
        GtfsCalendar.dataset_id == dataset.id
    )
    calendar_first, calendar_last = session.execute(calendar_stmt).one()
    exception_stmt = select(func.min(GtfsCalendarDate.date), func.max(GtfsCalendarDate.date)).where(
        GtfsCalendarDate.dataset_id == dataset.id
    )
    exception_first, exception_last = session.execute(exception_stmt).one()

    first_day = _gtfs_date(_min_int(calendar_first, exception_first))
    last_day = _gtfs_date(_max_int(calendar_last, exception_last))
    today = datetime.now(timezone.utc).date()

    issues: list[dict[str, str]] = []
    if last_day is None:
        days_until_end = None
        end_tone = "bad"
        issues.append(_issue("service_horizon_missing", "bad", "No service calendar horizon", "calendar.txt or calendar_dates.txt is required for reliable routing."))
    else:
        days_until_end = (last_day - today).days
        if days_until_end < 0:
            end_tone = "bad"
            issues.append(_issue("service_horizon_expired", "bad", f"Service expired {abs(days_until_end):,} days ago", "Update or exclude this feed."))
        elif days_until_end < 30:
            end_tone = "warn"
            issues.append(_issue("service_horizon_short", "warn", f"Service ends in {days_until_end:,} days", "Update cadence is too close for publication confidence."))
        else:
            end_tone = "good"

    start_iso = None if first_day is None else first_day.isoformat()
    end_iso = None if last_day is None else last_day.isoformat()
    return {
        "start_date": start_iso,
        "end_date": end_iso,
        "days_until_end": days_until_end,
        "items": [
            _metric("Service starts", start_iso or "n/a", "info"),
            _metric("Service ends", end_iso or "n/a", end_tone),
        ],
        "issues": issues,
    }


def _overlap_summary(session: Session, dataset: Dataset | None) -> dict[str, Any]:
    """Summarize what this dataset shares with other active GTFS datasets.

    Two counts are reported: distinct route keys and distinct canonical stops
    that also appear in another active GTFS dataset. Each non-zero count
    produces a metric with tone "warn" and a "warn" issue. With ``dataset``
    set to ``None`` an empty report is returned.
    """
    if dataset is None:
        return dict(items=[], issues=[])

    shared_routes = _shared_route_keys(session, dataset.id)
    shared_stops = _shared_canonical_stops(session, dataset.id)

    # (count, metric label, issue id, issue title, issue detail)
    findings = (
        (
            shared_routes,
            "Shared route keys",
            "shared_route_keys",
            f"{shared_routes:,} route keys also exist in another active feed",
            "Deduplicate or rank source authority for overlapping routes.",
        ),
        (
            shared_stops,
            "Shared canonical stops",
            "shared_canonical_stops",
            f"{shared_stops:,} canonical stops are shared with another active feed",
            "This is useful linking evidence, but conflicts need review.",
        ),
    )

    items: list[dict[str, Any]] = []
    issues: list[dict[str, str]] = []
    for count, label, issue_id, title, detail in findings:
        items.append(_metric(label, count, "warn" if count else "good"))
        if count:
            issues.append(_issue(issue_id, "warn", title, detail))
    return {"items": items, "issues": issues}


def _license_payload(source: Source) -> dict[str, Any]:
    """Describe the source's license for display.

    The license counts as unknown when it is missing, blank, or contains the
    word "unknown" (case-insensitive). Unknown licenses get redistribution
    status "unknown" and tone "warn"; any other license gets
    "review_required" and tone "info".
    """
    label = (source.license or "").strip()
    is_unknown = label == "" or "unknown" in label.lower()
    if is_unknown:
        status, tone = "unknown", "warn"
    else:
        status, tone = "review_required", "info"
    return {
        "label": label or "unknown",
        "redistribution_status": status,
        "tone": tone,
    }


def _license_issues(source: Source) -> list[dict[str, str]]:
    """Return a single "warn" issue when the license status is unknown, else nothing."""
    status = _license_payload(source)["redistribution_status"]
    if status != "unknown":
        return []
    return [
        _issue(
            "license_unknown",
            "warn",
            "License/redistribution status is unknown",
            "Publication needs explicit import, derivation, redistribution, and attribution flags.",
        )
    ]


def _qa_review_payload(notes: str | None) -> dict[str, Any]:
    """Extract the QA review recorded in a source's notes.

    Only the first line that starts with ``GTFS_QA_NOTE_PREFIX`` is used. The
    rest of that line is read as ``;``-separated ``key=value`` pairs; parts
    without ``=`` are ignored. Missing or empty ``status`` / ``note`` values
    fall back to ``"unreviewed"`` / ``""``. If there are no notes or no such
    line, the review is reported as unreviewed.
    """
    unreviewed: dict[str, Any] = {"status": "unreviewed", "note": "", "updated_at": None}
    if not notes:
        return unreviewed

    qa_line = next(
        (line for line in str(notes).splitlines() if line.startswith(GTFS_QA_NOTE_PREFIX)),
        None,
    )
    if qa_line is None:
        return unreviewed

    body = qa_line[len(GTFS_QA_NOTE_PREFIX) :].strip()
    fields: dict[str, str] = {}
    for part in body.split(";"):
        key, separator, value = part.partition("=")
        if separator:
            fields[key.strip()] = value.strip()

    return {
        "status": fields.get("status") or "unreviewed",
        "note": fields.get("note") or "",
        "updated_at": fields.get("updated_at"),
    }


def _routes_without_trips(session: Session, dataset_id: int) -> int:
    """Count routes in the dataset that no trip of the same dataset references."""
    has_trip = (
        select(GtfsTrip.id)
        .where(GtfsTrip.dataset_id == dataset_id, GtfsTrip.route_id == GtfsRoute.route_id)
        .exists()
    )
    stmt = select(func.count()).select_from(GtfsRoute).where(GtfsRoute.dataset_id == dataset_id, ~has_trip)
    total = session.scalar(stmt)
    return int(total or 0)


def _trips_without_stop_times(session: Session, dataset_id: int) -> int:
    """Count trips in the dataset that have no stop_time row in the same dataset."""
    has_stop_time = (
        select(GtfsStopTime.id)
        .where(GtfsStopTime.dataset_id == dataset_id, GtfsStopTime.trip_id == GtfsTrip.trip_id)
        .exists()
    )
    stmt = select(func.count()).select_from(GtfsTrip).where(GtfsTrip.dataset_id == dataset_id, ~has_stop_time)
    total = session.scalar(stmt)
    return int(total or 0)


def _stop_times_without_seconds(session: Session, dataset_id: int) -> int:
    """Count stop_times in the dataset where both arrival and departure seconds are NULL."""
    stmt = (
        select(func.count())
        .select_from(GtfsStopTime)
        .where(
            GtfsStopTime.dataset_id == dataset_id,
            GtfsStopTime.arrival_seconds.is_(None),
            GtfsStopTime.departure_seconds.is_(None),
        )
    )
    total = session.scalar(stmt)
    return int(total or 0)


def _shared_route_keys(session: Session, dataset_id: int) -> int:
    """Count distinct route keys of this dataset that also occur elsewhere.

    A key is counted when it is non-NULL, non-empty, and carried by a route in
    a different dataset that is an active GTFS dataset.
    """
    mine = aliased(GtfsRoute)
    theirs = aliased(GtfsRoute)
    their_dataset = aliased(Dataset)

    same_key_elsewhere = and_(theirs.route_key == mine.route_key, theirs.dataset_id != mine.dataset_id)
    stmt = (
        select(func.count(func.distinct(mine.route_key)))
        .select_from(mine)
        .join(theirs, same_key_elsewhere)
        .join(their_dataset, their_dataset.id == theirs.dataset_id)
        .where(
            mine.dataset_id == dataset_id,
            mine.route_key.is_not(None),
            mine.route_key != "",
            their_dataset.kind == "gtfs",
            their_dataset.is_active.is_(True),
        )
    )
    total = session.scalar(stmt)
    return int(total or 0)


def _shared_canonical_stops(session: Session, dataset_id: int) -> int:
    """Count distinct canonical stops this dataset shares with other feeds.

    Only links of object type ``"gtfs_stop"`` are considered on both sides,
    and the other side must belong to a different dataset that is an active
    GTFS dataset.
    """
    mine = aliased(CanonicalStopLink)
    theirs = aliased(CanonicalStopLink)
    their_dataset = aliased(Dataset)

    same_stop_elsewhere = and_(
        theirs.canonical_stop_id == mine.canonical_stop_id,
        theirs.dataset_id != mine.dataset_id,
    )
    stmt = (
        select(func.count(func.distinct(mine.canonical_stop_id)))
        .select_from(mine)
        .join(theirs, same_stop_elsewhere)
        .join(their_dataset, their_dataset.id == theirs.dataset_id)
        .where(
            mine.dataset_id == dataset_id,
            mine.object_type == "gtfs_stop",
            theirs.object_type == "gtfs_stop",
            their_dataset.kind == "gtfs",
            their_dataset.is_active.is_(True),
        )
    )
    total = session.scalar(stmt)
    return int(total or 0)


def _count(session: Session, model: Any, dataset_id: int, *criteria: Any) -> int:
    """Count rows of ``model`` that belong to ``dataset_id``.

    Any extra ``criteria`` are ANDed onto the dataset filter. A NULL/empty
    scalar result is reported as 0.
    """
    filters = (model.dataset_id == dataset_id, *criteria)
    stmt = select(func.count()).select_from(model).where(*filters)
    total = session.scalar(stmt)
    return int(total or 0)


def _metric(label: str, value: Any, tone: str = "info", description: str = "") -> dict[str, Any]:
    """Bundle one display metric (label, value, tone, description) into a dict."""
    return dict(label=label, value=value, tone=tone, description=description)


def _issue(issue_id: str, severity: str, title: str, detail: str) -> dict[str, str]:
    """Bundle one QA issue (id, severity, title, detail) into a dict."""
    return dict(id=issue_id, severity=severity, title=title, detail=detail)


def _qa_status(issues: list[dict[str, str]], dataset: Dataset | None) -> str:
    """Reduce a list of issues to one overall QA status.

    "blocked" when there is no dataset or any issue has severity "bad";
    otherwise "needs_review" when any issue has severity "warn"; otherwise
    "ready".
    """
    if dataset is None:
        return "blocked"
    severities = [issue.get("severity") for issue in issues]
    if "bad" in severities:
        return "blocked"
    if "warn" in severities:
        return "needs_review"
    return "ready"


def _feed_sections(feed: dict[str, Any]) -> list[dict[str, Any]]:
    """Arrange a feed inventory item into titled sections of metrics.

    The validation, service and overlap sections reuse the metric lists
    already present on ``feed``. The license section is built here from the
    feed's license payload.
    """
    sections = [
        {"id": section_id, "title": title, "items": feed[section_id]["items"]}
        for section_id, title in (
            ("validation", "GTFS Validation"),
            ("service", "Service Horizon"),
            ("overlap", "Overlap and Deduplication"),
        )
    ]

    license_info = feed["license"]
    license_tone = license_info["tone"]
    license_items = [
        _metric("Redistribution", license_info["redistribution_status"], license_tone),
        _metric("License", license_info["label"], license_tone),
    ]
    sections.append({"id": "license", "title": "License", "items": license_items})
    return sections


def _gtfs_date(value: int | None) -> date | None:
    """Convert a GTFS ``YYYYMMDD`` value into a ``date``.

    Args:
        value: The date as an integer (or anything ``int()`` accepts, such as
            a digit string), or ``None``.

    Returns:
        The corresponding ``date``, or ``None`` when ``value`` is ``None``,
        is not numeric, is not a full eight-digit ``YYYYMMDD`` number, or
        does not name a real calendar day.
    """
    if value is None:
        return None
    try:
        number = int(value)
    except ValueError:
        return None
    # Require all eight digits. Parsing with strptime("%Y%m%d") accepts
    # one-digit months and days, so a truncated value such as 2024011 would
    # silently become 2024-01-01 instead of being rejected.
    if not 10000101 <= number <= 99991231:
        return None
    year, month_day = divmod(number, 10000)
    month, day = divmod(month_day, 100)
    try:
        return date(year, month, day)
    except ValueError:
        # Month or day out of range (e.g. 20240230, 20241301).
        return None


def _min_int(*values: int | None) -> int | None:
    """Return the smallest non-``None`` value as an ``int``, or ``None`` if there is none."""
    return min((int(value) for value in values if value is not None), default=None)


def _max_int(*values: int | None) -> int | None:
    """Return the largest non-``None`` value as an ``int``, or ``None`` if there is none."""
    return max((int(value) for value in values if value is not None), default=None)


def _iso(value: datetime | None) -> str | None:
    """Return ``value.isoformat()``, passing ``None`` through unchanged."""
    if value is None:
        return None
    return value.isoformat()