from __future__ import annotations

import json
from datetime import datetime, timezone
from typing import Any

from sqlalchemy import func, select
from sqlalchemy.orm import Session

from app.gtfs_storage import missing_sidecar_paths as missing_gtfs_sidecar_paths
from app.models import (
    CanonicalStop,
    CanonicalStopLink,
    Dataset,
    GtfsAgency,
    GtfsCalendar,
    GtfsCalendarDate,
    GtfsRoute,
    GtfsShape,
    GtfsStop,
    GtfsTrip,
    Job,
    OsmFeature,
    RouteMatch,
    RoutePattern,
    RoutePatternStop,
    Source,
    SourceCatalogEntry,
)
from app.osm_storage import missing_sidecar_paths as missing_osm_sidecar_paths
from app.pipeline.osm_addresses import ADDRESS_INDEX_VERSION
from app.pipeline.routing_layer import active_routing_dataset

# Distance (in the unit stored in CanonicalStopLink.distance_m) above which an
# OSM stop link is reported as "long-distance". Used both in the query and in
# the item description so the two cannot drift apart.
_LONG_DISTANCE_OSM_LINK_M = 150


def qa_summary(session: Session) -> dict[str, Any]:
    """Build the display-ready QA summary payload.

    Runs a series of read-only count queries against ``session`` and returns a
    dict with four keys: ``generated_at`` (UTC ISO-8601 timestamp taken after
    the queries ran), ``decision`` (static notes), ``sections`` (a list of
    ``{"id", "title", "items"}`` dicts) and ``next_actions`` (static strings).

    Only datasets flagged active are considered for the GTFS/OSM metrics.
    """
    active_gtfs_datasets = _active_datasets(session, "gtfs")
    active_osm_datasets = _active_datasets(session, "osm_geojson")
    active_gtfs_ids = [int(dataset.id) for dataset in active_gtfs_datasets]
    active_osm_ids = [int(dataset.id) for dataset in active_osm_datasets]

    # Sections are built before the timestamp is taken so that
    # ``generated_at`` marks the moment all figures were available.
    sections = [
        _source_discovery_section(session),
        _import_health_section(session, active_gtfs_datasets, active_osm_datasets),
        _gtfs_validation_section(_gtfs_validation_counts(session, active_gtfs_ids)),
        _deduplication_section(_link_quality_counts(session, active_gtfs_ids, active_osm_ids)),
        _route_quality_section(_route_quality_counts(session, active_gtfs_ids)),
        _publication_readiness_section(session),
    ]

    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "decision": {
            "deployment": "same_workbench_for_now",
            "database": "same_postgresql_database_for_now",
            "split_trigger": "Split when third-party API, accounts/billing, heavy export jobs, or independent scaling are needed.",
            "api_contract": "/api/qa/summary is intentionally display-ready but stable enough to become a harmonization-service summary endpoint.",
        },
        "sections": sections,
        "next_actions": [
            "Add review queues for each non-zero bad/warn metric.",
            "Persist source authority and redistribution policy before publishing third-party exports.",
            "Create versioned canonical snapshots and export manifests.",
        ],
    }


def _active_datasets(session: Session, kind: str) -> list[Dataset]:
    """Return the active datasets of the given ``kind``, ordered by id."""
    stmt = (
        select(Dataset)
        .where(Dataset.kind == kind, Dataset.is_active.is_(True))
        .order_by(Dataset.id)
    )
    return list(session.scalars(stmt).all())


def _section(section_id: str, title: str, items: list[dict[str, object]]) -> dict[str, Any]:
    """Wrap a list of items in the ``{"id", "title", "items"}`` section shape."""
    return {"id": section_id, "title": title, "items": items}


def _source_discovery_section(session: Session) -> dict[str, Any]:
    """Build the "Source Discovery" section (catalog vs. registered sources)."""
    source_catalog_total = _count(session, SourceCatalogEntry)
    registered_sources = _count(session, Source)
    # Distinct catalog entries referenced by at least one Source row.
    linked_catalog_entries = int(
        session.scalar(
            select(func.count(func.distinct(Source.catalog_entry_id))).where(
                Source.catalog_entry_id.is_not(None)
            )
        )
        or 0
    )
    priority_backlog = _priority_catalog_backlog(session)

    return _section(
        "source_discovery",
        "Source Discovery",
        [
            _item("Identified sources", source_catalog_total, "info", "Rows in the source catalog."),
            _item("Registered sources", registered_sources, "info", "Sources known to the importer."),
            _item(
                "Catalog entries linked",
                linked_catalog_entries,
                "good" if linked_catalog_entries else "warn",
                "Catalog rows connected to importer sources.",
            ),
            _item(
                "Priority catalog backlog",
                priority_backlog,
                "warn" if priority_backlog else "good",
                "P0/P1 catalog rows without a registered source.",
            ),
        ],
    )


def _import_health_section(
    session: Session,
    active_gtfs_datasets: list[Dataset],
    active_osm_datasets: list[Dataset],
) -> dict[str, Any]:
    """Build the "Import Health" section (datasets, jobs, failures, sidecars)."""
    failed_sources = _count(
        session,
        Source,
        (Source.last_error.is_not(None)) | Source.status.in_(["failed", "error"]),
    )
    active_jobs = _job_status_counts(session)
    running_jobs = active_jobs.get("running", 0)
    queued_jobs = active_jobs.get("queued", 0)
    # A dataset counts as "missing" when its storage helper reports any
    # missing sidecar path (truthy return value).
    missing_gtfs_sidecars = sum(1 for dataset in active_gtfs_datasets if missing_gtfs_sidecar_paths(dataset))
    missing_osm_sidecars = sum(1 for dataset in active_osm_datasets if missing_osm_sidecar_paths(dataset))

    return _section(
        "import_health",
        "Import Health",
        [
            _item(
                "Active GTFS datasets",
                len(active_gtfs_datasets),
                "good" if active_gtfs_datasets else "warn",
                "Feeds currently participating in harmonization.",
            ),
            _item(
                "Active OSM datasets",
                len(active_osm_datasets),
                "good" if active_osm_datasets else "warn",
                "Visual/spatial datasets currently active.",
            ),
            _item("Running jobs", running_jobs, "warn" if running_jobs else "info", "Currently running queued work."),
            _item("Queued jobs", queued_jobs, "info", "Outstanding queued work."),
            _item(
                "Failed sources",
                failed_sources,
                "bad" if failed_sources else "good",
                "Sources with failed status or last_error.",
            ),
            _item(
                "Missing GTFS sidecars",
                missing_gtfs_sidecars,
                "bad" if missing_gtfs_sidecars else "good",
                "Active GTFS datasets whose sidecar is unavailable.",
            ),
            _item(
                "Missing OSM sidecars",
                missing_osm_sidecars,
                "bad" if missing_osm_sidecars else "good",
                "Active OSM datasets whose sidecar is unavailable.",
            ),
        ],
    )


def _gtfs_validation_section(gtfs_counts: dict[str, object]) -> dict[str, Any]:
    """Build the "GTFS Validation" section from ``_gtfs_validation_counts`` output."""
    return _section(
        "gtfs_validation",
        "GTFS Validation",
        [
            _item("Agencies", gtfs_counts["agencies"], "info", "Imported agency.txt rows."),
            _item("Stops", gtfs_counts["stops"], "info", "Imported stops."),
            _item("Routes", gtfs_counts["routes"], "info", "Imported routes."),
            _item("Trips", gtfs_counts["trips"], "info", "Imported trips."),
            _item("Shapes", gtfs_counts["shapes"], "info", "Imported shape records."),
            _item(
                "Stops without coordinates",
                gtfs_counts["stops_without_coordinates"],
                "bad" if gtfs_counts["stops_without_coordinates"] else "good",
                "Stops that cannot be spatially linked or routed.",
            ),
            _item(
                "Routes without geometry",
                gtfs_counts["routes_without_geometry"],
                "warn" if gtfs_counts["routes_without_geometry"] else "good",
                "Routes with no stored GTFS shape geometry.",
            ),
            _item(
                "Routes without agency",
                gtfs_counts["routes_without_agency"],
                "warn" if gtfs_counts["routes_without_agency"] else "good",
                "Routes missing agency/operator references.",
            ),
            _item(
                "Calendar range",
                gtfs_counts["calendar_range"],
                "info",
                "Min/max imported service dates from calendars and exceptions.",
            ),
        ],
    )


def _deduplication_section(link_counts: dict[str, int]) -> dict[str, Any]:
    """Build the "Deduplication and Stop Links" section from ``_link_quality_counts`` output."""
    return _section(
        "deduplication",
        "Deduplication and Stop Links",
        [
            _item(
                "Canonical stops",
                link_counts["canonical_stops"],
                "info",
                "Current normalized stop/station records.",
            ),
            _item(
                "GTFS stop links",
                link_counts["gtfs_stop_links"],
                "good" if link_counts["gtfs_stop_links"] else "warn",
                "Timetable stops linked into canonical stops.",
            ),
            _item(
                "GTFS stops without canonical link",
                link_counts["gtfs_stops_without_canonical"],
                "bad" if link_counts["gtfs_stops_without_canonical"] else "good",
                "Imported active stops that still need deduplication/linking.",
            ),
            _item(
                "OSM visual stop links",
                link_counts["osm_stop_links"],
                "good" if link_counts["osm_stop_links"] else "warn",
                "OSM stop/station features linked to canonical stops.",
            ),
            _item(
                "OSM stops without canonical link",
                link_counts["osm_stops_without_canonical"],
                "warn" if link_counts["osm_stops_without_canonical"] else "good",
                "Visual stops that are not yet linked to GTFS/canonical stops.",
            ),
            _item(
                "Multi-source stop groups",
                link_counts["multi_source_stop_groups"],
                "info",
                "Canonical stops that merge GTFS stops from multiple datasets.",
            ),
            _item(
                "Long-distance OSM links",
                link_counts["long_distance_osm_links"],
                "warn" if link_counts["long_distance_osm_links"] else "good",
                f"OSM stop links over {_LONG_DISTANCE_OSM_LINK_M}m from the canonical stop.",
            ),
        ],
    )


def _route_quality_section(route_counts: dict[str, int]) -> dict[str, Any]:
    """Build the "Route Matching and Geometry" section from ``_route_quality_counts`` output."""
    return _section(
        "route_quality",
        "Route Matching and Geometry",
        [
            _item(
                "Matched/accepted routes",
                route_counts["matched_or_accepted"],
                "good" if route_counts["matched_or_accepted"] else "warn",
                "GTFS routes with accepted or automatic OSM matches.",
            ),
            _item(
                "Probable matches",
                route_counts["probable"],
                "warn" if route_counts["probable"] else "info",
                "Potential conflicts needing review.",
            ),
            _item(
                "Weak matches",
                route_counts["weak"],
                "warn" if route_counts["weak"] else "good",
                "Low-confidence route links.",
            ),
            _item(
                "Missing route matches",
                route_counts["missing"],
                "bad" if route_counts["missing"] else "good",
                "Routes with no visual match.",
            ),
            _item(
                "Unreviewed GTFS routes",
                route_counts["routes_without_match"],
                "warn" if route_counts["routes_without_match"] else "good",
                "Active GTFS routes without a RouteMatch row.",
            ),
            _item(
                "Route patterns",
                route_counts["route_patterns"],
                "info",
                "Published visual route-layer patterns.",
            ),
            _item(
                "Route patterns without stops",
                route_counts["route_patterns_without_stops"],
                "warn" if route_counts["route_patterns_without_stops"] else "good",
                "Visual patterns missing canonical stop sequence evidence.",
            ),
        ],
    )


def _publication_readiness_section(session: Session) -> dict[str, Any]:
    """Build the "Publication Readiness" section (address index, licenses, static notes)."""
    address_status = _lightweight_address_index_status(session)
    # GTFS sources whose license is NULL, empty, or the literal "unknown"
    # (case-insensitive).
    license_unknown = _count(
        session,
        Source,
        Source.kind == "gtfs",
        (Source.license.is_(None)) | (func.lower(Source.license).in_(["", "unknown"])),
    )
    stale = address_status.get("stale")

    return _section(
        "publication_readiness",
        "Publication Readiness",
        [
            _item(
                "Address index stale",
                "yes" if stale else "no",
                "warn" if stale else "good",
                "Address polygons/search index version status.",
            ),
            _item(
                "GTFS licenses unknown",
                license_unknown,
                "warn" if license_unknown else "good",
                "GTFS sources without explicit redistribution/license status.",
            ),
            _item(
                "Canonical export",
                "draft",
                "warn",
                "Canonical Europe dataset export tables/API are not versioned yet.",
            ),
            _item(
                "Third-party API",
                "later",
                "info",
                "Accounts, billing, quotas, and API backend are intentionally out of scope for this step.",
            ),
        ],
    )


def _item(label: str, value: object, tone: str, description: str) -> dict[str, object]:
    """Return one summary item as a ``label``/``value``/``tone``/``description`` dict."""
    return {"label": label, "value": value, "tone": tone, "description": description}


def _lightweight_address_index_status(session: Session) -> dict[str, object]:
    """Report whether the routing dataset's address index version is outdated.

    Reads only the ``metadata_json`` of the dataset returned by
    ``active_routing_dataset``. Returns ``stale``, ``version`` (the recorded
    version, or ``None``) and ``current_version`` (``ADDRESS_INDEX_VERSION``).

    ``stale`` is ``True`` only when the metadata contains a non-empty
    ``address_index`` mapping whose ``version`` differs from the current one;
    a missing dataset, missing metadata, malformed JSON or absent index all
    report ``stale=False``.
    """
    dataset = active_routing_dataset(session)
    if dataset is None or not dataset.metadata_json:
        return {"stale": False, "version": None, "current_version": ADDRESS_INDEX_VERSION}

    try:
        metadata = json.loads(dataset.metadata_json)
    except json.JSONDecodeError:
        # Best effort: unreadable metadata is treated as "no index recorded".
        metadata = {}

    address_index = metadata.get("address_index") if isinstance(metadata, dict) else {}
    if not isinstance(address_index, dict):
        address_index = {}

    version = address_index.get("version")
    return {
        "stale": bool(address_index and version != ADDRESS_INDEX_VERSION),
        "version": version,
        "current_version": ADDRESS_INDEX_VERSION,
    }


def _count(session: Session, model: Any, *where: Any) -> int:
    """Return ``COUNT(*)`` over ``model``, optionally restricted by ``where`` criteria.

    Multiple criteria are combined by ``Select.where`` (i.e. ANDed). A ``None``
    scalar result is reported as ``0``.
    """
    stmt = select(func.count()).select_from(model)
    if where:
        stmt = stmt.where(*where)
    return int(session.scalar(stmt) or 0)


def _priority_catalog_backlog(session: Session) -> int:
    """Count P0 / "P0 fallback" / P1 catalog entries that no Source row references."""
    # Correlated EXISTS: is there a Source pointing at this catalog entry?
    linked = select(Source.id).where(Source.catalog_entry_id == SourceCatalogEntry.id).exists()
    return _count(
        session,
        SourceCatalogEntry,
        SourceCatalogEntry.priority.in_(["P0", "P0 fallback", "P1"]),
        ~linked,
    )


def _job_status_counts(session: Session) -> dict[str, int]:
    """Return ``{status: count}`` for non-dismissed queued/running/paused/failed jobs.

    Statuses with no matching rows are absent from the result, so callers
    should read it with ``.get(status, 0)``.
    """
    rows = session.execute(
        select(Job.status, func.count())
        .where(
            Job.dismissed_at.is_(None),
            Job.status.in_(["queued", "running", "paused", "failed"]),
        )
        .group_by(Job.status)
    ).all()
    return {str(status): int(count) for status, count in rows}


def _gtfs_validation_counts(session: Session, dataset_ids: list[int]) -> dict[str, object]:
    """Collect row counts and basic validation counts for the given GTFS datasets.

    All values are ints except ``calendar_range``, which is the string
    ``"none"`` when ``dataset_ids`` is empty and ``"<min> -> <max>"`` otherwise
    (each side falling back to ``"unknown"`` when no date is available).
    """
    if not dataset_ids:
        return {
            "agencies": 0,
            "stops": 0,
            "routes": 0,
            "trips": 0,
            "shapes": 0,
            "stops_without_coordinates": 0,
            "routes_without_geometry": 0,
            "routes_without_agency": 0,
            "calendar_range": "none",
        }

    calendar_min, calendar_max = session.execute(
        select(func.min(GtfsCalendar.start_date), func.max(GtfsCalendar.end_date)).where(
            GtfsCalendar.dataset_id.in_(dataset_ids)
        )
    ).one()
    exception_min, exception_max = session.execute(
        select(func.min(GtfsCalendarDate.date), func.max(GtfsCalendarDate.date)).where(
            GtfsCalendarDate.dataset_id.in_(dataset_ids)
        )
    ).one()

    # Ignore blank values (None or "") on both sides. Filtering only on
    # ``is not None`` let an empty-string start date win ``min()`` and hide a
    # real date behind "unknown".
    min_date = min((value for value in (calendar_min, exception_min) if value), default=None)
    max_date = max((value for value in (calendar_max, exception_max) if value), default=None)

    return {
        "agencies": _count(session, GtfsAgency, GtfsAgency.dataset_id.in_(dataset_ids)),
        "stops": _count(session, GtfsStop, GtfsStop.dataset_id.in_(dataset_ids)),
        "routes": _count(session, GtfsRoute, GtfsRoute.dataset_id.in_(dataset_ids)),
        "trips": _count(session, GtfsTrip, GtfsTrip.dataset_id.in_(dataset_ids)),
        "shapes": _count(session, GtfsShape, GtfsShape.dataset_id.in_(dataset_ids)),
        "stops_without_coordinates": _count(
            session,
            GtfsStop,
            GtfsStop.dataset_id.in_(dataset_ids),
            (GtfsStop.lat.is_(None)) | (GtfsStop.lon.is_(None)),
        ),
        "routes_without_geometry": _count(
            session,
            GtfsRoute,
            GtfsRoute.dataset_id.in_(dataset_ids),
            (GtfsRoute.geometry_geojson.is_(None)) | (GtfsRoute.geometry_geojson == ""),
        ),
        "routes_without_agency": _count(
            session,
            GtfsRoute,
            GtfsRoute.dataset_id.in_(dataset_ids),
            (GtfsRoute.agency_id.is_(None)) | (GtfsRoute.agency_id == ""),
        ),
        "calendar_range": f"{min_date or 'unknown'} -> {max_date or 'unknown'}",
    }


def _link_quality_counts(
    session: Session, gtfs_dataset_ids: list[int], osm_dataset_ids: list[int]
) -> dict[str, int]:
    """Count canonical-stop links and unlinked stops for the given datasets.

    GTFS- and OSM-specific figures are ``0`` when the corresponding id list is
    empty; ``canonical_stops`` is always a count over the whole table.
    """
    if gtfs_dataset_ids:
        # Correlated EXISTS: does this GTFS stop have a canonical link?
        gtfs_link_exists = (
            select(CanonicalStopLink.id)
            .where(
                CanonicalStopLink.object_type == "gtfs_stop",
                CanonicalStopLink.dataset_id == GtfsStop.dataset_id,
                CanonicalStopLink.object_id == GtfsStop.id,
            )
            .exists()
        )
        gtfs_stops_without_canonical = _count(
            session,
            GtfsStop,
            GtfsStop.dataset_id.in_(gtfs_dataset_ids),
            ~gtfs_link_exists,
        )
        gtfs_stop_links = _count(
            session,
            CanonicalStopLink,
            CanonicalStopLink.object_type == "gtfs_stop",
            CanonicalStopLink.dataset_id.in_(gtfs_dataset_ids),
        )
        # Canonical stops whose GTFS links come from more than one dataset.
        multi_source_subquery = (
            select(CanonicalStopLink.canonical_stop_id)
            .where(
                CanonicalStopLink.object_type == "gtfs_stop",
                CanonicalStopLink.dataset_id.in_(gtfs_dataset_ids),
            )
            .group_by(CanonicalStopLink.canonical_stop_id)
            .having(func.count(func.distinct(CanonicalStopLink.dataset_id)) > 1)
            .subquery()
        )
        multi_source_stop_groups = int(
            session.scalar(select(func.count()).select_from(multi_source_subquery)) or 0
        )
    else:
        gtfs_stops_without_canonical = 0
        gtfs_stop_links = 0
        multi_source_stop_groups = 0

    if osm_dataset_ids:
        # Correlated EXISTS: does this OSM feature have a canonical link?
        osm_link_exists = (
            select(CanonicalStopLink.id)
            .where(
                CanonicalStopLink.object_type == "osm_feature",
                CanonicalStopLink.dataset_id == OsmFeature.dataset_id,
                CanonicalStopLink.object_id == OsmFeature.id,
            )
            .exists()
        )
        osm_stops_without_canonical = _count(
            session,
            OsmFeature,
            OsmFeature.dataset_id.in_(osm_dataset_ids),
            OsmFeature.kind.in_(["stop", "station", "terminal"]),
            ~osm_link_exists,
        )
        osm_stop_links = _count(
            session,
            CanonicalStopLink,
            CanonicalStopLink.object_type == "osm_feature",
            CanonicalStopLink.dataset_id.in_(osm_dataset_ids),
        )
        long_distance_osm_links = _count(
            session,
            CanonicalStopLink,
            CanonicalStopLink.object_type == "osm_feature",
            CanonicalStopLink.dataset_id.in_(osm_dataset_ids),
            CanonicalStopLink.distance_m > _LONG_DISTANCE_OSM_LINK_M,
        )
    else:
        osm_stops_without_canonical = 0
        osm_stop_links = 0
        long_distance_osm_links = 0

    return {
        "canonical_stops": _count(session, CanonicalStop),
        "gtfs_stop_links": gtfs_stop_links,
        "gtfs_stops_without_canonical": gtfs_stops_without_canonical,
        "osm_stop_links": osm_stop_links,
        "osm_stops_without_canonical": osm_stops_without_canonical,
        "multi_source_stop_groups": multi_source_stop_groups,
        "long_distance_osm_links": long_distance_osm_links,
    }


def _route_quality_counts(session: Session, gtfs_dataset_ids: list[int]) -> dict[str, int]:
    """Count route matches by status and route patterns lacking stops.

    Route-pattern figures cover the whole ``RoutePattern`` table. Match figures
    are restricted to routes in ``gtfs_dataset_ids`` and are all ``0`` when
    that list is empty. Match counts are per ``RouteMatch`` row, grouped by
    status.
    """
    route_patterns = _count(session, RoutePattern)
    # Correlated EXISTS: does this pattern have at least one stop row?
    route_pattern_stop_exists = (
        select(RoutePatternStop.id)
        .where(RoutePatternStop.route_pattern_id == RoutePattern.id)
        .exists()
    )
    route_patterns_without_stops = _count(session, RoutePattern, ~route_pattern_stop_exists)

    if not gtfs_dataset_ids:
        return {
            "matched_or_accepted": 0,
            "probable": 0,
            "weak": 0,
            "missing": 0,
            "routes_without_match": 0,
            "route_patterns": route_patterns,
            "route_patterns_without_stops": route_patterns_without_stops,
        }

    status_rows = session.execute(
        select(RouteMatch.status, func.count())
        .join(GtfsRoute, GtfsRoute.id == RouteMatch.gtfs_route_id)
        .where(GtfsRoute.dataset_id.in_(gtfs_dataset_ids))
        .group_by(RouteMatch.status)
    ).all()
    match_rows = {str(status): int(count) for status, count in status_rows}

    # Correlated EXISTS: does this GTFS route have any RouteMatch row?
    match_exists = select(RouteMatch.id).where(RouteMatch.gtfs_route_id == GtfsRoute.id).exists()
    routes_without_match = _count(
        session,
        GtfsRoute,
        GtfsRoute.dataset_id.in_(gtfs_dataset_ids),
        ~match_exists,
    )

    return {
        "matched_or_accepted": match_rows.get("matched", 0) + match_rows.get("accepted", 0),
        "probable": match_rows.get("probable", 0),
        "weak": match_rows.get("weak", 0),
        "missing": match_rows.get("missing", 0),
        "routes_without_match": routes_without_match,
        "route_patterns": route_patterns,
        "route_patterns_without_stops": route_patterns_without_stops,
    }