"""Discover GTFS feed candidates and write discovery manifests.

Candidates are collected from a curated seed CSV, the Mobility Database feed
catalog, the MobilityData validator acceptance-test list and the PTNA GTFS
pages. They are merged on shared identifiers/URLs and written out as CSV
manifests plus a JSON report.
"""

from __future__ import annotations

import csv
import hashlib
import io
import json
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from typing import Callable, Iterable
from urllib.parse import parse_qs, urljoin, urlparse

import requests

MOBILITY_DATABASE_FEEDS_URL = "https://files.mobilitydatabase.org/feeds_v2.csv"
MOBILITY_DATABASE_ACCEPTANCE_TEST_URL = (
    "https://raw.githubusercontent.com/MobilityData/gtfs-validator/master/"
    "scripts/mobility-database-harvester/acceptance_test_feed_list.csv"
)
PTNA_GTFS_INDEX_URL = "https://ptna.openstreetmap.de/gtfs/index.html"
PTNA_COUNTRY_URL_TEMPLATE = "https://ptna.openstreetmap.de/gtfs/{country}/index.php"

DEFAULT_DISCOVERY_COUNTRIES = ["DE", "AT", "CH", "NL", "DK", "FR", "BE", "LU", "NO", "SE", "FI", "IE", "GB"]
CURATED_TEST_COUNTRIES = ["DE", "CH", "AT", "NL", "DK", "FI", "NO", "SE", "IE", "GB", "FR", "BE", "LU"]

DIRECT_INGEST_HEADERS = [
    "name",
    "kind",
    "url",
    "country",
    "license",
    "mode_scope",
    "source_basis",
    "priority",
    "notes",
]
CANONICAL_HEADERS = [
    "candidate_id",
    "discovery_source",
    "country",
    "subdivision",
    "provider",
    "feed_name",
    "stable_id",
    "ptna_feed_id",
    "data_type",
    "status",
    "is_official",
    "selected_url",
    "direct_download_url",
    "latest_url",
    "original_release_url",
    "license_url",
    "license_text",
    "osm_license_text",
    "details_url",
    "routes_url",
    "valid_from",
    "valid_to",
    "release_date",
    "feed_version",
    "bbox",
    "features",
    "priority",
    "availability_status",
    "http_status",
    "content_type",
    "content_length",
    "final_url",
    "source_basis",
    "notes",
]

# User-Agent sent with every HTTP request issued by this module.
_USER_AGENT = "meubility-workbench-feed-discovery/0.1"


@dataclass
class FeedCandidate:
    """One discovered feed, carrying every column of ``CANONICAL_HEADERS``.

    All attributes are plain strings so they can be written to CSV unchanged.
    ``discovery_source`` may hold several ``"; "``-joined source names once
    candidates have been merged.
    """

    discovery_source: str
    country: str = ""
    subdivision: str = ""
    provider: str = ""
    feed_name: str = ""
    stable_id: str = ""
    ptna_feed_id: str = ""
    data_type: str = "gtfs"
    status: str = ""
    is_official: str = ""
    selected_url: str = ""
    direct_download_url: str = ""
    latest_url: str = ""
    original_release_url: str = ""
    license_url: str = ""
    license_text: str = ""
    osm_license_text: str = ""
    details_url: str = ""
    routes_url: str = ""
    valid_from: str = ""
    valid_to: str = ""
    release_date: str = ""
    feed_version: str = ""
    bbox: str = ""
    features: str = ""
    priority: str = ""
    availability_status: str = "unchecked"
    http_status: str = ""
    content_type: str = ""
    content_length: str = ""
    final_url: str = ""
    source_basis: str = ""
    notes: str = ""
    evidence_sources: list[str] = field(default_factory=list)

    def key(self) -> str:
        """Return the primary merge key: stable id, else URL, else PTNA id, else a row hash."""
        if self.stable_id:
            return f"stable:{self.stable_id}"
        if self.selected_url:
            return f"url:{_normalize_url_key(self.selected_url)}"
        if self.ptna_feed_id:
            return f"ptna:{self.ptna_feed_id}"
        return "hash:" + hashlib.sha256(json.dumps(self.row(), sort_keys=True).encode("utf-8")).hexdigest()

    def candidate_id(self) -> str:
        """Return a 16-hex-character id derived from the identifying fields."""
        seed = "|".join(
            [
                self.discovery_source,
                self.country,
                self.stable_id,
                self.ptna_feed_id,
                self.selected_url,
                self.provider,
                self.feed_name,
            ]
        )
        return hashlib.sha256(seed.encode("utf-8")).hexdigest()[:16]

    def row(self) -> dict[str, str]:
        """Return the candidate as a dict keyed by ``CANONICAL_HEADERS``."""
        payload = {
            header: _string(getattr(self, header, ""))
            for header in CANONICAL_HEADERS
            if header != "candidate_id"
        }
        payload["candidate_id"] = self.candidate_id()
        return payload

    def ingestable_row(self) -> dict[str, str]:
        """Return the candidate as a dict keyed by ``DIRECT_INGEST_HEADERS``.

        Free-text values are truncated to fixed maximum lengths.
        """
        name = _feed_source_name(self.country, self.provider or self.feed_name)
        license_value = self.license_text or (f"see {self.license_url}" if self.license_url else "")

        basis_parts = [self.source_basis or self.discovery_source]
        if self.details_url:
            basis_parts.append(f"details: {self.details_url}")
        if self.original_release_url and self.original_release_url != self.selected_url:
            basis_parts.append(f"release: {self.original_release_url}")

        notes = self.notes or ""
        if self.latest_url and self.latest_url != self.selected_url:
            notes = _join_notes(notes, f"Mobility Database mirror: {self.latest_url}")
        if self.osm_license_text:
            notes = _join_notes(notes, f"OSM permission note: {_truncate(self.osm_license_text, 240)}")

        return {
            "name": _truncate(name, 240),
            "kind": "gtfs",
            "url": self.selected_url,
            "country": self.country,
            "license": _truncate(license_value, 240),
            "mode_scope": _mode_scope_from_features(self.features),
            "source_basis": _truncate("; ".join(part for part in basis_parts if part), 500),
            "priority": self.priority or _candidate_priority(self),
            "notes": _truncate(notes, 1200),
        }


def default_generated_dir() -> Path:
    """Return ``<parent of this file's directory>/docs/generated``."""
    return Path(__file__).resolve().parents[1] / "docs" / "generated"


def build_gtfs_discovery_manifests(
    *,
    output_dir: Path | str | None = None,
    countries: Iterable[str] | None = None,
    include_mobility_database: bool = True,
    include_acceptance_test_list: bool = True,
    include_ptna: bool = True,
    max_ptna_details: int = 80,
    test_limit: int = 24,
    check_urls: bool = False,
    timeout: float = 30.0,
) -> dict[str, object]:
    """Collect, merge and write all discovery manifests.

    Writes ``gtfs_feed_candidates.csv``, ``gtfs_ingestable_sources.csv``,
    ``gtfs_test_run_sources.csv`` and ``gtfs_discovery_report.json`` into
    ``output_dir`` (default: :func:`default_generated_dir`).

    Args:
        output_dir: Target directory; created when missing.
        countries: Country codes to keep. ``None`` selects
            ``DEFAULT_DISCOVERY_COUNTRIES``; an entry ``"ALL"`` disables the filter.
        include_mobility_database: Fetch the Mobility Database catalog.
        include_acceptance_test_list: Fetch the validator acceptance-test list.
        include_ptna: Fetch the PTNA country pages.
        max_ptna_details: Upper bound on PTNA detail-page requests.
        test_limit: Maximum number of rows in the test-run manifest.
        check_urls: Probe every ingestable URL over HTTP (capped at 12 s each).
        timeout: Per-request timeout in seconds.

    Returns:
        The report dict that was also written as JSON.

    Raises:
        requests.RequestException: If a Mobility Database download fails.
    """
    selected_countries = _normalize_countries(countries)
    out_dir = Path(output_dir) if output_dir is not None else default_generated_dir()
    out_dir.mkdir(parents=True, exist_ok=True)

    candidates: list[FeedCandidate] = []
    # Curated seeds go first so they become the merge target for later duplicates.
    candidates.extend(load_curated_ingestable_seed(countries=selected_countries))
    if include_mobility_database:
        candidates.extend(fetch_mobility_database_candidates(countries=selected_countries, timeout=timeout))
    if include_acceptance_test_list:
        candidates.extend(fetch_mobility_acceptance_candidates(countries=selected_countries, timeout=timeout))
    if include_ptna:
        candidates.extend(
            fetch_ptna_candidates(countries=selected_countries, max_details=max_ptna_details, timeout=timeout)
        )

    merged = merge_candidates(candidates)
    ingestable = [candidate for candidate in merged if candidate.selected_url and candidate.data_type == "gtfs"]
    if check_urls:
        for candidate in ingestable:
            annotate_url_availability(candidate, timeout=min(timeout, 12.0))
    test_run = select_test_run_candidates(ingestable, limit=test_limit)

    candidates_path = out_dir / "gtfs_feed_candidates.csv"
    ingestable_path = out_dir / "gtfs_ingestable_sources.csv"
    test_path = out_dir / "gtfs_test_run_sources.csv"
    report_path = out_dir / "gtfs_discovery_report.json"

    _write_csv(candidates_path, CANONICAL_HEADERS, [candidate.row() for candidate in merged])
    _write_csv(ingestable_path, DIRECT_INGEST_HEADERS, [candidate.ingestable_row() for candidate in ingestable])
    _write_csv(test_path, DIRECT_INGEST_HEADERS, [candidate.ingestable_row() for candidate in test_run])

    by_source = _count_by(merged, lambda item: item.discovery_source)
    by_country = _count_by(ingestable, lambda item: item.country or "unknown")
    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "countries": selected_countries or "all",
        "sources": {
            "mobility_database": MOBILITY_DATABASE_FEEDS_URL if include_mobility_database else None,
            "mobility_acceptance_test_list": (
                MOBILITY_DATABASE_ACCEPTANCE_TEST_URL if include_acceptance_test_list else None
            ),
            "ptna": PTNA_GTFS_INDEX_URL if include_ptna else None,
        },
        "counts": {
            "candidates": len(merged),
            "ingestable": len(ingestable),
            "test_run": len(test_run),
            "by_source": by_source,
            "ingestable_by_country": by_country,
        },
        "files": {
            "candidates": str(candidates_path),
            "ingestable": str(ingestable_path),
            "test_run": str(test_path),
        },
    }
    report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    return report


def fetch_mobility_database_candidates(
    *,
    countries: list[str] | None = None,
    timeout: float = 30.0,
    url: str = MOBILITY_DATABASE_FEEDS_URL,
) -> list[FeedCandidate]:
    """Download the Mobility Database catalog CSV and return its GTFS rows.

    Args:
        countries: Country codes to keep; falsy keeps every country.
        timeout: Request timeout in seconds.
        url: Catalog location.

    Raises:
        requests.RequestException: If the download fails.
    """
    text = _fetch_text(url, timeout=timeout)
    candidates: list[FeedCandidate] = []
    for row in _csv_rows(text):
        if _value(row, "data_type").lower() != "gtfs":
            continue
        country = _value(row, "location.country_code").upper()
        if countries and country not in countries:
            continue
        direct_url = _normalize_feed_url(_value(row, "urls.direct_download"))
        latest_url = _normalize_feed_url(_value(row, "urls.latest"))
        selected_url = _choose_feed_url(direct_url, latest_url)
        candidate = FeedCandidate(
            discovery_source="mobility_database",
            country=country,
            subdivision=_value(row, "location.subdivision_name"),
            provider=_value(row, "provider"),
            feed_name=_value(row, "name"),
            stable_id=_value(row, "id"),
            data_type="gtfs",
            status=_value(row, "status"),
            is_official=_value(row, "is_official"),
            selected_url=selected_url,
            direct_download_url=direct_url,
            latest_url=latest_url,
            license_url=_value(row, "urls.license"),
            bbox=_bbox_from_mobility_row(row),
            features=_value(row, "features"),
            source_basis="Mobility Database feed catalog",
            notes=_value(row, "note"),
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        candidate.priority = _candidate_priority(candidate)
        candidates.append(candidate)
    return candidates


def fetch_mobility_acceptance_candidates(
    *,
    countries: list[str] | None = None,
    timeout: float = 30.0,
    url: str = MOBILITY_DATABASE_ACCEPTANCE_TEST_URL,
) -> list[FeedCandidate]:
    """Download the validator acceptance-test feed list and return its rows.

    Rows without a ``urls.latest`` value are skipped. Every candidate gets
    priority ``"P3"``.

    Raises:
        requests.RequestException: If the download fails.
    """
    text = _fetch_text(url, timeout=timeout)
    candidates: list[FeedCandidate] = []
    for row in _csv_rows(text):
        country = _value(row, "country_code").upper()
        if countries and country not in countries:
            continue
        latest_url = _normalize_feed_url(_value(row, "urls.latest"))
        if not latest_url:
            continue
        candidate = FeedCandidate(
            discovery_source="mobility_validator_acceptance",
            country=country,
            subdivision=_value(row, "subdivision_name"),
            provider=_value(row, "provider"),
            feed_name=_value(row, "provider"),
            stable_id=_value(row, "stable_id"),
            status="acceptance_test",
            selected_url=latest_url,
            latest_url=latest_url,
            source_basis="MobilityData validator acceptance-test feed list",
            notes=(
                "Useful smoke-test feed list; prefer Mobility Database feeds_v2 metadata "
                "for production source review."
            ),
            priority="P3",
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        candidates.append(candidate)
    return candidates


def fetch_ptna_candidates(
    *,
    countries: list[str] | None = None,
    max_details: int = 80,
    timeout: float = 30.0,
) -> list[FeedCandidate]:
    """Scrape PTNA country pages (and some detail pages) for feed candidates.

    Args:
        countries: Country codes to visit. When falsy, the codes are discovered
            from the PTNA index page; ``DEFAULT_DISCOVERY_COUNTRIES`` is used
            if that discovery fails or yields nothing.
        max_details: Maximum number of detail-page requests to attempt.
        timeout: Per-request timeout in seconds.

    Country pages that cannot be fetched are skipped silently; a failed detail
    fetch is recorded in the candidate's notes.
    """
    if countries:
        country_codes = list(countries)
    else:
        try:
            country_codes = discover_ptna_country_codes(timeout=timeout)
        except requests.RequestException:
            country_codes = []
        if not country_codes:
            country_codes = list(DEFAULT_DISCOVERY_COUNTRIES)

    candidates: list[FeedCandidate] = []
    detail_fetches = 0
    for country in country_codes:
        country_url = PTNA_COUNTRY_URL_TEMPLATE.format(country=country)
        try:
            html = _fetch_text(country_url, timeout=timeout)
        except requests.RequestException:
            continue
        for candidate in parse_ptna_country_page(html, country=country, page_url=country_url):
            if candidate.details_url and detail_fetches < max_details:
                # Count attempts, not successes, so an unreachable host cannot
                # trigger an unbounded series of timeouts.
                detail_fetches += 1
                try:
                    detail_html = _fetch_text(candidate.details_url, timeout=timeout)
                except requests.RequestException:
                    candidate.notes = _join_notes(
                        candidate.notes, "PTNA detail page could not be fetched during discovery."
                    )
                else:
                    enrich_ptna_candidate_from_details(candidate, detail_html, candidate.details_url)
            candidate.priority = _candidate_priority(candidate)
            candidates.append(candidate)
    return candidates


def discover_ptna_country_codes(*, timeout: float = 30.0) -> list[str]:
    """Return the two-letter codes linked as ``/gtfs/XX/index.php`` on the PTNA index.

    Codes are returned in first-seen order without duplicates.

    Raises:
        requests.RequestException: If the index page cannot be fetched.
    """
    html = _fetch_text(PTNA_GTFS_INDEX_URL, timeout=timeout)
    links = _all_links(html, PTNA_GTFS_INDEX_URL)
    codes: list[str] = []
    for link in links:
        match = re.search(r"/gtfs/([A-Z]{2})/index\.php$", urlparse(link).path)
        if match and match.group(1) not in codes:
            codes.append(match.group(1))
    return codes


def parse_ptna_country_page(html: str, *, country: str, page_url: str) -> list[FeedCandidate]:
    """Turn the table rows of one PTNA country page into candidates.

    A row is used only if it links to ``routes.php?feed=`` or
    ``gtfs-details.php?feed=`` and that link carries a ``feed`` query value.
    Cells are read by position (1 = feed name, 2 = provider, 3/4 = validity,
    5 = version, 6 = release date and release link).
    """
    rows = _parse_table_rows(html, page_url)
    candidates: list[FeedCandidate] = []
    for row in rows:
        links = [link for cell in row.cells for link in cell.links]
        routes_url = _first_link_matching(links, "routes.php?feed=")
        details_url = _first_link_matching(links, "gtfs-details.php?feed=")
        if not routes_url and not details_url:
            continue
        feed_id = _feed_id_from_url(routes_url or details_url)
        if not feed_id:
            continue
        texts = [cell.text for cell in row.cells]
        release_link = _normalize_feed_url(row.cells[6].first_external_link if len(row.cells) > 6 else "")
        # Only treat the release link as a feed URL when it looks downloadable.
        direct_url = release_link if _looks_like_download_url(release_link) else ""
        candidate = FeedCandidate(
            discovery_source="ptna",
            country=country,
            provider=texts[2] if len(texts) > 2 else "",
            feed_name=texts[1] if len(texts) > 1 else feed_id,
            ptna_feed_id=feed_id,
            selected_url=direct_url,
            direct_download_url=direct_url,
            original_release_url=release_link,
            details_url=details_url,
            routes_url=routes_url,
            valid_from=texts[3] if len(texts) > 3 else "",
            valid_to=texts[4] if len(texts) > 4 else "",
            feed_version=texts[5] if len(texts) > 5 else "",
            release_date=texts[6] if len(texts) > 6 else "",
            source_basis="PTNA GTFS analysis",
            notes="PTNA candidate; use original publisher URL where available.",
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        candidates.append(candidate)
    return candidates


def enrich_ptna_candidate_from_details(candidate: FeedCandidate, html: str, page_url: str) -> None:
    """Update ``candidate`` in place from a PTNA detail page.

    Values found on the page override the existing ones; missing values leave
    the candidate untouched. If the candidate has no selected URL yet and the
    release URL looks downloadable, it becomes the selected/direct URL.
    """
    fields = parse_ptna_detail_fields(html, page_url)
    candidate.original_release_url = _normalize_feed_url(
        fields.get("release url href") or fields.get("release url") or candidate.original_release_url
    )
    candidate.license_url = fields.get("publisher's license href") or candidate.license_url
    candidate.license_text = fields.get("publisher's license") or candidate.license_text
    candidate.osm_license_text = fields.get("license given for use in osm") or candidate.osm_license_text
    candidate.valid_from = fields.get("feed start date") or candidate.valid_from
    candidate.valid_to = fields.get("feed end date") or candidate.valid_to
    candidate.feed_version = fields.get("feed version") or candidate.feed_version
    candidate.release_date = fields.get("release date") or candidate.release_date

    network_guid = fields.get('"network:guid"')
    if network_guid:
        candidate.notes = _join_notes(candidate.notes, f"PTNA network:guid={network_guid}")

    if not candidate.selected_url and _looks_like_download_url(candidate.original_release_url):
        candidate.selected_url = _normalize_feed_url(candidate.original_release_url)
        candidate.direct_download_url = candidate.selected_url
    normalize_candidate_geography(candidate)


def parse_ptna_detail_fields(html: str, page_url: str) -> dict[str, str]:
    """Return ``{lower-cased label: value}`` for two-column rows of a detail page.

    When the value cell contains an external link, it is stored additionally
    under ``"<label> href"``.
    """
    parsed: dict[str, str] = {}
    for row in _parse_table_rows(html, page_url):
        if len(row.cells) < 2:
            continue
        label = _clean_text(row.cells[0].text).lower()
        if not label:
            continue
        detail = _clean_text(row.cells[1].text)
        parsed[label] = detail
        if row.cells[1].first_external_link:
            parsed[f"{label} href"] = row.cells[1].first_external_link
    return parsed


def load_curated_ingestable_seed(
    *,
    countries: list[str] | None = None,
    path: Path | str | None = None,
) -> list[FeedCandidate]:
    """Load GTFS rows from the curated seed CSV.

    Args:
        countries: Country codes to keep; rows with country ``EU`` are always
            kept. Falsy keeps every row.
        path: Seed file; defaults to ``docs/ingestable_sources_seed.csv`` next
            to this file's parent directory.

    Returns:
        The seed candidates, or an empty list when the file does not exist.
    """
    seed_path = (
        Path(path)
        if path is not None
        else Path(__file__).resolve().parents[1] / "docs" / "ingestable_sources_seed.csv"
    )
    if not seed_path.exists():
        return []

    candidates: list[FeedCandidate] = []
    with seed_path.open("r", encoding="utf-8-sig", newline="") as handle:
        for row in csv.DictReader(handle):
            if _value(row, "kind").lower() != "gtfs":
                continue
            country = _value(row, "country").upper()
            if countries and country not in countries and country != "EU":
                continue
            candidate = FeedCandidate(
                discovery_source="curated_seed",
                country=country,
                provider=_value(row, "name").removesuffix(" GTFS"),
                feed_name=_value(row, "name"),
                selected_url=_normalize_feed_url(_value(row, "url")),
                direct_download_url=_normalize_feed_url(_value(row, "url")),
                license_text=_value(row, "license"),
                features=_value(row, "mode_scope"),
                priority=_value(row, "priority"),
                source_basis=_value(row, "source_basis") or "curated seed",
                notes=_value(row, "notes"),
            )
            normalize_candidate_geography(candidate)
            apply_known_download_overrides(candidate)
            # Give priority-less seeds their default here, like the other
            # loaders do, so sorting and test-run eligibility see the same
            # priority that ingestable_row() reports.
            candidate.priority = _candidate_priority(candidate)
            candidates.append(candidate)
    return candidates


def merge_candidates(candidates: Iterable[FeedCandidate]) -> list[FeedCandidate]:
    """Merge candidates that share any alias key and return them sorted.

    The first candidate seen for a group is kept and later ones are folded
    into it. The result is ordered by priority, country, provider, feed name.
    """
    by_key: dict[str, FeedCandidate] = {}
    alias_to_key: dict[str, str] = {}
    for candidate in candidates:
        keys = _candidate_alias_keys(candidate)
        primary_key = keys[0]
        existing_key = next((alias_to_key[key] for key in keys if key in alias_to_key), None)
        existing = by_key.get(existing_key) if existing_key is not None else None
        if existing is None:
            by_key[primary_key] = candidate
            for key in keys:
                alias_to_key[key] = primary_key
            continue
        _merge_candidate(existing, candidate)
        # Register the newcomer's aliases so later candidates find the group too.
        for key in keys:
            alias_to_key[key] = existing_key or primary_key
    return sorted(
        by_key.values(),
        key=lambda item: (
            _priority_sort_key(item.priority),
            item.country,
            item.provider.lower(),
            item.feed_name.lower(),
        ),
    )


def select_test_run_candidates(candidates: Iterable[FeedCandidate], *, limit: int = 24) -> list[FeedCandidate]:
    """Pick at most ``limit`` eligible candidates for a test run.

    Candidates that come only from the acceptance-test list are excluded.
    Selection runs in three passes: feeds matching a preferred token, then one
    pass per curated country, then everything else. URLs are unique, and each
    country contributes at most 3 feeds (7 for ``DE`` in the preferred pass).
    """
    sorted_candidates = sorted(
        [
            candidate
            for candidate in candidates
            if candidate.discovery_source != "mobility_validator_acceptance"
            and _test_candidate_eligible(candidate)
        ],
        key=_test_candidate_sort_key,
    )
    selected: list[FeedCandidate] = []
    seen_urls: set[str] = set()
    per_country: dict[str, int] = {}

    def add(candidate: FeedCandidate, *, force: bool = False) -> None:
        """Append ``candidate`` unless a limit or a duplicate URL forbids it."""
        if len(selected) >= limit:
            return
        url_key = _normalize_url_key(candidate.selected_url)
        if not candidate.selected_url or url_key in seen_urls:
            return
        country = candidate.country or "unknown"
        country_limit = 7 if force and country == "DE" else 3
        if per_country.get(country, 0) >= country_limit:
            return
        selected.append(candidate)
        seen_urls.add(url_key)
        per_country[country] = per_country.get(country, 0) + 1

    preferred_tokens = [
        "opendata-oepnv.de",
        "download.gtfs.de/germany/",
        "vbb.de/vbbgtfs",
        "rnv-online.de",
        "vrn.de",
        "gtfs.geops.ch",
        "wienerlinien.at",
        "gtfs.openov.nl",
        "gtfs.ovapi.nl",
        "rejseplanen.info",
        "dev.hsl.fi/gtfs",
        "hsldev.com/gtfs",
        "rb_norway-aggregated-gtfs",
        "data.bus-data.dft.gov.uk",
        "transportforireland",
        "gtfs.irail.be/de-lijn",
    ]
    for candidate in sorted_candidates:
        text = " ".join(
            [candidate.provider, candidate.feed_name, candidate.source_basis, candidate.selected_url]
        ).lower()
        if any(token in text for token in preferred_tokens):
            add(candidate, force=True)

    for country in CURATED_TEST_COUNTRIES:
        for candidate in sorted_candidates:
            if candidate.country == country:
                add(candidate)
            if len(selected) >= limit:
                break
        if len(selected) >= limit:
            break

    for candidate in sorted_candidates:
        add(candidate)
        if len(selected) >= limit:
            break
    return selected


def _test_candidate_eligible(candidate: FeedCandidate) -> bool:
    """Return True for P0-P2 candidates with a URL and no blocking keyword."""
    if not candidate.selected_url:
        return False
    if _priority_sort_key(candidate.priority) > 2:
        return False
    text = " ".join(
        [candidate.status, candidate.selected_url, candidate.provider, candidate.feed_name, candidate.notes]
    ).lower()
    if "deprecated" in text or "inactive" in text or "{apikey}" in text:
        return False
    if "registration required" in text or "authentication" in text:
        return False
    return True


def annotate_url_availability(candidate: FeedCandidate, *, timeout: float = 10.0) -> FeedCandidate:
    """Probe ``candidate.selected_url`` and record the outcome on the candidate.

    A HEAD request is tried first; when the server answers 403, 405 or 5xx a
    one-byte ranged GET is used instead. Request failures are recorded as
    ``availability_status == "error"`` plus a note rather than raised.

    Returns:
        The same ``candidate`` object, for chaining.
    """
    if not candidate.selected_url:
        candidate.availability_status = "missing_url"
        return candidate

    headers = {"User-Agent": _USER_AGENT}
    response = None
    try:
        response = requests.head(candidate.selected_url, allow_redirects=True, timeout=timeout, headers=headers)
        if response.status_code in {405, 403} or response.status_code >= 500:
            # Release the HEAD connection before retrying with GET.
            response.close()
            response = None
            response = requests.get(
                candidate.selected_url,
                allow_redirects=True,
                timeout=timeout,
                headers={**headers, "Range": "bytes=0-0"},
                stream=True,
            )
        candidate.http_status = str(response.status_code)
        candidate.content_type = response.headers.get("content-type", "")
        candidate.content_length = response.headers.get("content-length", "")
        candidate.final_url = response.url
        candidate.availability_status = "ok" if response.status_code < 400 else "error"
    except requests.RequestException as exc:
        candidate.availability_status = "error"
        candidate.notes = _join_notes(candidate.notes, f"Availability check failed: {exc}")
    finally:
        # The GET is streamed, so the connection must be released explicitly.
        if response is not None:
            response.close()
    return candidate


def normalize_candidate_geography(candidate: FeedCandidate) -> None:
    """Override ``candidate.country`` for a few known hosts/feed names."""
    text = " ".join(
        [
            candidate.selected_url,
            candidate.direct_download_url,
            candidate.latest_url,
            candidate.original_release_url,
            candidate.provider,
            candidate.feed_name,
            candidate.source_basis,
        ]
    ).lower()
    if "download.gtfs.de/germany/" in text or "gtfs for germany" in text:
        candidate.country = "DE"
    elif "storage.googleapis.com/marduk-production/outbound/gtfs/rb_norway" in text:
        candidate.country = "NO"
    elif "gtfs.ovapi.nl" in text or "openov.nl" in text:
        candidate.country = "NL"
    elif "www.nvbw.de/fileadmin/user_upload/service/open_data/" in text:
        candidate.country = "DE"


def apply_known_download_overrides(candidate: FeedCandidate) -> None:
    """Switch listed stable ids to their ``latest_url`` and note why."""
    stale_direct_ids = {"mdb-684", "mdb-777"}
    if candidate.stable_id in stale_direct_ids and candidate.latest_url:
        candidate.selected_url = candidate.latest_url
        candidate.notes = _join_notes(
            candidate.notes,
            "Selected Mobility Database latest.zip mirror because the catalog direct URL is known to be stale.",
        )


@dataclass
class _HtmlCell:
    """Text and absolute link targets collected from one ``<td>``/``<th>``."""

    text: str = ""
    links: list[str] = field(default_factory=list)

    @property
    def first_external_link(self) -> str:
        """Return the first http(s) link not hosted on ptna.openstreetmap.de, or ``""``."""
        for link in self.links:
            parsed = urlparse(link)
            if parsed.scheme in {"http", "https"} and "ptna.openstreetmap.de" not in parsed.netloc:
                return link
        return ""


@dataclass
class _HtmlRow:
    """The cells of one ``<tr>``."""

    cells: list[_HtmlCell] = field(default_factory=list)


class _TableParser(HTMLParser):
    """Collect every non-empty ``<tr>`` of a document as an :class:`_HtmlRow`.

    Relative ``href`` values are resolved against ``base_url``.
    """

    def __init__(self, base_url: str):
        super().__init__(convert_charrefs=True)
        self.base_url = base_url
        self.rows: list[_HtmlRow] = []
        self._row: _HtmlRow | None = None
        self._cell: _HtmlCell | None = None
        self._active_link: str = ""

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Open a row or cell, or record a link inside the current cell."""
        attrs_dict = {key: value or "" for key, value in attrs}
        if tag == "tr":
            self._row = _HtmlRow()
        elif tag in {"td", "th"} and self._row is not None:
            self._cell = _HtmlCell()
        elif tag == "a" and self._cell is not None:
            href = attrs_dict.get("href", "")
            if href:
                self._active_link = urljoin(self.base_url, href)
                self._cell.links.append(self._active_link)

    def handle_endtag(self, tag: str) -> None:
        """Close the current cell or row and store it."""
        if tag in {"td", "th"} and self._row is not None and self._cell is not None:
            self._cell.text = _clean_text(self._cell.text)
            self._row.cells.append(self._cell)
            self._cell = None
            self._active_link = ""
        elif tag == "a":
            self._active_link = ""
        elif tag == "tr":
            # Rows without any cell are dropped.
            if self._row is not None and self._row.cells:
                self.rows.append(self._row)
            self._row = None
            self._cell = None
            self._active_link = ""

    def handle_data(self, data: str) -> None:
        """Append text to the cell that is currently open, if any."""
        if self._cell is not None:
            self._cell.text += data


class _LinkParser(HTMLParser):
    """Collect the absolute target of every ``<a href>`` in a document."""

    def __init__(self, base_url: str):
        super().__init__(convert_charrefs=True)
        self.base_url = base_url
        self.links: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record non-empty ``href`` values of anchor tags."""
        if tag != "a":
            return
        for key, value in attrs:
            if key == "href" and value:
                self.links.append(urljoin(self.base_url, value))


def _parse_table_rows(html: str, base_url: str) -> list[_HtmlRow]:
    """Return all table rows found in ``html``."""
    parser = _TableParser(base_url)
    parser.feed(html)
    return parser.rows


def _all_links(html: str, base_url: str) -> list[str]:
    """Return all anchor targets found in ``html``, resolved against ``base_url``."""
    parser = _LinkParser(base_url)
    parser.feed(html)
    return parser.links


def _fetch_text(url: str, *, timeout: float) -> str:
    """GET ``url`` and return the decoded body.

    Raises:
        requests.RequestException: On connection errors or non-2xx responses.
    """
    response = requests.get(url, timeout=timeout, headers={"User-Agent": _USER_AGENT})
    response.raise_for_status()
    return response.text


def _csv_rows(text: str) -> csv.DictReader:
    """Return a ``DictReader`` over CSV ``text``.

    The text is wrapped in a ``StringIO`` with ``newline=""`` so that the csv
    module sees the original line endings and keeps newlines that occur inside
    quoted fields. A leading byte-order mark is dropped so it does not end up
    in the first header name.
    """
    return csv.DictReader(io.StringIO(text.lstrip("\ufeff"), newline=""))


def _first_link_matching(links: Iterable[str], needle: str) -> str:
    """Return the first link containing ``needle``, or ``""``."""
    for link in links:
        if needle in link:
            return link
    return ""


def _feed_id_from_url(url: str) -> str:
    """Return the first ``feed`` query value of ``url``, or ``""``."""
    query = parse_qs(urlparse(url).query)
    return (query.get("feed") or [""])[0]


def _looks_like_download_url(url: str) -> bool:
    """Heuristically decide whether ``url`` points at a GTFS download."""
    if not url:
        return False
    parsed = urlparse(url)
    lower_path = parsed.path.lower()
    lower_url = url.lower()
    if lower_path.endswith(".zip"):
        return True
    if "exportformat=gtfs" in lower_url or "google_transit" in lower_url:
        return True
    if lower_path.rstrip("/").endswith(("current_gtfs", "gtfs")):
        return True
    if "gtfs.ovapi.nl" in parsed.netloc.lower() and "gtfs" in lower_path:
        return True
    return False


def _normalize_feed_url(url: str) -> str:
    """Clean ``url`` and prefix ``https://`` when it starts with a bare host name."""
    cleaned = _clean_text(url)
    if not cleaned:
        return ""
    parsed = urlparse(cleaned)
    if parsed.scheme:
        return cleaned
    first = cleaned.split("/", 1)[0]
    if "." in first:
        return f"https://{cleaned}"
    return cleaned


def _choose_feed_url(direct_url: str, latest_url: str) -> str:
    """Prefer ``direct_url``; fall back to ``latest_url``."""
    if direct_url:
        return direct_url
    return latest_url


def _has_discovery_source(candidate: FeedCandidate, source: str) -> bool:
    """Return True if ``source`` is one of the candidate's ``;``-joined sources."""
    return source in {part.strip() for part in candidate.discovery_source.split(";")}


def _candidate_priority(candidate: FeedCandidate) -> str:
    """Derive a ``P0``..``P4`` priority from source, status and available URLs."""
    status = candidate.status.lower()
    official = candidate.is_official.lower() == "true"
    # Membership test: merging turns discovery_source into a "; "-joined list.
    if _has_discovery_source(candidate, "curated_seed"):
        return candidate.priority or "P1"
    if status == "active" and official and candidate.direct_download_url:
        return "P0"
    if status == "active" and candidate.direct_download_url:
        return "P1"
    if status == "active" and candidate.latest_url:
        return "P2"
    if candidate.discovery_source == "ptna":
        return "P2" if candidate.selected_url else "P4"
    return "P3"


def _test_candidate_sort_key(candidate: FeedCandidate) -> tuple[int, int, str, str]:
    """Sort key for test-run selection: priority, then source/country rank."""
    # Curated seeds keep their bonus even after being merged with other sources.
    source_bonus = 0 if _has_discovery_source(candidate, "curated_seed") else 1
    country_bonus = (
        CURATED_TEST_COUNTRIES.index(candidate.country) if candidate.country in CURATED_TEST_COUNTRIES else 99
    )
    return (
        _priority_sort_key(candidate.priority),
        source_bonus + country_bonus,
        candidate.country,
        candidate.provider.lower(),
    )


def _priority_sort_key(priority: str) -> int:
    """Return the number in ``"P<n>"``; 9 for anything else."""
    match = re.match(r"P(\d+)", priority or "")
    return int(match.group(1)) if match else 9


def _candidate_alias_keys(candidate: FeedCandidate) -> list[str]:
    """Return every key the candidate can be matched on, primary key first."""
    keys = [candidate.key()]
    if candidate.stable_id:
        keys.append(f"stable:{candidate.stable_id}")
    for url in [candidate.selected_url, candidate.direct_download_url, candidate.latest_url]:
        if url:
            keys.append(f"url:{_normalize_url_key(url)}")
    if candidate.ptna_feed_id:
        keys.append(f"ptna:{candidate.ptna_feed_id}")
    deduped: list[str] = []
    for key in keys:
        if key not in deduped:
            deduped.append(key)
    return deduped


def _merge_candidate(existing: FeedCandidate, incoming: FeedCandidate) -> None:
    """Fold ``incoming`` into ``existing`` in place.

    Curated-seed values overwrite a fixed set of descriptive fields. For every
    other source only empty fields of ``existing`` are filled. Sources, source
    basis and notes are unioned; the better priority wins.
    """
    if incoming.discovery_source == "curated_seed":
        for field_name in ["country", "provider", "feed_name", "license_text", "features", "source_basis", "notes"]:
            new_value = getattr(incoming, field_name, "")
            if new_value:
                setattr(existing, field_name, new_value)
    existing.discovery_source = _join_unique(existing.discovery_source, incoming.discovery_source)
    for field_name in CANONICAL_HEADERS:
        if field_name == "candidate_id":
            continue
        current = getattr(existing, field_name, "")
        new_value = getattr(incoming, field_name, "")
        if not current and new_value:
            setattr(existing, field_name, new_value)
    existing.priority = _better_priority(existing.priority, incoming.priority)
    existing.source_basis = _join_unique(existing.source_basis, incoming.source_basis)
    existing.notes = _join_notes(existing.notes, incoming.notes)


def _better_priority(left: str, right: str) -> str:
    """Return the priority with the lower number; ``left`` wins ties."""
    return left if _priority_sort_key(left) <= _priority_sort_key(right) else right


def _join_unique(left: str, right: str) -> str:
    """Union two ``;``-separated lists, keeping first-seen order."""
    parts: list[str] = []
    for value in [left, right]:
        for part in value.split(";"):
            cleaned = part.strip()
            if cleaned and cleaned not in parts:
                parts.append(cleaned)
    return "; ".join(parts)


def _join_notes(left: str, right: str) -> str:
    """Combine two note strings without repeating identical parts."""
    return _join_unique(left, right)


def _compact_name(value: str) -> str:
    """Return ``value`` cleaned and with whitespace runs collapsed."""
    return re.sub(r"\s+", " ", _clean_text(value)).strip()


def _feed_source_name(country: str, value: str) -> str:
    """Build a display name of the form ``"<CC> <name> GTFS"``.

    The country prefix and the ``GTFS`` suffix are only added when missing.
    """
    base = _compact_name(value) or "GTFS feed"
    prefix = country.upper()
    display = base
    if prefix and not base.upper().startswith(f"{prefix} "):
        display = f"{prefix} {base}"
    if "gtfs" not in display.lower():
        display = f"{display} GTFS"
    return display


def _clean_text(value: str) -> str:
    """Unescape HTML entities, collapse whitespace and strip; ``None`` becomes ``""``."""
    cleaned = unescape(value or "").replace("\xa0", " ")
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()


def _mode_scope_from_features(features: str) -> str:
    """Map keywords in ``features`` to a comma-separated mode list (default ``bus``)."""
    lower = features.lower()
    modes = []
    if "rail" in lower or "train" in lower:
        modes.append("rail")
    if "tram" in lower or "light_rail" in lower:
        modes.append("tram")
    if "subway" in lower or "metro" in lower:
        modes.append("metro")
    if "bus" in lower or not modes:
        modes.append("bus")
    if "ferry" in lower:
        modes.append("ferry")
    return ",".join(dict.fromkeys(modes))


def _bbox_from_mobility_row(row: dict[str, str]) -> str:
    """Return ``"min_lon,min_lat,max_lon,max_lat"`` or ``""`` if any bound is missing."""
    min_lat = _value(row, "location.bounding_box.minimum_latitude")
    max_lat = _value(row, "location.bounding_box.maximum_latitude")
    min_lon = _value(row, "location.bounding_box.minimum_longitude")
    max_lon = _value(row, "location.bounding_box.maximum_longitude")
    if not all([min_lat, max_lat, min_lon, max_lon]):
        return ""
    return f"{min_lon},{min_lat},{max_lon},{max_lat}"


def _normalize_countries(countries: Iterable[str] | None) -> list[str] | None:
    """Normalise a country selection.

    Returns:
        A copy of ``DEFAULT_DISCOVERY_COUNTRIES`` for ``None``; ``None`` (no
        filter) when any entry is ``"ALL"``; otherwise the stripped,
        upper-cased, non-empty entries.
    """
    if countries is None:
        # Copy so callers cannot mutate the module-level default.
        return list(DEFAULT_DISCOVERY_COUNTRIES)
    normalized = [country.strip().upper() for country in countries if country and country.strip()]
    if any(country == "ALL" for country in normalized):
        return None
    return normalized


def _normalize_url_key(url: str) -> str:
    """Return ``url`` with lower-cased scheme/host and no trailing slash or fragment."""
    parsed = urlparse(url.strip())
    scheme = parsed.scheme.lower()
    netloc = parsed.netloc.lower()
    path = parsed.path.rstrip("/")
    query = parsed.query
    return f"{scheme}://{netloc}{path}" + (f"?{query}" if query else "")


def _write_csv(path: Path, headers: list[str], rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV, ignoring keys not in ``headers``."""
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=headers, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)


def _count_by(items: Iterable[FeedCandidate], key_fn: Callable[[FeedCandidate], str]) -> dict[str, int]:
    """Count ``items`` per ``key_fn(item)`` and return the counts sorted by key."""
    counts: dict[str, int] = {}
    for item in items:
        key = key_fn(item)
        counts[key] = counts.get(key, 0) + 1
    return dict(sorted(counts.items()))


def _value(row: dict[str, str], key: str) -> str:
    """Return the cleaned value of ``row[key]``, or ``""`` when absent."""
    return _clean_text(row.get(key, ""))


def _string(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to ``""``."""
    return "" if value is None else str(value)


def _truncate(value: str, length: int) -> str:
    """Return at most the first ``length`` characters of ``value``."""
    return value[:length] if value else ""