# meubility-workbench/app/feed_discovery.py

from __future__ import annotations

import csv
import hashlib
import io
import json
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from typing import Iterable
from urllib.parse import parse_qs, urljoin, urlparse

import requests


# Default CSV catalog fetched by fetch_mobility_database_candidates().
MOBILITY_DATABASE_FEEDS_URL = "https://files.mobilitydatabase.org/feeds_v2.csv"
# Default CSV list fetched by fetch_mobility_acceptance_candidates().
MOBILITY_DATABASE_ACCEPTANCE_TEST_URL = (
    "https://raw.githubusercontent.com/MobilityData/gtfs-validator/master/"
    "scripts/mobility-database-harvester/acceptance_test_feed_list.csv"
)
# PTNA index page scanned for country links by discover_ptna_country_codes().
PTNA_GTFS_INDEX_URL = "https://ptna.openstreetmap.de/gtfs/index.html"
# Per-country PTNA page; "{country}" is filled with a country code.
PTNA_COUNTRY_URL_TEMPLATE = "https://ptna.openstreetmap.de/gtfs/{country}/index.php"

# Countries used when the caller does not pass an explicit country list.
DEFAULT_DISCOVERY_COUNTRIES = ["DE", "AT", "CH", "NL", "DK", "FR", "BE", "LU", "NO", "SE", "FI", "IE", "GB"]
# Country order used when ranking and filling the test-run selection.
CURATED_TEST_COUNTRIES = ["DE", "CH", "AT", "NL", "DK", "FI", "NO", "SE", "IE", "GB", "FR", "BE", "LU"]
# Column order of the ingestable and test-run CSV files (see FeedCandidate.ingestable_row).
DIRECT_INGEST_HEADERS = ["name", "kind", "url", "country", "license", "mode_scope", "source_basis", "priority", "notes"]
# Column order of the full candidates CSV. Every name except "candidate_id"
# is read from the FeedCandidate attribute of the same name; "candidate_id"
# is computed by FeedCandidate.candidate_id().
CANONICAL_HEADERS = [
    "candidate_id",
    "discovery_source",
    "country",
    "subdivision",
    "provider",
    "feed_name",
    "stable_id",
    "ptna_feed_id",
    "data_type",
    "status",
    "is_official",
    "selected_url",
    "direct_download_url",
    "latest_url",
    "original_release_url",
    "license_url",
    "license_text",
    "osm_license_text",
    "details_url",
    "routes_url",
    "valid_from",
    "valid_to",
    "release_date",
    "feed_version",
    "bbox",
    "features",
    "priority",
    "availability_status",
    "http_status",
    "content_type",
    "content_length",
    "final_url",
    "source_basis",
    "notes",
]


@dataclass
class FeedCandidate:
    """A single GTFS feed found by one (or, after merging, several) discovery sources.

    All scalar attributes are plain strings so that a candidate can be written
    to CSV without further conversion; an empty string means "not known".
    """

    # Name of the source that produced the candidate (joined with "; " on merge).
    discovery_source: str
    country: str = ""
    subdivision: str = ""
    provider: str = ""
    feed_name: str = ""
    # Catalog identifier (the "id" / "stable_id" CSV column) when available.
    stable_id: str = ""
    # Value of the "feed" query parameter on PTNA links.
    ptna_feed_id: str = ""
    data_type: str = "gtfs"
    status: str = ""
    is_official: str = ""
    # URL that downstream ingestion should download.
    selected_url: str = ""
    direct_download_url: str = ""
    latest_url: str = ""
    original_release_url: str = ""
    license_url: str = ""
    license_text: str = ""
    osm_license_text: str = ""
    details_url: str = ""
    routes_url: str = ""
    valid_from: str = ""
    valid_to: str = ""
    release_date: str = ""
    feed_version: str = ""
    bbox: str = ""
    features: str = ""
    priority: str = ""
    # Filled in by annotate_url_availability().
    availability_status: str = "unchecked"
    http_status: str = ""
    content_type: str = ""
    content_length: str = ""
    final_url: str = ""
    source_basis: str = ""
    notes: str = ""
    evidence_sources: list[str] = field(default_factory=list)

    def key(self) -> str:
        """Return the primary identity key used to de-duplicate candidates.

        Preference order: stable id, normalised selected URL, PTNA feed id,
        and finally a SHA-256 digest of the full CSV row.
        """
        if self.stable_id:
            return f"stable:{self.stable_id}"
        if self.selected_url:
            return f"url:{_normalize_url_key(self.selected_url)}"
        if self.ptna_feed_id:
            return f"ptna:{self.ptna_feed_id}"
        serialized = json.dumps(self.row(), sort_keys=True)
        return "hash:" + hashlib.sha256(serialized.encode("utf-8")).hexdigest()

    def candidate_id(self) -> str:
        """Return a 16-hex-character id derived from the identifying fields."""
        identifying = (
            self.discovery_source,
            self.country,
            self.stable_id,
            self.ptna_feed_id,
            self.selected_url,
            self.provider,
            self.feed_name,
        )
        digest = hashlib.sha256("|".join(identifying).encode("utf-8"))
        return digest.hexdigest()[:16]

    def row(self) -> dict[str, str]:
        """Return the candidate as a mapping keyed by ``CANONICAL_HEADERS``."""
        payload: dict[str, str] = {}
        for header in CANONICAL_HEADERS:
            if header == "candidate_id":
                continue
            payload[header] = _string(getattr(self, header, ""))
        # The id is computed rather than stored, so it is added last.
        payload["candidate_id"] = self.candidate_id()
        return payload

    def ingestable_row(self) -> dict[str, str]:
        """Return the candidate as a mapping keyed by ``DIRECT_INGEST_HEADERS``."""
        if self.license_text:
            license_value = self.license_text
        elif self.license_url:
            license_value = f"see {self.license_url}"
        else:
            license_value = ""

        basis_parts = [self.source_basis or self.discovery_source]
        if self.details_url:
            basis_parts.append(f"details: {self.details_url}")
        release_differs = self.original_release_url != self.selected_url
        if self.original_release_url and release_differs:
            basis_parts.append(f"release: {self.original_release_url}")
        source_basis = "; ".join(part for part in basis_parts if part)

        notes = self.notes or ""
        if self.latest_url and self.latest_url != self.selected_url:
            notes = _join_notes(notes, f"Mobility Database mirror: {self.latest_url}")
        if self.osm_license_text:
            notes = _join_notes(notes, f"OSM permission note: {_truncate(self.osm_license_text, 240)}")

        display_name = _feed_source_name(self.country, self.provider or self.feed_name)
        return {
            "name": _truncate(display_name, 240),
            "kind": "gtfs",
            "url": self.selected_url,
            "country": self.country,
            "license": _truncate(license_value, 240),
            "mode_scope": _mode_scope_from_features(self.features),
            "source_basis": _truncate(source_basis, 500),
            "priority": self.priority or _candidate_priority(self),
            "notes": _truncate(notes, 1200),
        }


def default_generated_dir() -> Path:
    """Return the ``docs/generated`` directory one level above this file's folder."""
    project_root = Path(__file__).resolve().parents[1]
    return project_root.joinpath("docs", "generated")


def build_gtfs_discovery_manifests(
    *,
    output_dir: Path | str | None = None,
    countries: Iterable[str] | None = None,
    include_mobility_database: bool = True,
    include_acceptance_test_list: bool = True,
    include_ptna: bool = True,
    max_ptna_details: int = 80,
    test_limit: int = 24,
    check_urls: bool = False,
    timeout: float = 30.0,
) -> dict[str, object]:
    """Discover GTFS feeds, merge duplicates and write the manifest files.

    Four files are written into ``output_dir`` (``default_generated_dir()``
    when omitted): the full candidate CSV, the ingestable CSV, the test-run
    CSV and a JSON report.

    Args:
        output_dir: Target directory; created if missing.
        countries: Country codes to keep; normalised by ``_normalize_countries``.
        include_mobility_database: Fetch the Mobility Database catalog.
        include_acceptance_test_list: Fetch the validator acceptance-test list.
        include_ptna: Scrape the PTNA country pages.
        max_ptna_details: Passed to ``fetch_ptna_candidates`` as ``max_details``.
        test_limit: Maximum number of feeds in the test-run manifest.
        check_urls: Probe every ingestable URL (each probe is capped at 12 s).
        timeout: Network timeout in seconds for the discovery fetches.

    Returns:
        The report dictionary that was also written as JSON.
    """
    selected_countries = _normalize_countries(countries)
    if output_dir is None:
        out_dir = default_generated_dir()
    else:
        out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # The curated seed goes first so its entries are seen first when merging.
    discovered: list[FeedCandidate] = list(load_curated_ingestable_seed(countries=selected_countries))
    if include_mobility_database:
        discovered += fetch_mobility_database_candidates(countries=selected_countries, timeout=timeout)
    if include_acceptance_test_list:
        discovered += fetch_mobility_acceptance_candidates(countries=selected_countries, timeout=timeout)
    if include_ptna:
        discovered += fetch_ptna_candidates(
            countries=selected_countries,
            max_details=max_ptna_details,
            timeout=timeout,
        )

    merged = merge_candidates(discovered)
    ingestable = [feed for feed in merged if feed.selected_url and feed.data_type == "gtfs"]
    if check_urls:
        probe_timeout = min(timeout, 12.0)
        for feed in ingestable:
            annotate_url_availability(feed, timeout=probe_timeout)
    test_run = select_test_run_candidates(ingestable, limit=test_limit)

    candidates_path = out_dir / "gtfs_feed_candidates.csv"
    ingestable_path = out_dir / "gtfs_ingestable_sources.csv"
    test_path = out_dir / "gtfs_test_run_sources.csv"
    report_path = out_dir / "gtfs_discovery_report.json"

    exports = (
        (candidates_path, CANONICAL_HEADERS, [feed.row() for feed in merged]),
        (ingestable_path, DIRECT_INGEST_HEADERS, [feed.ingestable_row() for feed in ingestable]),
        (test_path, DIRECT_INGEST_HEADERS, [feed.ingestable_row() for feed in test_run]),
    )
    for csv_path, headers, rows in exports:
        _write_csv(csv_path, headers, rows)

    sources = {
        "mobility_database": MOBILITY_DATABASE_FEEDS_URL if include_mobility_database else None,
        "mobility_acceptance_test_list": MOBILITY_DATABASE_ACCEPTANCE_TEST_URL if include_acceptance_test_list else None,
        "ptna": PTNA_GTFS_INDEX_URL if include_ptna else None,
    }
    counts = {
        "candidates": len(merged),
        "ingestable": len(ingestable),
        "test_run": len(test_run),
        "by_source": _count_by(merged, lambda item: item.discovery_source),
        "ingestable_by_country": _count_by(ingestable, lambda item: item.country or "unknown"),
    }
    files = {
        "candidates": str(candidates_path),
        "ingestable": str(ingestable_path),
        "test_run": str(test_path),
    }
    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "countries": selected_countries or "all",
        "sources": sources,
        "counts": counts,
        "files": files,
    }
    serialized = json.dumps(report, indent=2, ensure_ascii=False)
    report_path.write_text(serialized + "\n", encoding="utf-8")
    return report


def fetch_mobility_database_candidates(
    *,
    countries: list[str] | None = None,
    timeout: float = 30.0,
    url: str = MOBILITY_DATABASE_FEEDS_URL,
) -> list[FeedCandidate]:
    """Download the Mobility Database catalog CSV and convert its GTFS rows.

    Args:
        countries: Upper-case country codes to keep; a falsy value keeps all.
        timeout: Network timeout in seconds.
        url: Location of the catalog CSV.

    Returns:
        One candidate per GTFS row that passes the country filter.

    Raises:
        requests.RequestException: If the catalog cannot be downloaded.
    """
    text = _fetch_text(url, timeout=timeout)
    # Hand the csv module an untranslated text stream instead of
    # ``text.splitlines()``: splitlines() strips the newlines that belong to
    # quoted multi-line fields and also breaks rows on characters such as
    # U+2028 or NEL that are not CSV record separators.
    rows = csv.DictReader(io.StringIO(text, newline=""))
    candidates: list[FeedCandidate] = []
    for row in rows:
        if _value(row, "data_type").lower() != "gtfs":
            continue
        country = _value(row, "location.country_code").upper()
        if countries and country not in countries:
            continue
        direct_url = _normalize_feed_url(_value(row, "urls.direct_download"))
        latest_url = _normalize_feed_url(_value(row, "urls.latest"))
        selected_url = _choose_feed_url(direct_url, latest_url)
        candidate = FeedCandidate(
            discovery_source="mobility_database",
            country=country,
            subdivision=_value(row, "location.subdivision_name"),
            provider=_value(row, "provider"),
            feed_name=_value(row, "name"),
            stable_id=_value(row, "id"),
            data_type="gtfs",
            status=_value(row, "status"),
            is_official=_value(row, "is_official"),
            selected_url=selected_url,
            direct_download_url=direct_url,
            latest_url=latest_url,
            license_url=_value(row, "urls.license"),
            bbox=_bbox_from_mobility_row(row),
            features=_value(row, "features"),
            source_basis="Mobility Database feed catalog",
            notes=_value(row, "note"),
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        # Priority is derived last because the override above may change URLs.
        candidate.priority = _candidate_priority(candidate)
        candidates.append(candidate)
    return candidates


def fetch_mobility_acceptance_candidates(
    *,
    countries: list[str] | None = None,
    timeout: float = 30.0,
    url: str = MOBILITY_DATABASE_ACCEPTANCE_TEST_URL,
) -> list[FeedCandidate]:
    """Download the validator acceptance-test feed list and convert its rows.

    Rows without a ``urls.latest`` value are skipped. Every candidate gets the
    fixed status ``acceptance_test`` and priority ``P3``.

    Args:
        countries: Upper-case country codes to keep; a falsy value keeps all.
        timeout: Network timeout in seconds.
        url: Location of the acceptance-test CSV.

    Returns:
        One candidate per usable row.

    Raises:
        requests.RequestException: If the list cannot be downloaded.
    """
    text = _fetch_text(url, timeout=timeout)
    # Hand the csv module an untranslated text stream instead of
    # ``text.splitlines()``: splitlines() strips the newlines that belong to
    # quoted multi-line fields and also breaks rows on characters such as
    # U+2028 or NEL that are not CSV record separators.
    rows = csv.DictReader(io.StringIO(text, newline=""))
    candidates: list[FeedCandidate] = []
    for row in rows:
        country = _value(row, "country_code").upper()
        if countries and country not in countries:
            continue
        latest_url = _normalize_feed_url(_value(row, "urls.latest"))
        if not latest_url:
            continue
        candidate = FeedCandidate(
            discovery_source="mobility_validator_acceptance",
            country=country,
            subdivision=_value(row, "subdivision_name"),
            provider=_value(row, "provider"),
            # The provider column doubles as the feed name for this source.
            feed_name=_value(row, "provider"),
            stable_id=_value(row, "stable_id"),
            status="acceptance_test",
            selected_url=latest_url,
            latest_url=latest_url,
            source_basis="MobilityData validator acceptance-test feed list",
            notes="Useful smoke-test feed list; prefer Mobility Database feeds_v2 metadata for production source review.",
            priority="P3",
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        candidates.append(candidate)
    return candidates


def fetch_ptna_candidates(
    *,
    countries: list[str] | None = None,
    max_details: int = 80,
    timeout: float = 30.0,
) -> list[FeedCandidate]:
    """Scrape PTNA country pages (and a bounded number of detail pages).

    Country pages that cannot be fetched are skipped. A detail page that
    cannot be fetched leaves the candidate in place with an explanatory note.

    Args:
        countries: Country codes to scrape; a falsy value falls back to
            ``DEFAULT_DISCOVERY_COUNTRIES``.
        max_details: Upper bound on detail-page requests across all countries.
            Failed requests count too, so an unreachable host cannot trigger
            one timeout per feed.
        timeout: Network timeout in seconds for each request.

    Returns:
        All candidates parsed from the reachable country pages.
    """
    country_codes = countries or DEFAULT_DISCOVERY_COUNTRIES
    # NOTE(review): this branch only runs when DEFAULT_DISCOVERY_COUNTRIES is
    # empty; confirm whether countries=None was meant to trigger discovery.
    if not country_codes:
        country_codes = discover_ptna_country_codes(timeout=timeout)
    candidates: list[FeedCandidate] = []
    detail_attempts = 0
    for country in country_codes:
        country_url = PTNA_COUNTRY_URL_TEMPLATE.format(country=country)
        try:
            html = _fetch_text(country_url, timeout=timeout)
        except requests.RequestException:
            continue
        for candidate in parse_ptna_country_page(html, country=country, page_url=country_url):
            if candidate.details_url and detail_attempts < max_details:
                # Charge the budget before the request so failures are bounded.
                detail_attempts += 1
                try:
                    detail_html = _fetch_text(candidate.details_url, timeout=timeout)
                except requests.RequestException:
                    candidate.notes = _join_notes(candidate.notes, "PTNA detail page could not be fetched during discovery.")
                else:
                    enrich_ptna_candidate_from_details(candidate, detail_html, candidate.details_url)
            candidate.priority = _candidate_priority(candidate)
            candidates.append(candidate)
    return candidates


def discover_ptna_country_codes(*, timeout: float = 30.0) -> list[str]:
    """Return the two-letter country codes linked from the PTNA GTFS index.

    Codes are returned in first-seen order without duplicates.

    Raises:
        requests.RequestException: If the index page cannot be downloaded.
    """
    index_html = _fetch_text(PTNA_GTFS_INDEX_URL, timeout=timeout)
    country_path = re.compile(r"/gtfs/([A-Z]{2})/index\.php$")
    # A dict keeps insertion order, giving an ordered de-duplication.
    seen: dict[str, None] = {}
    for link in _all_links(index_html, PTNA_GTFS_INDEX_URL):
        hit = country_path.search(urlparse(link).path)
        if hit:
            seen.setdefault(hit.group(1), None)
    return list(seen)


def parse_ptna_country_page(html: str, *, country: str, page_url: str) -> list[FeedCandidate]:
    """Turn the table rows of a PTNA country page into candidates.

    A row is used only when it links to ``routes.php?feed=`` or
    ``gtfs-details.php?feed=`` and a feed id can be read from that link.
    Cells are read by position: 1 feed name, 2 provider, 3 valid from,
    4 valid to, 5 feed version, 6 release date (and release link).

    Args:
        html: Markup of the country page.
        country: Country code stored on every candidate.
        page_url: Base URL used to resolve relative links.
    """
    candidates: list[FeedCandidate] = []
    for row in _parse_table_rows(html, page_url):
        cells = row.cells
        row_links = [link for cell in cells for link in cell.links]
        routes_url = _first_link_matching(row_links, "routes.php?feed=")
        details_url = _first_link_matching(row_links, "gtfs-details.php?feed=")
        if not (routes_url or details_url):
            continue
        feed_id = _feed_id_from_url(routes_url or details_url)
        if not feed_id:
            continue

        def text_at(position: int, fallback: str = "") -> str:
            # Short rows simply yield the fallback for missing columns.
            return cells[position].text if len(cells) > position else fallback

        raw_release = cells[6].first_external_link if len(cells) > 6 else ""
        release_link = _normalize_feed_url(raw_release)
        # Only a link that looks like a download is used as the feed URL.
        direct_url = release_link if _looks_like_download_url(release_link) else ""
        candidate = FeedCandidate(
            discovery_source="ptna",
            country=country,
            provider=text_at(2),
            feed_name=text_at(1, feed_id),
            ptna_feed_id=feed_id,
            selected_url=direct_url,
            direct_download_url=direct_url,
            original_release_url=release_link,
            details_url=details_url,
            routes_url=routes_url,
            valid_from=text_at(3),
            valid_to=text_at(4),
            feed_version=text_at(5),
            release_date=text_at(6),
            source_basis="PTNA GTFS analysis",
            notes="PTNA candidate; use original publisher URL where available.",
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        candidates.append(candidate)
    return candidates


def enrich_ptna_candidate_from_details(candidate: FeedCandidate, html: str, page_url: str) -> None:
    """Update ``candidate`` in place with values from its PTNA detail page.

    Values found on the page win; fields missing from the page keep their
    current value. If the candidate has no selected URL yet and the release
    URL looks like a download, it becomes the selected and direct URL.
    """
    fields = parse_ptna_detail_fields(html, page_url)

    release = fields.get("release url href") or fields.get("release url") or candidate.original_release_url
    candidate.original_release_url = _normalize_feed_url(release)

    # (attribute name, detail-page label) pairs copied when the label exists.
    copied = (
        ("license_url", "publisher's license href"),
        ("license_text", "publisher's license"),
        ("osm_license_text", "license given for use in osm"),
        ("valid_from", "feed start date"),
        ("valid_to", "feed end date"),
        ("feed_version", "feed version"),
        ("release_date", "release date"),
    )
    for attribute, label in copied:
        setattr(candidate, attribute, fields.get(label) or getattr(candidate, attribute))

    network_guid = fields.get('"network:guid"')
    if network_guid:
        candidate.notes = _join_notes(candidate.notes, f"PTNA network:guid={network_guid}")

    release_is_download = _looks_like_download_url(candidate.original_release_url)
    if not candidate.selected_url and release_is_download:
        candidate.selected_url = _normalize_feed_url(candidate.original_release_url)
        candidate.direct_download_url = candidate.selected_url
    normalize_candidate_geography(candidate)


def parse_ptna_detail_fields(html: str, page_url: str) -> dict[str, str]:
    """Read a PTNA detail page as a ``label -> value`` mapping.

    The first cell of each table row is the lower-cased label and the second
    cell is the value. When the value cell has an external link, that link is
    stored under ``"<label> href"``. Later rows overwrite earlier ones with
    the same label.
    """
    parsed: dict[str, str] = {}
    for row in _parse_table_rows(html, page_url):
        if len(row.cells) < 2:
            continue
        label_cell, value_cell = row.cells[0], row.cells[1]
        label = _clean_text(label_cell.text).lower()
        if not label:
            continue
        parsed[label] = _clean_text(value_cell.text)
        external = value_cell.first_external_link
        if external:
            parsed[f"{label} href"] = external
    return parsed


def load_curated_ingestable_seed(
    *,
    countries: list[str] | None = None,
    path: Path | str | None = None,
) -> list[FeedCandidate]:
    """Load hand-curated GTFS sources from the seed CSV.

    Args:
        countries: Upper-case country codes to keep; a falsy value keeps all.
            Rows whose country is ``EU`` are always kept.
        path: Seed file; defaults to ``docs/ingestable_sources_seed.csv`` one
            level above this file's folder.

    Returns:
        The curated candidates, or an empty list when the file is missing.
    """
    if path is None:
        seed_path = Path(__file__).resolve().parents[1] / "docs" / "ingestable_sources_seed.csv"
    else:
        seed_path = Path(path)
    if not seed_path.exists():
        return []

    candidates: list[FeedCandidate] = []
    # utf-8-sig tolerates a byte-order mark at the start of the file.
    with seed_path.open("r", encoding="utf-8-sig", newline="") as handle:
        for row in csv.DictReader(handle):
            if _value(row, "kind").lower() != "gtfs":
                continue
            country = _value(row, "country").upper()
            excluded = bool(countries) and country not in countries and country != "EU"
            if excluded:
                continue
            display_name = _value(row, "name")
            feed_url = _normalize_feed_url(_value(row, "url"))
            candidate = FeedCandidate(
                discovery_source="curated_seed",
                country=country,
                provider=display_name.removesuffix(" GTFS"),
                feed_name=display_name,
                selected_url=feed_url,
                direct_download_url=feed_url,
                license_text=_value(row, "license"),
                features=_value(row, "mode_scope"),
                priority=_value(row, "priority"),
                source_basis=_value(row, "source_basis") or "curated seed",
                notes=_value(row, "notes"),
            )
            normalize_candidate_geography(candidate)
            apply_known_download_overrides(candidate)
            candidates.append(candidate)
    return candidates


def merge_candidates(candidates: Iterable[FeedCandidate]) -> list[FeedCandidate]:
    """Collapse candidates that share any alias key into one record.

    The first candidate of a group is kept and later matches are folded into
    it with ``_merge_candidate``. The result is sorted by priority, country,
    provider and feed name.
    """
    groups: dict[str, FeedCandidate] = {}
    alias_index: dict[str, str] = {}
    for incoming in candidates:
        aliases = _candidate_alias_keys(incoming)
        # The first alias that is already registered decides the group.
        owner_key = None
        for alias in aliases:
            if alias in alias_index:
                owner_key = alias_index[alias]
                break
        owner = groups.get(owner_key) if owner_key is not None else None
        if owner is not None:
            _merge_candidate(owner, incoming)
            target_key = owner_key or aliases[0]
        else:
            target_key = aliases[0]
            groups[target_key] = incoming
        # Every alias of the incoming candidate now points at the group.
        for alias in aliases:
            alias_index[alias] = target_key

    def ordering(item: FeedCandidate) -> tuple[int, str, str, str]:
        return (
            _priority_sort_key(item.priority),
            item.country,
            item.provider.lower(),
            item.feed_name.lower(),
        )

    return sorted(groups.values(), key=ordering)


def select_test_run_candidates(candidates: Iterable[FeedCandidate], *, limit: int = 24) -> list[FeedCandidate]:
    """Pick at most ``limit`` feeds for a smoke-test run.

    Acceptance-list-only and ineligible candidates are dropped, the rest is
    ordered by ``_test_candidate_sort_key`` and selected in three passes:
    feeds matching a preferred token, then feeds per curated country, then
    anything left. A URL is selected once, and each country is capped at
    3 feeds (7 for ``DE`` during the preferred pass).
    """
    pool = [
        candidate
        for candidate in candidates
        if candidate.discovery_source != "mobility_validator_acceptance" and _test_candidate_eligible(candidate)
    ]
    pool.sort(key=_test_candidate_sort_key)

    chosen: list[FeedCandidate] = []
    taken_urls: set[str] = set()
    country_counts: dict[str, int] = {}

    def is_full() -> bool:
        return len(chosen) >= limit

    def add(candidate: FeedCandidate, *, force: bool = False) -> None:
        if is_full():
            return
        url_key = _normalize_url_key(candidate.selected_url)
        if not candidate.selected_url or url_key in taken_urls:
            return
        country = candidate.country or "unknown"
        # Only the preferred pass may exceed the normal per-country cap.
        cap = 7 if force and country == "DE" else 3
        already = country_counts.get(country, 0)
        if already >= cap:
            return
        chosen.append(candidate)
        taken_urls.add(url_key)
        country_counts[country] = already + 1

    preferred_tokens = (
        "opendata-oepnv.de",
        "download.gtfs.de/germany/",
        "vbb.de/vbbgtfs",
        "rnv-online.de",
        "vrn.de",
        "gtfs.geops.ch",
        "wienerlinien.at",
        "gtfs.openov.nl",
        "gtfs.ovapi.nl",
        "rejseplanen.info",
        "dev.hsl.fi/gtfs",
        "hsldev.com/gtfs",
        "rb_norway-aggregated-gtfs",
        "data.bus-data.dft.gov.uk",
        "transportforireland",
        "gtfs.irail.be/de-lijn",
    )

    # Pass 1: well-known publishers.
    for candidate in pool:
        searchable = " ".join(
            [candidate.provider, candidate.feed_name, candidate.source_basis, candidate.selected_url]
        ).lower()
        if any(token in searchable for token in preferred_tokens):
            add(candidate, force=True)

    # Pass 2: walk the curated countries in their configured order.
    for country in CURATED_TEST_COUNTRIES:
        if is_full():
            break
        for candidate in pool:
            if is_full():
                break
            if candidate.country == country:
                add(candidate)

    # Pass 3: fill what is left from the whole pool.
    for candidate in pool:
        if is_full():
            break
        add(candidate)
    return chosen


def _test_candidate_eligible(candidate: FeedCandidate) -> bool:
    """Return True when a candidate may appear in the test-run manifest.

    It needs a selected URL and a priority of P2 or better, and its status,
    URL, provider, feed name and notes must not contain a marker for a
    deprecated, inactive or access-restricted feed.
    """
    if not candidate.selected_url or _priority_sort_key(candidate.priority) > 2:
        return False
    searchable = " ".join(
        [candidate.status, candidate.selected_url, candidate.provider, candidate.feed_name, candidate.notes]
    ).lower()
    blockers = ("deprecated", "inactive", "{apikey}", "registration required", "authentication")
    return not any(marker in searchable for marker in blockers)


def annotate_url_availability(candidate: FeedCandidate, *, timeout: float = 10.0) -> FeedCandidate:
    """Probe ``candidate.selected_url`` and record the outcome on the candidate.

    A HEAD request is tried first. If it answers 403, 405 or any 5xx status,
    a streamed GET asking for a single byte is sent instead. Status code,
    content type, content length and final URL are copied onto the candidate.
    Request failures are recorded in the notes instead of being raised.

    Args:
        candidate: Candidate to probe; modified in place.
        timeout: Timeout in seconds for each request.

    Returns:
        The same candidate, for call chaining.
    """
    if not candidate.selected_url:
        candidate.availability_status = "missing_url"
        return candidate
    headers = {"User-Agent": "meubility-workbench-feed-discovery/0.1"}
    response = None
    try:
        response = requests.head(candidate.selected_url, allow_redirects=True, timeout=timeout, headers=headers)
        if response.status_code in {405, 403} or response.status_code >= 500:
            # Release the HEAD connection before it is replaced by the GET.
            response.close()
            response = requests.get(
                candidate.selected_url,
                allow_redirects=True,
                timeout=timeout,
                headers={**headers, "Range": "bytes=0-0"},
                stream=True,
            )
        candidate.http_status = str(response.status_code)
        candidate.content_type = response.headers.get("content-type", "")
        candidate.content_length = response.headers.get("content-length", "")
        candidate.final_url = response.url
        candidate.availability_status = "ok" if response.status_code < 400 else "error"
    except requests.RequestException as exc:
        candidate.availability_status = "error"
        candidate.notes = _join_notes(candidate.notes, f"Availability check failed: {exc}")
    finally:
        # A streamed response holds its connection until closed, so close it
        # on every path, including when an exception interrupts the block.
        if response is not None:
            response.close()
    return candidate


def normalize_candidate_geography(candidate: FeedCandidate) -> None:
    """Override ``candidate.country`` for feeds recognisable by URL or name.

    The URLs, provider, feed name and source basis are searched
    case-insensitively; the first matching rule wins and no match leaves the
    country untouched.
    """
    searchable = " ".join(
        (
            candidate.selected_url,
            candidate.direct_download_url,
            candidate.latest_url,
            candidate.original_release_url,
            candidate.provider,
            candidate.feed_name,
            candidate.source_basis,
        )
    ).lower()
    # Ordered (markers, country) rules; earlier rules take precedence.
    rules = (
        (("download.gtfs.de/germany/", "gtfs for germany"), "DE"),
        (("storage.googleapis.com/marduk-production/outbound/gtfs/rb_norway",), "NO"),
        (("gtfs.ovapi.nl", "openov.nl"), "NL"),
        (("www.nvbw.de/fileadmin/user_upload/service/open_data/",), "DE"),
    )
    for markers, country_code in rules:
        if any(marker in searchable for marker in markers):
            candidate.country = country_code
            break


def apply_known_download_overrides(candidate: FeedCandidate) -> None:
    """Switch listed stable ids to their ``latest_url`` and note why.

    Nothing happens for other stable ids or when no ``latest_url`` is set.
    """
    known_stale = ("mdb-684", "mdb-777")
    if not candidate.latest_url or candidate.stable_id not in known_stale:
        return
    candidate.selected_url = candidate.latest_url
    explanation = "Selected Mobility Database latest.zip mirror because the catalog direct URL is known to be stale."
    candidate.notes = _join_notes(candidate.notes, explanation)


@dataclass
class _HtmlCell:
    """Text and resolved link targets collected from one table cell."""

    text: str = ""
    links: list[str] = field(default_factory=list)

    @property
    def first_external_link(self) -> str:
        """Return the first http(s) link whose host is not PTNA, or ``""``."""

        def is_external(link: str) -> bool:
            parts = urlparse(link)
            return parts.scheme in {"http", "https"} and "ptna.openstreetmap.de" not in parts.netloc

        return next((link for link in self.links if is_external(link)), "")


@dataclass
class _HtmlRow:
    """One parsed table row: its cells in document order."""

    cells: list[_HtmlCell] = field(default_factory=list)


class _TableParser(HTMLParser):
    """Collect the rows, cells and links of every table in an HTML document.

    Rows that end up with at least one cell are appended to ``rows``. HTML
    allows ``</td>``, ``</th>`` and ``</tr>`` to be omitted, so an open cell
    or row is also completed when the next cell or row starts, when the table
    ends and when ``close()`` is called.
    """

    def __init__(self, base_url: str):
        super().__init__(convert_charrefs=True)
        # Base used to resolve relative href values.
        self.base_url = base_url
        self.rows: list[_HtmlRow] = []
        self._row: _HtmlRow | None = None
        self._cell: _HtmlCell | None = None
        self._active_link: str = ""

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Open rows and cells, and record link targets inside a cell."""
        if tag == "tr":
            # A new row implicitly ends a row whose </tr> was omitted.
            self._finish_row()
            self._row = _HtmlRow()
        elif tag in {"td", "th"} and self._row is not None:
            # A new cell implicitly ends a cell whose end tag was omitted.
            self._finish_cell()
            self._cell = _HtmlCell()
        elif tag == "a" and self._cell is not None:
            attrs_dict = {key: value or "" for key, value in attrs}
            href = attrs_dict.get("href", "")
            if href:
                self._active_link = urljoin(self.base_url, href)
                self._cell.links.append(self._active_link)

    def handle_endtag(self, tag: str) -> None:
        """Close the cell or row that the end tag belongs to."""
        if tag in {"td", "th"}:
            self._finish_cell()
        elif tag == "a":
            self._active_link = ""
        elif tag in {"tr", "table"}:
            self._finish_row()

    def handle_data(self, data: str) -> None:
        """Append text to the cell that is currently open, if any."""
        if self._cell is not None:
            self._cell.text += data

    def close(self) -> None:
        """Flush buffered input, then complete a row left open at end of input."""
        super().close()
        self._finish_row()

    def _finish_cell(self) -> None:
        """Store the open cell (with cleaned text) on the open row."""
        if self._row is not None and self._cell is not None:
            self._cell.text = _clean_text(self._cell.text)
            self._row.cells.append(self._cell)
        self._cell = None
        self._active_link = ""

    def _finish_row(self) -> None:
        """Complete the open cell, then keep the open row if it has cells."""
        self._finish_cell()
        if self._row is not None and self._row.cells:
            self.rows.append(self._row)
        self._row = None


class _LinkParser(HTMLParser):
    """Collect the resolved ``href`` target of every anchor in a document."""

    def __init__(self, base_url: str):
        super().__init__(convert_charrefs=True)
        # Base used to resolve relative href values.
        self.base_url = base_url
        self.links: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record non-empty href values of ``<a>`` tags as absolute URLs."""
        if tag == "a":
            self.links.extend(
                urljoin(self.base_url, target)
                for name, target in attrs
                if name == "href" and target
            )


def _parse_table_rows(html: str, base_url: str) -> list[_HtmlRow]:
    """Parse ``html`` and return every table row that has at least one cell.

    Relative links inside the cells are resolved against ``base_url``.
    """
    parser = _TableParser(base_url)
    parser.feed(html)
    # close() makes the parser process input it is still buffering; without
    # it the end of the document is never treated as the end of the input.
    parser.close()
    return parser.rows


def _all_links(html: str, base_url: str) -> list[str]:
    """Return every anchor target in ``html``, resolved against ``base_url``."""
    collector = _LinkParser(base_url)
    collector.feed(html)
    return collector.links


def _fetch_text(url: str, *, timeout: float) -> str:
    """GET ``url`` and return the decoded body.

    Raises:
        requests.RequestException: On connection problems or an HTTP error
            status (via ``raise_for_status``).
    """
    request_headers = {"User-Agent": "meubility-workbench-feed-discovery/0.1"}
    reply = requests.get(url, timeout=timeout, headers=request_headers)
    reply.raise_for_status()
    return reply.text


def _first_link_matching(links: Iterable[str], needle: str) -> str:
    """Return the first link that contains ``needle``, or ``""`` if none does."""
    return next((link for link in links if needle in link), "")


def _feed_id_from_url(url: str) -> str:
    """Return the first ``feed`` query parameter of ``url``, or ``""``."""
    feed_values = parse_qs(urlparse(url).query).get("feed")
    return feed_values[0] if feed_values else ""


def _looks_like_download_url(url: str) -> bool:
    """Guess whether ``url`` points at a downloadable GTFS feed.

    True for paths ending in ``.zip``, URLs mentioning ``exportformat=gtfs``
    or ``google_transit``, paths ending in ``gtfs``/``current_gtfs`` and any
    ``gtfs`` path on the ``gtfs.ovapi.nl`` host. Matching ignores case.
    """
    if not url:
        return False
    parts = urlparse(url)
    path = parts.path.lower()
    whole = url.lower()
    has_export_marker = "exportformat=gtfs" in whole or "google_transit" in whole
    ends_in_gtfs = path.rstrip("/").endswith(("current_gtfs", "gtfs"))
    on_ovapi = "gtfs.ovapi.nl" in parts.netloc.lower() and "gtfs" in path
    return path.endswith(".zip") or has_export_marker or ends_in_gtfs or on_ovapi


def _normalize_feed_url(url: str) -> str:
    """Clean ``url`` and add ``https://`` when it starts with a bare host name.

    A value that already has a scheme, or whose first path segment contains
    no dot, is returned cleaned but otherwise unchanged.
    """
    cleaned = _clean_text(url)
    if not cleaned or urlparse(cleaned).scheme:
        return cleaned
    leading_segment = cleaned.partition("/")[0]
    # A dot in the first segment suggests a host name such as "example.org".
    return f"https://{cleaned}" if "." in leading_segment else cleaned


def _choose_feed_url(direct_url: str, latest_url: str) -> str:
    """Prefer the direct download URL and fall back to the latest URL."""
    return direct_url or latest_url


def _candidate_priority(candidate: FeedCandidate) -> str:
    """Derive a priority label (``P0`` best) from source, status and URLs.

    Curated entries keep their own priority (default ``P1``). Active feeds
    get ``P0``/``P1`` with a direct download URL (official/unofficial) and
    ``P2`` with only a latest URL. Remaining PTNA feeds get ``P2`` with a
    selected URL and ``P4`` without. Everything else is ``P3``.
    """
    is_active = candidate.status.lower() == "active"
    is_official = candidate.is_official.lower() == "true"
    if candidate.discovery_source == "curated_seed":
        return candidate.priority or "P1"
    if is_active:
        if candidate.direct_download_url:
            return "P0" if is_official else "P1"
        if candidate.latest_url:
            return "P2"
    if candidate.discovery_source == "ptna":
        return "P2" if candidate.selected_url else "P4"
    return "P3"


def _test_candidate_sort_key(candidate: FeedCandidate) -> tuple[int, int, str, str]:
    """Sort key for test-run selection: priority, source/country rank, country, provider.

    Curated-seed entries rank one step ahead of other sources, and countries
    outside ``CURATED_TEST_COUNTRIES`` rank as position 99.
    """
    try:
        country_rank = CURATED_TEST_COUNTRIES.index(candidate.country)
    except ValueError:
        country_rank = 99
    source_rank = 0 if candidate.discovery_source == "curated_seed" else 1
    return (
        _priority_sort_key(candidate.priority),
        source_rank + country_rank,
        candidate.country,
        candidate.provider.lower(),
    )


def _priority_sort_key(priority: str) -> int:
    """Return the number after a leading ``P``; 9 when there is none."""
    found = re.match(r"P(\d+)", priority or "")
    if found is None:
        return 9
    return int(found.group(1))


def _candidate_alias_keys(candidate: FeedCandidate) -> list[str]:
    """List every identity key under which a candidate may match another.

    The primary ``key()`` comes first, followed by the stable id, the
    selected/direct/latest URLs and the PTNA feed id where present.
    Duplicates are removed while keeping first-seen order.
    """
    aliases = [candidate.key()]
    if candidate.stable_id:
        aliases.append(f"stable:{candidate.stable_id}")
    aliases.extend(
        f"url:{_normalize_url_key(url)}"
        for url in (candidate.selected_url, candidate.direct_download_url, candidate.latest_url)
        if url
    )
    if candidate.ptna_feed_id:
        aliases.append(f"ptna:{candidate.ptna_feed_id}")
    # dict.fromkeys keeps the first occurrence of each key, in order.
    return list(dict.fromkeys(aliases))


def _merge_candidate(existing: FeedCandidate, incoming: FeedCandidate) -> None:
    """Fold ``incoming`` into ``existing`` in place.

    A curated-seed ``incoming`` first overwrites the descriptive fields of
    ``existing`` with its own non-empty values. After that, empty fields of
    ``existing`` are filled from ``incoming``, the better priority is kept
    and discovery source, source basis and notes are joined without
    duplicates.
    """
    curated_fields = ("country", "provider", "feed_name", "license_text", "features", "source_basis", "notes")
    if incoming.discovery_source == "curated_seed":
        for name in curated_fields:
            curated_value = getattr(incoming, name, "")
            if curated_value:
                setattr(existing, name, curated_value)

    existing.discovery_source = _join_unique(existing.discovery_source, incoming.discovery_source)

    for name in CANONICAL_HEADERS:
        # candidate_id is computed, not stored, so there is nothing to copy.
        if name == "candidate_id":
            continue
        if getattr(existing, name, ""):
            continue
        filler = getattr(incoming, name, "")
        if filler:
            setattr(existing, name, filler)

    existing.priority = _better_priority(existing.priority, incoming.priority)
    existing.source_basis = _join_unique(existing.source_basis, incoming.source_basis)
    existing.notes = _join_notes(existing.notes, incoming.notes)


def _better_priority(left: str, right: str) -> str:
    """Return the label with the lower priority number; ``left`` wins ties."""
    if _priority_sort_key(left) <= _priority_sort_key(right):
        return left
    return right


def _join_unique(left: str, right: str) -> str:
    """Merge two ``;``-separated lists into one, dropping blanks and repeats.

    Items are stripped, kept in first-seen order and joined with ``"; "``.
    """
    stripped = (
        chunk.strip()
        for value in (left, right)
        for chunk in value.split(";")
    )
    # dict.fromkeys keeps the first occurrence of each item, in order.
    return "; ".join(dict.fromkeys(item for item in stripped if item))


def _join_notes(left: str, right: str) -> str:
    """Combine two note strings using the same rules as ``_join_unique``."""
    combined = _join_unique(left, right)
    return combined


def _compact_name(value: str) -> str:
    """Return ``value`` cleaned, with whitespace runs collapsed to one space."""
    tidy = _clean_text(value)
    return re.sub(r"\s+", " ", tidy).strip()


def _feed_source_name(country: str, value: str) -> str:
    """Build a display name such as ``"DE Provider GTFS"``.

    The upper-cased country code is prepended unless the name already starts
    with it, and ``" GTFS"`` is appended unless the name already mentions
    gtfs. An empty name becomes ``"GTFS feed"``.
    """
    label = _compact_name(value) or "GTFS feed"
    code = country.upper()
    needs_prefix = bool(code) and not label.upper().startswith(f"{code} ")
    if needs_prefix:
        label = f"{code} {label}"
    return label if "gtfs" in label.lower() else f"{label} GTFS"


def _clean_text(value: str) -> str:
    """Decode HTML entities, collapse whitespace and trim ``value``.

    Non-breaking spaces count as whitespace; a falsy value yields ``""``.
    """
    decoded = unescape(value) if value else ""
    collapsed = re.sub(r"\s+", " ", decoded.replace("\xa0", " "))
    return collapsed.strip()


def _mode_scope_from_features(features: str) -> str:
    """Map a free-text feature string to a comma-separated list of modes.

    Modes appear in the fixed order rail, tram, metro, bus, ferry. ``bus`` is
    also added when none of rail, tram or metro was detected.
    """
    lowered = features.lower()

    def mentions(*words: str) -> bool:
        return any(word in lowered for word in words)

    keyword_table = (
        ("rail", ("rail", "train")),
        ("tram", ("tram", "light_rail")),
        ("metro", ("subway", "metro")),
    )
    modes = [mode for mode, words in keyword_table if mentions(*words)]
    # Bus is the fallback, so it is decided before ferry is considered.
    if mentions("bus") or not modes:
        modes.append("bus")
    if mentions("ferry"):
        modes.append("ferry")
    return ",".join(modes)


def _bbox_from_mobility_row(row: dict[str, str]) -> str:
    """Format the row's bounding box as ``min_lon,min_lat,max_lon,max_lat``.

    Returns ``""`` unless all four bounding-box columns have a value.
    """
    min_lat, max_lat, min_lon, max_lon = (
        _value(row, column)
        for column in (
            "location.bounding_box.minimum_latitude",
            "location.bounding_box.maximum_latitude",
            "location.bounding_box.minimum_longitude",
            "location.bounding_box.maximum_longitude",
        )
    )
    if not (min_lat and max_lat and min_lon and max_lon):
        return ""
    return f"{min_lon},{min_lat},{max_lon},{max_lat}"


def _normalize_countries(countries: Iterable[str] | None) -> list[str] | None:
    """Normalise a caller-supplied country filter.

    ``None`` selects ``DEFAULT_DISCOVERY_COUNTRIES``. Otherwise blank entries
    are dropped and the rest is stripped and upper-cased; if ``ALL`` is among
    them the result is ``None``, meaning "no country filter".
    """
    if countries is None:
        return DEFAULT_DISCOVERY_COUNTRIES
    codes: list[str] = []
    for raw in countries:
        if raw and raw.strip():
            codes.append(raw.strip().upper())
    return None if "ALL" in codes else codes


def _normalize_url_key(url: str) -> str:
    """Return a comparison key for ``url``.

    Scheme and host are lower-cased, trailing slashes are removed from the
    path and the query string is kept as is. Other URL parts are dropped.
    """
    parts = urlparse(url.strip())
    key = f"{parts.scheme.lower()}://{parts.netloc.lower()}{parts.path.rstrip('/')}"
    if parts.query:
        key += f"?{parts.query}"
    return key


def _write_csv(path: Path, headers: list[str], rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with ``headers`` as columns.

    Keys that are not listed in ``headers`` are ignored.
    """
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=headers, extrasaction="ignore")
        writer.writeheader()
        for record in rows:
            writer.writerow(record)


def _count_by(items: Iterable[FeedCandidate], key_fn) -> dict[str, int]:
    """Count items per ``key_fn(item)`` and return the counts sorted by key."""
    tally: dict[str, int] = {}
    for item in items:
        bucket = key_fn(item)
        if bucket in tally:
            tally[bucket] += 1
        else:
            tally[bucket] = 1
    return {bucket: tally[bucket] for bucket in sorted(tally)}


def _value(row: dict[str, str], key: str) -> str:
    """Return the cleaned value of ``row[key]``, or ``""`` when it is absent."""
    raw = row.get(key, "")
    return _clean_text(raw)


def _string(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)


def _truncate(value: str, length: int) -> str:
    """Return at most the first ``length`` characters; falsy input gives ``""``."""
    if not value:
        return ""
    return value[:length]