Alpha stage commit
This commit is contained in:
923
app/feed_discovery.py
Normal file
923
app/feed_discovery.py
Normal file
@@ -0,0 +1,923 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from urllib.parse import parse_qs, urljoin, urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
# --- Upstream catalogues queried during discovery ---------------------------
MOBILITY_DATABASE_FEEDS_URL = "https://files.mobilitydatabase.org/feeds_v2.csv"
MOBILITY_DATABASE_ACCEPTANCE_TEST_URL = (
    "https://raw.githubusercontent.com/MobilityData/gtfs-validator/master/"
    "scripts/mobility-database-harvester/acceptance_test_feed_list.csv"
)
PTNA_GTFS_INDEX_URL = "https://ptna.openstreetmap.de/gtfs/index.html"
# ``{country}`` is filled in with a two-letter country code.
PTNA_COUNTRY_URL_TEMPLATE = "https://ptna.openstreetmap.de/gtfs/{country}/index.php"

# --- Country lists -----------------------------------------------------------
# Countries crawled on PTNA when the caller does not pass an explicit list.
DEFAULT_DISCOVERY_COUNTRIES = "DE AT CH NL DK FR BE LU NO SE FI IE GB".split()
# Order matters: it is used as a ranking when picking test-run feeds.
CURATED_TEST_COUNTRIES = "DE CH AT NL DK FI NO SE IE GB FR BE LU".split()

# --- CSV layouts ---------------------------------------------------------------
# Column order of the "ingestable sources" and "test run" CSV files.
DIRECT_INGEST_HEADERS = [
    "name",
    "kind",
    "url",
    "country",
    "license",
    "mode_scope",
    "source_basis",
    "priority",
    "notes",
]

# Column order of the full candidates CSV. Every name except "candidate_id"
# is also an attribute of FeedCandidate.
CANONICAL_HEADERS = (
    # identity
    [
        "candidate_id",
        "discovery_source",
        "country",
        "subdivision",
        "provider",
        "feed_name",
        "stable_id",
        "ptna_feed_id",
        "data_type",
        "status",
        "is_official",
    ]
    # URLs
    + [
        "selected_url",
        "direct_download_url",
        "latest_url",
        "original_release_url",
        "license_url",
    ]
    # licensing and reference pages
    + [
        "license_text",
        "osm_license_text",
        "details_url",
        "routes_url",
    ]
    # validity and release metadata
    + [
        "valid_from",
        "valid_to",
        "release_date",
        "feed_version",
        "bbox",
        "features",
        "priority",
    ]
    # availability probe results
    + [
        "availability_status",
        "http_status",
        "content_type",
        "content_length",
        "final_url",
    ]
    # provenance
    + [
        "source_basis",
        "notes",
    ]
)
|
||||
|
||||
|
||||
@dataclass
class FeedCandidate:
    """A single GTFS feed found by one of the discovery sources.

    All scalar attributes are strings so that a candidate can be written to a
    CSV row without further conversion. Only ``discovery_source`` is required.
    """

    # -- identity ---------------------------------------------------------
    discovery_source: str
    country: str = ""
    subdivision: str = ""
    provider: str = ""
    feed_name: str = ""
    stable_id: str = ""
    ptna_feed_id: str = ""
    data_type: str = "gtfs"
    status: str = ""
    is_official: str = ""
    # -- URLs -------------------------------------------------------------
    selected_url: str = ""
    direct_download_url: str = ""
    latest_url: str = ""
    original_release_url: str = ""
    license_url: str = ""
    # -- licensing and reference pages ------------------------------------
    license_text: str = ""
    osm_license_text: str = ""
    details_url: str = ""
    routes_url: str = ""
    # -- validity and release metadata ------------------------------------
    valid_from: str = ""
    valid_to: str = ""
    release_date: str = ""
    feed_version: str = ""
    bbox: str = ""
    features: str = ""
    priority: str = ""
    # -- availability probe results ---------------------------------------
    availability_status: str = "unchecked"
    http_status: str = ""
    content_type: str = ""
    content_length: str = ""
    final_url: str = ""
    # -- provenance -------------------------------------------------------
    source_basis: str = ""
    notes: str = ""
    evidence_sources: list[str] = field(default_factory=list)

    def key(self) -> str:
        """Return an identity key for this candidate.

        The first available of stable id, selected URL and PTNA feed id is
        used, each with its own prefix. When none is set the key is a SHA-256
        digest of the serialised row.
        """
        if self.stable_id:
            return f"stable:{self.stable_id}"
        if self.selected_url:
            return f"url:{_normalize_url_key(self.selected_url)}"
        if self.ptna_feed_id:
            return f"ptna:{self.ptna_feed_id}"
        serialised = json.dumps(self.row(), sort_keys=True)
        digest = hashlib.sha256(serialised.encode("utf-8"))
        return "hash:" + digest.hexdigest()

    def candidate_id(self) -> str:
        """Return a 16-hex-character id derived from the identifying fields."""
        identifying_fields = (
            self.discovery_source,
            self.country,
            self.stable_id,
            self.ptna_feed_id,
            self.selected_url,
            self.provider,
            self.feed_name,
        )
        seed = "|".join(identifying_fields)
        return hashlib.sha256(seed.encode("utf-8")).hexdigest()[:16]

    def row(self) -> dict[str, str]:
        """Return the candidate as a dict keyed by ``CANONICAL_HEADERS``.

        ``candidate_id`` is computed and inserted last; every other column is
        read from the attribute of the same name and passed through
        ``_string``.
        """
        payload: dict[str, str] = {}
        for header in CANONICAL_HEADERS:
            if header == "candidate_id":
                continue
            payload[header] = _string(getattr(self, header, ""))
        payload["candidate_id"] = self.candidate_id()
        return payload

    def ingestable_row(self) -> dict[str, str]:
        """Return the candidate in the ``DIRECT_INGEST_HEADERS`` layout."""
        display_name = _feed_source_name(self.country, self.provider or self.feed_name)

        # Prefer the license text; fall back to a pointer at the license URL.
        if self.license_text:
            license_value = self.license_text
        elif self.license_url:
            license_value = f"see {self.license_url}"
        else:
            license_value = ""

        basis_parts = [self.source_basis or self.discovery_source]
        if self.details_url:
            basis_parts.append(f"details: {self.details_url}")
        release_url = self.original_release_url
        if release_url and release_url != self.selected_url:
            basis_parts.append(f"release: {release_url}")

        notes = self.notes or ""
        mirror_url = self.latest_url
        if mirror_url and mirror_url != self.selected_url:
            notes = _join_notes(notes, f"Mobility Database mirror: {mirror_url}")
        if self.osm_license_text:
            notes = _join_notes(notes, f"OSM permission note: {_truncate(self.osm_license_text, 240)}")

        record: dict[str, str] = {}
        record["name"] = _truncate(display_name, 240)
        record["kind"] = "gtfs"
        record["url"] = self.selected_url
        record["country"] = self.country
        record["license"] = _truncate(license_value, 240)
        record["mode_scope"] = _mode_scope_from_features(self.features)
        record["source_basis"] = _truncate("; ".join(part for part in basis_parts if part), 500)
        record["priority"] = self.priority or _candidate_priority(self)
        record["notes"] = _truncate(notes, 1200)
        return record
|
||||
|
||||
|
||||
def default_generated_dir() -> Path:
    """Return ``<parent of this file's directory>/docs/generated``."""
    project_root = Path(__file__).resolve().parents[1]
    return project_root.joinpath("docs", "generated")
|
||||
|
||||
|
||||
def build_gtfs_discovery_manifests(
    *,
    output_dir: Path | str | None = None,
    countries: Iterable[str] | None = None,
    include_mobility_database: bool = True,
    include_acceptance_test_list: bool = True,
    include_ptna: bool = True,
    max_ptna_details: int = 80,
    test_limit: int = 24,
    check_urls: bool = False,
    timeout: float = 30.0,
) -> dict[str, object]:
    """Discover GTFS feeds, write the manifest files and return the report.

    Candidates are collected from the curated seed and, depending on the
    ``include_*`` flags, from the Mobility Database catalogue, the validator
    acceptance-test list and PTNA. They are merged, filtered to those with a
    selected URL and ``data_type == "gtfs"``, optionally probed over HTTP, and
    written to three CSV files plus a JSON report inside ``output_dir``
    (``default_generated_dir()`` when omitted; created if missing).

    Args:
        output_dir: Destination directory for the generated files.
        countries: Country filter, normalised by ``_normalize_countries``.
        include_mobility_database: Query the Mobility Database feed catalogue.
        include_acceptance_test_list: Query the acceptance-test feed list.
        include_ptna: Crawl the PTNA country pages.
        max_ptna_details: Passed to ``fetch_ptna_candidates`` as ``max_details``.
        test_limit: Maximum number of feeds in the test-run manifest.
        check_urls: Probe every ingestable URL (capped at 12 s per request).
        timeout: Per-request timeout in seconds for the discovery fetches.

    Returns:
        The report dictionary that is also written to
        ``gtfs_discovery_report.json``.
    """
    selected_countries = _normalize_countries(countries)
    out_dir = default_generated_dir() if output_dir is None else Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # The curated seed goes first so it is seen first by the merge step.
    discovered: list[FeedCandidate] = list(load_curated_ingestable_seed(countries=selected_countries))
    if include_mobility_database:
        discovered += fetch_mobility_database_candidates(countries=selected_countries, timeout=timeout)
    if include_acceptance_test_list:
        discovered += fetch_mobility_acceptance_candidates(countries=selected_countries, timeout=timeout)
    if include_ptna:
        discovered += fetch_ptna_candidates(
            countries=selected_countries,
            max_details=max_ptna_details,
            timeout=timeout,
        )

    merged = merge_candidates(discovered)
    ingestable = [item for item in merged if item.selected_url and item.data_type == "gtfs"]
    if check_urls:
        probe_timeout = min(timeout, 12.0)
        for item in ingestable:
            annotate_url_availability(item, timeout=probe_timeout)
    test_run = select_test_run_candidates(ingestable, limit=test_limit)

    candidates_path = out_dir / "gtfs_feed_candidates.csv"
    ingestable_path = out_dir / "gtfs_ingestable_sources.csv"
    test_path = out_dir / "gtfs_test_run_sources.csv"
    report_path = out_dir / "gtfs_discovery_report.json"

    _write_csv(candidates_path, CANONICAL_HEADERS, [item.row() for item in merged])
    _write_csv(ingestable_path, DIRECT_INGEST_HEADERS, [item.ingestable_row() for item in ingestable])
    _write_csv(test_path, DIRECT_INGEST_HEADERS, [item.ingestable_row() for item in test_run])

    source_urls = {
        "mobility_database": MOBILITY_DATABASE_FEEDS_URL if include_mobility_database else None,
        "mobility_acceptance_test_list": MOBILITY_DATABASE_ACCEPTANCE_TEST_URL if include_acceptance_test_list else None,
        "ptna": PTNA_GTFS_INDEX_URL if include_ptna else None,
    }
    counts = {
        "candidates": len(merged),
        "ingestable": len(ingestable),
        "test_run": len(test_run),
        "by_source": _count_by(merged, lambda item: item.discovery_source),
        "ingestable_by_country": _count_by(ingestable, lambda item: item.country or "unknown"),
    }
    written_files = {
        "candidates": str(candidates_path),
        "ingestable": str(ingestable_path),
        "test_run": str(test_path),
    }
    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "countries": selected_countries or "all",
        "sources": source_urls,
        "counts": counts,
        "files": written_files,
    }
    report_text = json.dumps(report, indent=2, ensure_ascii=False) + "\n"
    report_path.write_text(report_text, encoding="utf-8")
    return report
|
||||
|
||||
|
||||
def fetch_mobility_database_candidates(
    *,
    countries: list[str] | None = None,
    timeout: float = 30.0,
    url: str = MOBILITY_DATABASE_FEEDS_URL,
) -> list[FeedCandidate]:
    """Download the Mobility Database feed catalogue and return its GTFS rows.

    Rows whose ``data_type`` is not ``gtfs`` are skipped, as are rows whose
    ``location.country_code`` is not in ``countries`` when a filter is given.

    Args:
        countries: Upper-case country codes to keep; ``None`` or empty keeps all.
        timeout: Request timeout in seconds.
        url: Location of the catalogue CSV.

    Returns:
        One candidate per retained row, with geography normalised, known
        download overrides applied and ``priority`` computed.

    Raises:
        requests.RequestException: Propagated from ``_fetch_text``.
    """
    import io  # local import: only this parser needs an in-memory text stream

    text = _fetch_text(url, timeout=timeout)
    # Feed the csv module a stream opened with newline="" instead of
    # ``text.splitlines()``: splitlines() drops line breaks inside quoted
    # fields and also splits on characters such as U+2028 or U+0085 that may
    # legitimately occur in field data, which would corrupt rows.
    rows = csv.DictReader(io.StringIO(text, newline=""))
    candidates: list[FeedCandidate] = []
    for row in rows:
        if _value(row, "data_type").lower() != "gtfs":
            continue
        country = _value(row, "location.country_code").upper()
        if countries and country not in countries:
            continue
        direct_url = _normalize_feed_url(_value(row, "urls.direct_download"))
        latest_url = _normalize_feed_url(_value(row, "urls.latest"))
        selected_url = _choose_feed_url(direct_url, latest_url)
        candidate = FeedCandidate(
            discovery_source="mobility_database",
            country=country,
            subdivision=_value(row, "location.subdivision_name"),
            provider=_value(row, "provider"),
            feed_name=_value(row, "name"),
            stable_id=_value(row, "id"),
            data_type="gtfs",
            status=_value(row, "status"),
            is_official=_value(row, "is_official"),
            selected_url=selected_url,
            direct_download_url=direct_url,
            latest_url=latest_url,
            license_url=_value(row, "urls.license"),
            bbox=_bbox_from_mobility_row(row),
            features=_value(row, "features"),
            source_basis="Mobility Database feed catalog",
            notes=_value(row, "note"),
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        # Priority is derived last because the steps above may change fields.
        candidate.priority = _candidate_priority(candidate)
        candidates.append(candidate)
    return candidates
|
||||
|
||||
|
||||
def fetch_mobility_acceptance_candidates(
    *,
    countries: list[str] | None = None,
    timeout: float = 30.0,
    url: str = MOBILITY_DATABASE_ACCEPTANCE_TEST_URL,
) -> list[FeedCandidate]:
    """Download the validator acceptance-test feed list and return candidates.

    Rows without a ``urls.latest`` value are skipped, as are rows whose
    ``country_code`` is not in ``countries`` when a filter is given. Every
    candidate gets status ``acceptance_test`` and priority ``P3``.

    Args:
        countries: Upper-case country codes to keep; ``None`` or empty keeps all.
        timeout: Request timeout in seconds.
        url: Location of the acceptance-test CSV.

    Returns:
        One candidate per retained row.

    Raises:
        requests.RequestException: Propagated from ``_fetch_text``.
    """
    import io  # local import: only this parser needs an in-memory text stream

    text = _fetch_text(url, timeout=timeout)
    # Use a newline="" text stream rather than ``text.splitlines()`` so that
    # quoted fields containing line breaks (or Unicode line separators) are
    # parsed as a single field instead of being split into separate rows.
    rows = csv.DictReader(io.StringIO(text, newline=""))
    candidates: list[FeedCandidate] = []
    for row in rows:
        country = _value(row, "country_code").upper()
        if countries and country not in countries:
            continue
        latest_url = _normalize_feed_url(_value(row, "urls.latest"))
        if not latest_url:
            continue
        candidate = FeedCandidate(
            discovery_source="mobility_validator_acceptance",
            country=country,
            subdivision=_value(row, "subdivision_name"),
            provider=_value(row, "provider"),
            # The provider column doubles as the feed name for these rows.
            feed_name=_value(row, "provider"),
            stable_id=_value(row, "stable_id"),
            status="acceptance_test",
            selected_url=latest_url,
            latest_url=latest_url,
            source_basis="MobilityData validator acceptance-test feed list",
            notes="Useful smoke-test feed list; prefer Mobility Database feeds_v2 metadata for production source review.",
            priority="P3",
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        candidates.append(candidate)
    return candidates
|
||||
|
||||
|
||||
def fetch_ptna_candidates(
    *,
    countries: list[str] | None = None,
    max_details: int = 80,
    timeout: float = 30.0,
) -> list[FeedCandidate]:
    """Crawl the PTNA country pages and return one candidate per listed feed.

    A country page that cannot be fetched is skipped. For candidates that
    have a details URL, the detail page is fetched to enrich the candidate
    until ``max_details`` fetch attempts have been made in total.

    Args:
        countries: Country codes to crawl; ``None`` or empty falls back to
            ``DEFAULT_DISCOVERY_COUNTRIES``.
        max_details: Maximum number of detail-page requests (successful or
            not) across all countries.
        timeout: Per-request timeout in seconds.

    Returns:
        The candidates found, each with ``priority`` computed.
    """
    country_codes = countries or DEFAULT_DISCOVERY_COUNTRIES
    if not country_codes:
        # NOTE(review): only reachable if DEFAULT_DISCOVERY_COUNTRIES is
        # emptied; kept so that case still discovers the list from PTNA.
        country_codes = discover_ptna_country_codes(timeout=timeout)
    candidates: list[FeedCandidate] = []
    detail_attempts = 0
    for country in country_codes:
        country_url = PTNA_COUNTRY_URL_TEMPLATE.format(country=country)
        try:
            html = _fetch_text(country_url, timeout=timeout)
        except requests.RequestException:
            # Best effort: one unavailable country must not stop the crawl.
            continue
        for candidate in parse_ptna_country_page(html, country=country, page_url=country_url):
            if candidate.details_url and detail_attempts < max_details:
                # Count the attempt up front. Counting only successes meant a
                # failing PTNA host was hit once per candidate, each request
                # waiting for the full timeout.
                detail_attempts += 1
                try:
                    detail_html = _fetch_text(candidate.details_url, timeout=timeout)
                except requests.RequestException:
                    candidate.notes = _join_notes(candidate.notes, "PTNA detail page could not be fetched during discovery.")
                else:
                    enrich_ptna_candidate_from_details(candidate, detail_html, candidate.details_url)
            candidate.priority = _candidate_priority(candidate)
            candidates.append(candidate)
    return candidates
|
||||
|
||||
|
||||
def discover_ptna_country_codes(*, timeout: float = 30.0) -> list[str]:
    """Return the country codes linked from the PTNA GTFS index page.

    A link counts when its path ends in ``/gtfs/<two capitals>/index.php``.
    Codes are returned in order of first appearance, without duplicates.
    """
    index_html = _fetch_text(PTNA_GTFS_INDEX_URL, timeout=timeout)
    country_page = re.compile(r"/gtfs/([A-Z]{2})/index\.php$")
    codes: list[str] = []
    for link in _all_links(index_html, PTNA_GTFS_INDEX_URL):
        found = country_page.search(urlparse(link).path)
        if found is None:
            continue
        code = found.group(1)
        if code not in codes:
            codes.append(code)
    return codes
|
||||
|
||||
|
||||
def parse_ptna_country_page(html: str, *, country: str, page_url: str) -> list[FeedCandidate]:
    """Turn the table rows of a PTNA country page into candidates.

    A row is used only when it links to ``routes.php?feed=`` or
    ``gtfs-details.php?feed=`` and that link carries a ``feed`` query value.
    Cell positions are read by index: 1 feed name, 2 provider, 3 valid from,
    4 valid to, 5 feed version, 6 release date (and release link).
    """
    found: list[FeedCandidate] = []
    for row in _parse_table_rows(html, page_url):
        cells = row.cells
        row_links = [link for cell in cells for link in cell.links]
        routes_url = _first_link_matching(row_links, "routes.php?feed=")
        details_url = _first_link_matching(row_links, "gtfs-details.php?feed=")
        if not (routes_url or details_url):
            continue
        feed_id = _feed_id_from_url(routes_url or details_url)
        if not feed_id:
            continue

        texts = [cell.text for cell in cells]
        column_count = len(texts)

        # The release link is the first non-PTNA link in the seventh cell.
        raw_release = cells[6].first_external_link if len(cells) > 6 else ""
        release_link = _normalize_feed_url(raw_release)
        # Only treat the release link as a download when it looks like one.
        direct_url = release_link if _looks_like_download_url(release_link) else ""

        candidate = FeedCandidate(
            discovery_source="ptna",
            country=country,
            provider=texts[2] if column_count > 2 else "",
            feed_name=texts[1] if column_count > 1 else feed_id,
            ptna_feed_id=feed_id,
            selected_url=direct_url,
            direct_download_url=direct_url,
            original_release_url=release_link,
            details_url=details_url,
            routes_url=routes_url,
            valid_from=texts[3] if column_count > 3 else "",
            valid_to=texts[4] if column_count > 4 else "",
            feed_version=texts[5] if column_count > 5 else "",
            release_date=texts[6] if column_count > 6 else "",
            source_basis="PTNA GTFS analysis",
            notes="PTNA candidate; use original publisher URL where available.",
        )
        normalize_candidate_geography(candidate)
        apply_known_download_overrides(candidate)
        found.append(candidate)
    return found
|
||||
|
||||
|
||||
def enrich_ptna_candidate_from_details(candidate: FeedCandidate, html: str, page_url: str) -> None:
    """Update ``candidate`` in place from a PTNA feed detail page.

    Values parsed from the page win over existing ones; an empty or missing
    parsed value leaves the current attribute untouched. If the candidate has
    no selected URL yet and the release URL looks like a download, it becomes
    the selected and direct download URL.
    """
    details = parse_ptna_detail_fields(html, page_url)

    release_source = (
        details.get("release url href")
        or details.get("release url")
        or candidate.original_release_url
    )
    candidate.original_release_url = _normalize_feed_url(release_source)

    # attribute name -> label on the detail page (already lower-cased)
    label_by_attribute = {
        "license_url": "publisher's license href",
        "license_text": "publisher's license",
        "osm_license_text": "license given for use in osm",
        "valid_from": "feed start date",
        "valid_to": "feed end date",
        "feed_version": "feed version",
        "release_date": "release date",
    }
    for attribute, label in label_by_attribute.items():
        parsed_value = details.get(label)
        if parsed_value:
            setattr(candidate, attribute, parsed_value)

    # The label on the page includes the double quotes.
    network_guid = details.get('"network:guid"')
    if network_guid:
        candidate.notes = _join_notes(candidate.notes, f"PTNA network:guid={network_guid}")

    if not candidate.selected_url and _looks_like_download_url(candidate.original_release_url):
        promoted = _normalize_feed_url(candidate.original_release_url)
        candidate.selected_url = promoted
        candidate.direct_download_url = promoted

    normalize_candidate_geography(candidate)
|
||||
|
||||
|
||||
def parse_ptna_detail_fields(html: str, page_url: str) -> dict[str, str]:
    """Read a two-column label/value table into a dictionary.

    Keys are the cleaned, lower-cased text of the first cell; values are the
    cleaned text of the second cell. When the second cell has an external
    link, it is stored additionally under ``"<label> href"``. Rows with fewer
    than two cells or an empty label are ignored; later rows overwrite
    earlier ones with the same label.
    """
    fields: dict[str, str] = {}
    for row in _parse_table_rows(html, page_url):
        if len(row.cells) < 2:
            continue
        label_cell, value_cell = row.cells[0], row.cells[1]
        label = _clean_text(label_cell.text).lower()
        if not label:
            continue
        fields[label] = _clean_text(value_cell.text)
        if value_cell.first_external_link:
            fields[f"{label} href"] = value_cell.first_external_link
    return fields
|
||||
|
||||
|
||||
def load_curated_ingestable_seed(
    *,
    countries: list[str] | None = None,
    path: Path | str | None = None,
) -> list[FeedCandidate]:
    """Load hand-curated GTFS sources from the seed CSV.

    Args:
        countries: Country codes to keep. Rows with country ``EU`` are kept
            regardless of the filter.
        path: Seed file location; defaults to ``docs/ingestable_sources_seed.csv``
            next to this file's parent directory.

    Returns:
        Candidates for every row with ``kind == gtfs`` that passes the
        country filter, or an empty list when the file does not exist.
    """
    if path is not None:
        seed_path = Path(path)
    else:
        seed_path = Path(__file__).resolve().parents[1] / "docs" / "ingestable_sources_seed.csv"
    if not seed_path.exists():
        return []

    loaded: list[FeedCandidate] = []
    # utf-8-sig strips a leading byte-order mark if the file has one.
    with seed_path.open("r", encoding="utf-8-sig", newline="") as handle:
        for row in csv.DictReader(handle):
            if _value(row, "kind").lower() != "gtfs":
                continue
            country = _value(row, "country").upper()
            if countries and country != "EU" and country not in countries:
                continue
            name = _value(row, "name")
            feed_url = _normalize_feed_url(_value(row, "url"))
            candidate = FeedCandidate(
                discovery_source="curated_seed",
                country=country,
                provider=name.removesuffix(" GTFS"),
                feed_name=name,
                selected_url=feed_url,
                direct_download_url=feed_url,
                license_text=_value(row, "license"),
                features=_value(row, "mode_scope"),
                priority=_value(row, "priority"),
                source_basis=_value(row, "source_basis") or "curated seed",
                notes=_value(row, "notes"),
            )
            normalize_candidate_geography(candidate)
            apply_known_download_overrides(candidate)
            loaded.append(candidate)
    return loaded
|
||||
|
||||
|
||||
def merge_candidates(candidates: Iterable[FeedCandidate]) -> list[FeedCandidate]:
    """Collapse candidates that share an alias key and return them sorted.

    The first candidate seen for a set of alias keys is kept; later
    candidates that share any alias with it are merged into it. The result is
    ordered by priority, country, provider and feed name.
    """
    kept: dict[str, FeedCandidate] = {}
    owner_of_alias: dict[str, str] = {}

    for candidate in candidates:
        aliases = _candidate_alias_keys(candidate)
        primary = aliases[0]

        # Find the first alias that already belongs to a kept candidate.
        owner_key = None
        for alias in aliases:
            if alias in owner_of_alias:
                owner_key = owner_of_alias[alias]
                break
        owner = kept.get(owner_key) if owner_key is not None else None

        if owner is None:
            kept[primary] = candidate
            target = primary
        else:
            _merge_candidate(owner, candidate)
            target = owner_key or primary

        # Every alias of this candidate now points at the surviving entry.
        for alias in aliases:
            owner_of_alias[alias] = target

    def ordering(item: FeedCandidate) -> tuple[int, str, str, str]:
        return (
            _priority_sort_key(item.priority),
            item.country,
            item.provider.lower(),
            item.feed_name.lower(),
        )

    return sorted(kept.values(), key=ordering)
|
||||
|
||||
|
||||
def select_test_run_candidates(candidates: Iterable[FeedCandidate], *, limit: int = 24) -> list[FeedCandidate]:
    """Pick at most ``limit`` feeds for a test run.

    Acceptance-test entries and ineligible candidates are dropped first. The
    remainder is filled in three passes over the sorted pool: feeds matching
    a preferred token, then feeds per country in ``CURATED_TEST_COUNTRIES``
    order, then everything else. A URL is selected at most once and each
    country is capped at 3 feeds (7 for ``DE`` during the preferred pass).
    """
    pool = [
        candidate
        for candidate in candidates
        if candidate.discovery_source != "mobility_validator_acceptance" and _test_candidate_eligible(candidate)
    ]
    pool.sort(key=_test_candidate_sort_key)

    chosen: list[FeedCandidate] = []
    used_urls: set[str] = set()
    country_counts: dict[str, int] = {}

    def is_full() -> bool:
        return len(chosen) >= limit

    def add(candidate: FeedCandidate, *, force: bool = False) -> None:
        if is_full():
            return
        url_key = _normalize_url_key(candidate.selected_url)
        if not candidate.selected_url or url_key in used_urls:
            return
        country = candidate.country or "unknown"
        cap = 7 if (force and country == "DE") else 3
        already = country_counts.get(country, 0)
        if already >= cap:
            return
        chosen.append(candidate)
        used_urls.add(url_key)
        country_counts[country] = already + 1

    preferred_tokens = [
        "opendata-oepnv.de",
        "download.gtfs.de/germany/",
        "vbb.de/vbbgtfs",
        "rnv-online.de",
        "vrn.de",
        "gtfs.geops.ch",
        "wienerlinien.at",
        "gtfs.openov.nl",
        "gtfs.ovapi.nl",
        "rejseplanen.info",
        "dev.hsl.fi/gtfs",
        "hsldev.com/gtfs",
        "rb_norway-aggregated-gtfs",
        "data.bus-data.dft.gov.uk",
        "transportforireland",
        "gtfs.irail.be/de-lijn",
    ]

    # Pass 1: well-known publishers, matched anywhere in the descriptive text.
    for candidate in pool:
        haystack = " ".join(
            [candidate.provider, candidate.feed_name, candidate.source_basis, candidate.selected_url]
        ).lower()
        if any(token in haystack for token in preferred_tokens):
            add(candidate, force=True)

    # Pass 2: walk the curated countries in their listed order.
    for country in CURATED_TEST_COUNTRIES:
        for candidate in pool:
            if candidate.country == country:
                add(candidate)
            if is_full():
                break
        if is_full():
            break

    # Pass 3: fill any remaining slots from the whole pool.
    for candidate in pool:
        add(candidate)
        if is_full():
            break

    return chosen
|
||||
|
||||
|
||||
def _test_candidate_eligible(candidate: FeedCandidate) -> bool:
    """Return whether ``candidate`` may be used in an automated test run.

    It needs a selected URL, a priority no worse than P2, and none of the
    blocking markers in its status, URL, provider, feed name or notes.
    """
    if not candidate.selected_url:
        return False
    if _priority_sort_key(candidate.priority) > 2:
        return False
    searchable = " ".join(
        [candidate.status, candidate.selected_url, candidate.provider, candidate.feed_name, candidate.notes]
    ).lower()
    blocking_markers = (
        "deprecated",
        "inactive",
        "{apikey}",
        "registration required",
        "authentication",
    )
    return not any(marker in searchable for marker in blocking_markers)
|
||||
|
||||
|
||||
def annotate_url_availability(candidate: FeedCandidate, *, timeout: float = 10.0) -> FeedCandidate:
    """Probe ``candidate.selected_url`` and record the outcome on the candidate.

    A HEAD request is tried first. If it is answered with 403, 405 or any
    5xx status, a streamed GET asking for the first byte only is sent
    instead. The status code, content type, content length and final URL of
    the last response are stored, and ``availability_status`` becomes ``ok``
    (status below 400), ``error`` or ``missing_url``.

    Args:
        candidate: Candidate to probe; modified in place.
        timeout: Per-request timeout in seconds.

    Returns:
        The same ``candidate`` object.
    """
    if not candidate.selected_url:
        candidate.availability_status = "missing_url"
        return candidate
    headers = {"User-Agent": "meubility-workbench-feed-discovery/0.1"}
    response = None
    try:
        response = requests.head(candidate.selected_url, allow_redirects=True, timeout=timeout, headers=headers)
        if response.status_code in {405, 403} or response.status_code >= 500:
            # Release the HEAD connection before replacing the response;
            # previously it was dropped without being closed.
            response.close()
            response = requests.get(
                candidate.selected_url,
                allow_redirects=True,
                timeout=timeout,
                headers={**headers, "Range": "bytes=0-0"},
                stream=True,  # do not download the body
            )
        candidate.http_status = str(response.status_code)
        candidate.content_type = response.headers.get("content-type", "")
        candidate.content_length = response.headers.get("content-length", "")
        candidate.final_url = response.url
        candidate.availability_status = "ok" if response.status_code < 400 else "error"
    except requests.RequestException as exc:
        candidate.availability_status = "error"
        candidate.notes = _join_notes(candidate.notes, f"Availability check failed: {exc}")
    finally:
        # Always release the (possibly streamed) connection, including when
        # an exception interrupted the bookkeeping above.
        if response is not None:
            response.close()
    return candidate
|
||||
|
||||
|
||||
def normalize_candidate_geography(candidate: FeedCandidate) -> None:
    """Overwrite ``candidate.country`` when a known marker identifies it.

    The candidate's URLs, provider, feed name and source basis are searched
    case-insensitively. Rules are tried in order and the first match wins;
    without a match the country is left unchanged.
    """
    searchable = " ".join(
        [
            candidate.selected_url,
            candidate.direct_download_url,
            candidate.latest_url,
            candidate.original_release_url,
            candidate.provider,
            candidate.feed_name,
            candidate.source_basis,
        ]
    ).lower()

    # (country, markers) - evaluated top to bottom.
    rules = (
        ("DE", ("download.gtfs.de/germany/", "gtfs for germany")),
        ("NO", ("storage.googleapis.com/marduk-production/outbound/gtfs/rb_norway",)),
        ("NL", ("gtfs.ovapi.nl", "openov.nl")),
        ("DE", ("www.nvbw.de/fileadmin/user_upload/service/open_data/",)),
    )
    for country, markers in rules:
        if any(marker in searchable for marker in markers):
            candidate.country = country
            return
|
||||
|
||||
|
||||
def apply_known_download_overrides(candidate: FeedCandidate) -> None:
    """Switch listed feeds to their ``latest_url`` and note why.

    Applies only to the stable ids listed below, and only when the candidate
    has a ``latest_url``.
    """
    stale_direct_ids = {"mdb-684", "mdb-777"}
    if candidate.stable_id not in stale_direct_ids:
        return
    if not candidate.latest_url:
        return
    candidate.selected_url = candidate.latest_url
    explanation = "Selected Mobility Database latest.zip mirror because the catalog direct URL is known to be stale."
    candidate.notes = _join_notes(candidate.notes, explanation)
|
||||
|
||||
|
||||
@dataclass
|
||||
class _HtmlCell:
|
||||
text: str = ""
|
||||
links: list[str] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def first_external_link(self) -> str:
|
||||
for link in self.links:
|
||||
parsed = urlparse(link)
|
||||
if parsed.scheme in {"http", "https"} and "ptna.openstreetmap.de" not in parsed.netloc:
|
||||
return link
|
||||
return ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class _HtmlRow:
|
||||
cells: list[_HtmlCell] = field(default_factory=list)
|
||||
|
||||
|
||||
class _TableParser(HTMLParser):
    """Collect table rows, with per-cell text and absolute link URLs.

    Completed rows that contain at least one cell are appended to ``rows``.
    Relative ``href`` values are resolved against ``base_url``.
    """

    def __init__(self, base_url: str):
        super().__init__(convert_charrefs=True)
        self.base_url = base_url
        self.rows: list[_HtmlRow] = []
        # Row / cell currently being filled; None while outside of one.
        self._row: _HtmlRow | None = None
        self._cell: _HtmlCell | None = None
        self._active_link: str = ""

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Open a row or cell, or record a link inside the open cell."""
        attributes = {name: (value or "") for name, value in attrs}
        if tag == "tr":
            self._row = _HtmlRow()
            return
        if tag in ("td", "th"):
            # Cells outside of a row are ignored.
            if self._row is not None:
                self._cell = _HtmlCell()
            return
        if tag == "a" and self._cell is not None:
            href = attributes.get("href", "")
            if not href:
                return
            self._active_link = urljoin(self.base_url, href)
            self._cell.links.append(self._active_link)

    def handle_endtag(self, tag: str) -> None:
        """Close the current cell or row and reset the matching state."""
        if tag in ("td", "th"):
            if self._row is None or self._cell is None:
                return
            self._cell.text = _clean_text(self._cell.text)
            self._row.cells.append(self._cell)
            self._cell = None
            self._active_link = ""
            return
        if tag == "a":
            self._active_link = ""
            return
        if tag == "tr":
            finished = self._row
            # Rows without any cell are dropped.
            if finished is not None and finished.cells:
                self.rows.append(finished)
            self._row = None
            self._cell = None
            self._active_link = ""

    def handle_data(self, data: str) -> None:
        """Append text to the open cell; text outside cells is ignored."""
        if self._cell is None:
            return
        self._cell.text += data
|
||||
|
||||
|
||||
class _LinkParser(HTMLParser):
|
||||
def __init__(self, base_url: str):
|
||||
super().__init__(convert_charrefs=True)
|
||||
self.base_url = base_url
|
||||
self.links: list[str] = []
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||||
if tag != "a":
|
||||
return
|
||||
for key, value in attrs:
|
||||
if key == "href" and value:
|
||||
self.links.append(urljoin(self.base_url, value))
|
||||
|
||||
|
||||
def _parse_table_rows(html: str, base_url: str) -> list[_HtmlRow]:
    """Run ``_TableParser`` over ``html`` and return the rows it collected."""
    table_parser = _TableParser(base_url)
    table_parser.feed(html)
    collected = table_parser.rows
    return collected
|
||||
|
||||
|
||||
def _all_links(html: str, base_url: str) -> list[str]:
    """Run ``_LinkParser`` over ``html`` and return the links it collected."""
    link_parser = _LinkParser(base_url)
    link_parser.feed(html)
    collected = link_parser.links
    return collected
|
||||
|
||||
|
||||
def _fetch_text(url: str, *, timeout: float) -> str:
    """GET ``url`` and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Request timeout in seconds.

    Returns:
        The decoded body. When the server declares no charset, the body is
        decoded as UTF-8 if it is valid UTF-8 (a leading byte-order mark is
        dropped); otherwise Requests' own decoding is used.

    Raises:
        requests.RequestException: On connection problems or, through
            ``raise_for_status``, on a 4xx/5xx answer.
    """
    response = requests.get(url, timeout=timeout, headers={"User-Agent": "meubility-workbench-feed-discovery/0.1"})
    response.raise_for_status()
    content_type = response.headers.get("content-type", "")
    if "charset" not in content_type.lower():
        # Without a declared charset Requests decodes text/* bodies as
        # ISO-8859-1, which garbles UTF-8 content such as provider names
        # with umlauts. Try UTF-8 first and fall back only if that fails.
        try:
            return response.content.decode("utf-8-sig")
        except UnicodeDecodeError:
            pass
    return response.text
|
||||
|
||||
|
||||
def _first_link_matching(links: Iterable[str], needle: str) -> str:
|
||||
for link in links:
|
||||
if needle in link:
|
||||
return link
|
||||
return ""
|
||||
|
||||
|
||||
def _feed_id_from_url(url: str) -> str:
|
||||
query = parse_qs(urlparse(url).query)
|
||||
return (query.get("feed") or [""])[0]
|
||||
|
||||
|
||||
def _looks_like_download_url(url: str) -> bool:
|
||||
if not url:
|
||||
return False
|
||||
parsed = urlparse(url)
|
||||
lower_path = parsed.path.lower()
|
||||
lower_url = url.lower()
|
||||
if lower_path.endswith(".zip"):
|
||||
return True
|
||||
if "exportformat=gtfs" in lower_url or "google_transit" in lower_url:
|
||||
return True
|
||||
if lower_path.rstrip("/").endswith(("current_gtfs", "gtfs")):
|
||||
return True
|
||||
if "gtfs.ovapi.nl" in parsed.netloc.lower() and "gtfs" in lower_path:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _normalize_feed_url(url: str) -> str:
    """Clean ``url`` and add ``https://`` when it looks like a bare host.

    A value that already parses with a scheme is returned cleaned but
    otherwise unchanged. A scheme-less value gets ``https://`` prepended only
    if the part before the first ``/`` contains a dot.
    """
    cleaned = _clean_text(url)
    if not cleaned:
        return ""
    if urlparse(cleaned).scheme:
        return cleaned
    host_part = cleaned.partition("/")[0]
    return f"https://{cleaned}" if "." in host_part else cleaned
|
||||
|
||||
|
||||
def _choose_feed_url(direct_url: str, latest_url: str) -> str:
|
||||
if direct_url:
|
||||
return direct_url
|
||||
return latest_url
|
||||
|
||||
|
||||
def _candidate_priority(candidate: FeedCandidate) -> str:
|
||||
status = candidate.status.lower()
|
||||
official = candidate.is_official.lower() == "true"
|
||||
if candidate.discovery_source == "curated_seed":
|
||||
return candidate.priority or "P1"
|
||||
if status == "active" and official and candidate.direct_download_url:
|
||||
return "P0"
|
||||
if status == "active" and candidate.direct_download_url:
|
||||
return "P1"
|
||||
if status == "active" and candidate.latest_url:
|
||||
return "P2"
|
||||
if candidate.discovery_source == "ptna":
|
||||
return "P2" if candidate.selected_url else "P4"
|
||||
return "P3"
|
||||
|
||||
|
||||
def _test_candidate_sort_key(candidate: FeedCandidate) -> tuple[int, int, str, str]:
    """Return the sort key used to rank candidates for a test run.

    Ordered by numeric priority, then by the candidate's position in
    ``CURATED_TEST_COUNTRIES`` (99 when absent) plus 1 for non-curated
    sources, then by country and lower-cased provider.
    """
    country = candidate.country
    if country in CURATED_TEST_COUNTRIES:
        country_rank = CURATED_TEST_COUNTRIES.index(country)
    else:
        country_rank = 99
    source_rank = 0 if candidate.discovery_source == "curated_seed" else 1
    return (
        _priority_sort_key(candidate.priority),
        source_rank + country_rank,
        country,
        candidate.provider.lower(),
    )
|
||||
|
||||
|
||||
def _priority_sort_key(priority: str) -> int:
|
||||
match = re.match(r"P(\d+)", priority or "")
|
||||
return int(match.group(1)) if match else 9
|
||||
|
||||
|
||||
def _candidate_alias_keys(candidate: FeedCandidate) -> list[str]:
    """Return every key under which ``candidate`` may match another one.

    The candidate's own ``key()`` comes first, followed by its stable id,
    its selected / direct / latest URLs and its PTNA feed id where set.
    Duplicates are removed while keeping first-seen order.
    """
    keys = [candidate.key()]
    if candidate.stable_id:
        keys.append(f"stable:{candidate.stable_id}")
    keys.extend(
        f"url:{_normalize_url_key(url)}"
        for url in (candidate.selected_url, candidate.direct_download_url, candidate.latest_url)
        if url
    )
    if candidate.ptna_feed_id:
        keys.append(f"ptna:{candidate.ptna_feed_id}")
    # dict preserves insertion order, so this is an order-keeping de-dup.
    return list(dict.fromkeys(keys))
|
||||
|
||||
|
||||
def _merge_candidate(existing: FeedCandidate, incoming: FeedCandidate) -> None:
    """Fold ``incoming`` into ``existing`` in place.

    Steps, in this order:

    1. When ``incoming`` is a curated seed, its non-empty descriptive fields
       overwrite the ones on ``existing``.
    2. The discovery sources of both candidates are combined.
    3. Every canonical field (except ``candidate_id``) that is still empty on
       ``existing`` is filled from ``incoming``.
    4. The better of the two priorities is kept, and source basis and notes
       are combined.
    """
    if incoming.discovery_source == "curated_seed":
        curated_overrides = (
            "country",
            "provider",
            "feed_name",
            "license_text",
            "features",
            "source_basis",
            "notes",
        )
        for attr in curated_overrides:
            override = getattr(incoming, attr, "")
            if override:
                setattr(existing, attr, override)

    existing.discovery_source = _join_unique(existing.discovery_source, incoming.discovery_source)

    for attr in CANONICAL_HEADERS:
        if attr == "candidate_id":
            continue
        present = getattr(existing, attr, "")
        offered = getattr(incoming, attr, "")
        # Only fill gaps; values already on the existing candidate are kept.
        if offered and not present:
            setattr(existing, attr, offered)

    existing.priority = _better_priority(existing.priority, incoming.priority)
    existing.source_basis = _join_unique(existing.source_basis, incoming.source_basis)
    existing.notes = _join_notes(existing.notes, incoming.notes)
|
||||
|
||||
|
||||
def _better_priority(left: str, right: str) -> str:
    """Return whichever priority label has the lower numeric rank.

    When both labels rank equally, ``left`` is returned.
    """
    # min() returns the first of several equally small items, so ties keep ``left``.
    return min((left, right), key=_priority_sort_key)
|
||||
|
||||
|
||||
def _join_unique(left: str, right: str) -> str:
    """Merge two ";"-separated strings into one list without repeats.

    Entries are stripped of surrounding whitespace, empty entries are
    discarded, the order of first appearance is preserved and the result is
    joined with "; ".
    """
    pieces = (chunk.strip() for chunk in left.split(";") + right.split(";"))
    # dict.fromkeys drops later repeats while keeping first-seen order.
    return "; ".join(dict.fromkeys(piece for piece in pieces if piece))
|
||||
|
||||
|
||||
def _join_notes(left: str, right: str) -> str:
    """Combine two note strings, delegating entirely to ``_join_unique``."""
    merged_notes = _join_unique(left, right)
    return merged_notes
|
||||
|
||||
|
||||
def _compact_name(value: str) -> str:
    """Return ``value`` cleaned via ``_clean_text`` with whitespace runs collapsed to one space."""
    cleaned = _clean_text(value)
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()
|
||||
|
||||
|
||||
def _feed_source_name(country: str, value: str) -> str:
    """Build a display name for a feed source.

    The compacted ``value`` (or "GTFS feed" when it is empty) is prefixed with
    the upper-cased country code unless it already starts with that code
    followed by a space, and " GTFS" is appended unless the name already
    mentions "gtfs" (case-insensitively).
    """
    label = _compact_name(value) or "GTFS feed"
    code = country.upper()

    if code and not label.upper().startswith(f"{code} "):
        label = f"{code} {label}"

    if "gtfs" in label.lower():
        return label
    return f"{label} GTFS"
|
||||
|
||||
|
||||
def _clean_text(value: str) -> str:
    """Normalise a raw text fragment.

    HTML entities are decoded, non-breaking spaces become regular spaces,
    every run of whitespace collapses to a single space and the ends are
    trimmed. A falsy ``value`` yields an empty string.
    """
    text = unescape(value or "")
    text = text.replace("\xa0", " ")
    return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def _mode_scope_from_features(features: str) -> str:
    """Translate a free-text feature description into a comma-separated mode list.

    Modes are detected by case-insensitive substring search and emitted in the
    fixed order rail, tram, metro, bus, ferry. "bus" is also added when none
    of rail/tram/metro matched, so the result is never empty.
    """
    text = features.lower()

    # (mode, substrings that trigger it), in output order.
    keyword_table = (
        ("rail", ("rail", "train")),
        ("tram", ("tram", "light_rail")),
        ("metro", ("subway", "metro")),
    )
    detected = [mode for mode, needles in keyword_table if any(needle in text for needle in needles)]

    if "bus" in text or not detected:
        detected.append("bus")
    if "ferry" in text:
        detected.append("ferry")

    # Each mode can be appended at most once, so no de-duplication is needed.
    return ",".join(detected)
|
||||
|
||||
|
||||
def _bbox_from_mobility_row(row: dict[str, str]) -> str:
    """Format the row's bounding box as "min_lon,min_lat,max_lon,max_lat".

    Returns an empty string when any of the four bounding-box columns is
    missing or empty.
    """
    south = _value(row, "location.bounding_box.minimum_latitude")
    north = _value(row, "location.bounding_box.maximum_latitude")
    west = _value(row, "location.bounding_box.minimum_longitude")
    east = _value(row, "location.bounding_box.maximum_longitude")

    if south and north and west and east:
        return f"{west},{south},{east},{north}"
    return ""
|
||||
|
||||
|
||||
def _normalize_countries(countries: Iterable[str] | None) -> list[str] | None:
    """Normalise a user-supplied country filter.

    Args:
        countries: Country codes in any case, possibly padded with
            whitespace. ``None`` selects the default country set. A bare
            string is treated as a comma-separated list rather than being
            iterated character by character.

    Returns:
        ``None`` when any entry is "ALL" (case-insensitive). Otherwise a new
        list of upper-cased, stripped, non-empty codes with duplicates
        removed in first-seen order. For ``countries=None`` this is a copy
        of ``DEFAULT_DISCOVERY_COUNTRIES``.
    """
    if countries is None:
        # Return a copy: handing out the module-level list would let a caller
        # that mutates the result silently change the defaults for everyone.
        return list(DEFAULT_DISCOVERY_COUNTRIES)

    if isinstance(countries, str):
        # "DE,AT" or "DE" passed directly; iterating it would yield characters.
        countries = countries.split(",")

    # dict.fromkeys de-duplicates while keeping the caller's order.
    normalized = list(
        dict.fromkeys(code.strip().upper() for code in countries if code and code.strip())
    )

    if "ALL" in normalized:
        return None
    return normalized
|
||||
|
||||
|
||||
def _normalize_url_key(url: str) -> str:
    """Reduce a URL to a comparison key.

    Surrounding whitespace is removed, scheme and host are lower-cased,
    trailing slashes are trimmed from the path, the query string is kept
    verbatim and the fragment is dropped.
    """
    parts = urlparse(url.strip())
    base = f"{parts.scheme.lower()}://{parts.netloc.lower()}{parts.path.rstrip('/')}"
    if parts.query:
        return f"{base}?{parts.query}"
    return base
|
||||
|
||||
|
||||
def _write_csv(path: Path, headers: list[str], rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``path`` as a UTF-8 CSV file with a header line.

    Only the columns named in ``headers`` are written; any other keys present
    in a row are ignored. An existing file at ``path`` is overwritten.
    """
    with path.open("w", encoding="utf-8", newline="") as stream:
        csv_writer = csv.DictWriter(stream, fieldnames=headers, extrasaction="ignore")
        csv_writer.writeheader()
        for record in rows:
            csv_writer.writerow(record)
|
||||
|
||||
|
||||
def _count_by(items: Iterable[FeedCandidate], key_fn) -> dict[str, int]:
    """Count items per key and return the counts ordered by key.

    ``key_fn`` is called once per item; the resulting dict maps each distinct
    key to its number of occurrences, with keys in ascending sorted order.
    """
    tally: dict[str, int] = {}
    for entry in items:
        bucket = key_fn(entry)
        tally.setdefault(bucket, 0)
        tally[bucket] += 1
    return {bucket: tally[bucket] for bucket in sorted(tally)}
|
||||
|
||||
|
||||
def _value(row: dict[str, str], key: str) -> str:
    """Fetch ``key`` from ``row`` (empty string when absent) and clean it via ``_clean_text``."""
    raw = row.get(key, "")
    return _clean_text(raw)
|
||||
|
||||
|
||||
def _string(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to an empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _truncate(value: str, length: int) -> str:
    """Return at most the first ``length`` characters of ``value``; falsy input gives ""."""
    if not value:
        return ""
    return value[:length]
|
||||
Reference in New Issue
Block a user