257 lines
9.7 KiB
Python
257 lines
9.7 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.models import Dataset, Source, SourceUpdateCheck
|
|
from app.pipeline.utils import norm_text, sha256_file
|
|
|
|
|
|
def check_source_for_update(session: Session, source: Source) -> SourceUpdateCheck:
    """Probe ``source`` for new data and persist the outcome.

    The newest active dataset of the source is loaded, the source location is
    inspected (HTTP(S) request or local file), and both are compared to decide
    whether an update is available. The resulting ``SourceUpdateCheck`` row is
    added to ``session`` and flushed; no commit happens here.

    Side effects on ``source``: ``status`` becomes ``"update_check_error"``,
    ``"update_available"`` or ``"up_to_date"``; ``last_error`` is set to the
    failure reason or cleared; ``url`` may be rewritten by the recovery step.
    """
    active_query = (
        select(Dataset)
        .where(Dataset.source_id == source.id, Dataset.is_active.is_(True))
        .order_by(Dataset.created_at.desc(), Dataset.id.desc())
    )
    active_dataset = session.scalar(active_query)

    # Recovery runs before the probe because it may replace ``source.url``.
    recovery = _recover_missing_managed_cache_url(source)
    remote = _source_remote_metadata(source)
    if recovery is not None:
        remote.update(
            recovered_source_url=recovery["url"],
            previous_source_url=recovery["previous_url"],
        )

    update_available, reason = _update_decision(active_dataset, remote)

    if active_dataset is None:
        dataset_id = dataset_sha256 = None
    else:
        dataset_id = active_dataset.id
        dataset_sha256 = active_dataset.sha256

    check = SourceUpdateCheck(
        source_id=source.id,
        status=remote["status"],
        update_available=update_available,
        reason=reason,
        remote_url=source.url,
        etag=remote.get("etag"),
        last_modified=remote.get("last_modified"),
        content_length=remote.get("content_length"),
        content_type=remote.get("content_type"),
        local_mtime=remote.get("local_mtime"),
        local_size=remote.get("local_size"),
        local_sha256=remote.get("local_sha256"),
        active_dataset_id=dataset_id,
        active_dataset_sha256=dataset_sha256,
        metadata_json=json.dumps(remote, separators=(",", ":"), default=_json_default),
    )
    session.add(check)

    if remote["status"] != "checked":
        source.status = "update_check_error"
        source.last_error = reason
    else:
        source.status = "update_available" if update_available else "up_to_date"
        source.last_error = None

    session.flush()
    return check
|
|
|
|
|
|
def latest_source_update_check(session: Session, source_id: int) -> SourceUpdateCheck | None:
    """Return the most recent update check stored for ``source_id``.

    "Most recent" is the highest ``checked_at``, with ``id`` as tie-breaker.
    Returns ``None`` when the source has no recorded checks.
    """
    newest_first = (
        select(SourceUpdateCheck)
        .where(SourceUpdateCheck.source_id == source_id)
        .order_by(SourceUpdateCheck.checked_at.desc(), SourceUpdateCheck.id.desc())
        # Only the first row is consumed; a check row is added on every run,
        # so do not make the database return the whole history.
        .limit(1)
    )
    return session.scalar(newest_first)
|
|
|
|
|
|
def update_check_payload(check: SourceUpdateCheck | None) -> dict | None:
    """Serialise an update check into a plain, JSON-friendly dict.

    Returns ``None`` when ``check`` is ``None``. ``checked_at`` and
    ``local_mtime`` are rendered with ``isoformat()`` (or ``None`` when unset).
    The stored ``metadata_json`` blob is decoded and exposed under
    ``"metadata"``, which is always a dict: invalid JSON and JSON that is not an
    object (``null``, a list, a number, ...) both fall back to ``{}``.
    """
    if check is None:
        return None

    try:
        metadata = json.loads(check.metadata_json or "{}")
    except json.JSONDecodeError:
        metadata = {}
    if not isinstance(metadata, dict):
        # Consumers treat "metadata" as a mapping; never hand them None/list.
        metadata = {}

    return {
        "id": check.id,
        "source_id": check.source_id,
        "checked_at": check.checked_at.isoformat() if check.checked_at else None,
        "status": check.status,
        "update_available": check.update_available,
        "reason": check.reason,
        "etag": check.etag,
        "last_modified": check.last_modified,
        "content_length": check.content_length,
        "content_type": check.content_type,
        "local_mtime": check.local_mtime.isoformat() if check.local_mtime else None,
        "local_size": check.local_size,
        "local_sha256": check.local_sha256,
        "active_dataset_id": check.active_dataset_id,
        "active_dataset_sha256": check.active_dataset_sha256,
        "metadata": metadata,
    }
|
|
|
|
|
|
def record_dataset_update_metadata(dataset: Dataset, check: SourceUpdateCheck | None) -> None:
    """Store a snapshot of ``check`` inside ``dataset.metadata_json``.

    The snapshot is written under the ``"source_update_check"`` key of the
    dataset's JSON metadata, and the whole document is re-serialised with
    ``indent=2``. Does nothing when ``check`` is ``None``. Existing metadata
    that cannot be parsed, or that is valid JSON but not an object, is replaced
    by a fresh object instead of raising.
    """
    if check is None:
        return

    try:
        metadata = json.loads(dataset.metadata_json or "{}")
    except json.JSONDecodeError:
        metadata = {}
    if not isinstance(metadata, dict):
        # ``null`` or a list cannot take a key assignment; treat it the same
        # way as unparseable metadata and start from an empty object.
        metadata = {}

    metadata["source_update_check"] = {
        "id": check.id,
        "checked_at": check.checked_at.isoformat() if check.checked_at else None,
        "etag": check.etag,
        "last_modified": check.last_modified,
        "content_length": check.content_length,
        "content_type": check.content_type,
        "local_mtime": check.local_mtime.isoformat() if check.local_mtime else None,
        "local_size": check.local_size,
        "local_sha256": check.local_sha256,
        "metadata": update_check_payload(check).get("metadata", {}),
    }
    dataset.metadata_json = json.dumps(metadata, indent=2, default=_json_default)
|
|
|
|
|
|
def _source_remote_metadata(source: Source) -> dict:
    """Collect update-check metadata for wherever ``source.url`` points.

    ``http``/``https`` URLs are probed over the network; everything else is
    treated as a local file, given either as a ``file://`` URL or a plain path.
    Returns the dict produced by the HTTP or local probe.
    """
    from urllib.parse import unquote  # function-scope: no new module-level import needed

    parsed = urlparse(source.url)
    if parsed.scheme in {"http", "https"}:
        return _http_metadata(source.url)

    if parsed.scheme == "file":
        path = Path(parsed.path)
        if not path.exists():
            # Well-formed file:// URLs percent-encode spaces and non-ASCII
            # characters. The literal path is tried first so URLs that were
            # built without encoding keep resolving exactly as before.
            path = Path(unquote(parsed.path))
    else:
        path = Path(source.url)
    return _local_metadata(path)
|
|
|
|
|
|
def _recover_missing_managed_cache_url(source: Source) -> dict | None:
|
|
parsed = urlparse(source.url)
|
|
if parsed.scheme in {"http", "https"}:
|
|
return None
|
|
path = Path(parsed.path) if parsed.scheme == "file" else Path(source.url)
|
|
if path.exists() or not _is_managed_source_cache_path(path, source.id):
|
|
return None
|
|
replacement = _seed_source_url_for(source)
|
|
if replacement is None:
|
|
return None
|
|
previous_url = source.url
|
|
source.url = replacement
|
|
return {"previous_url": previous_url, "url": replacement}
|
|
|
|
|
|
def _is_managed_source_cache_path(path: Path, source_id: int) -> bool:
    """Tell whether ``path`` lies in the cache directory of source ``source_id``.

    First choice: the resolved path is located under
    ``settings.data_dir / "sources" / "source_<id>"``. If that containment test
    raises ``ValueError``, fall back to a purely textual check for a
    ``sources`` component immediately followed by ``source_<id>`` in ``path``.
    """
    cache_dir_name = f"source_{source_id}"
    try:
        candidate = path.resolve()
        managed_root = (settings.data_dir / "sources" / cache_dir_name).resolve()
        candidate.relative_to(managed_root)
    except ValueError:
        pass
    else:
        return True

    components = path.parts
    return any(
        parent == "sources" and child == cache_dir_name
        for parent, child in zip(components, components[1:])
    )
|
|
|
|
|
|
def _seed_source_url_for(source: Source) -> str | None:
    """Look up an HTTP(S) seed URL for ``source`` in ``example_sources.json``.

    The seed file is read from ``scripts/example_sources.json`` two directory
    levels above this module. A row matches when its ``kind`` equals
    ``source.kind``, its ``country`` (if both sides have one) equals
    ``source.country``, and the normalised name tokens of one side are a subset
    of the other's. Returns the first matching URL, or ``None`` when the file is
    missing/unreadable/not a list, when the source name yields no tokens, or
    when nothing matches.
    """
    seed_path = Path(__file__).resolve().parents[1] / "scripts" / "example_sources.json"
    try:
        rows = json.loads(seed_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(rows, list):
        return None

    source_tokens = set(norm_text(source.name).split())
    if not source_tokens:
        # The empty set is a subset of every row's tokens, so a nameless source
        # would otherwise "match" the first seed row of the same kind.
        return None

    for row in rows:
        if not isinstance(row, dict):
            continue
        url = str(row.get("url") or "")
        if urlparse(url).scheme not in {"http", "https"}:
            continue
        if row.get("kind") != source.kind:
            continue
        if source.country and row.get("country") and str(row.get("country")) != source.country:
            continue
        row_tokens = set(norm_text(row.get("name")).split())
        if row_tokens and (row_tokens <= source_tokens or source_tokens <= row_tokens):
            return url
    return None
|
|
|
|
|
|
def _http_metadata(url: str) -> dict:
    """Fetch response headers for ``url`` and summarise them for an update check.

    A ``HEAD`` request (following redirects) is tried first; if the server
    answers 405 or 501, a streamed ``GET`` is issued instead. Any exception,
    including a non-success status, yields ``{"status": "error", "error": ...}``.
    On success the dict has ``status == "checked"`` plus the ETag,
    Last-Modified, Content-Length (as ``int`` when purely numeric), Content-Type,
    final URL and the artifact classification.
    """
    reply = None
    try:
        reply = requests.head(url, allow_redirects=True, timeout=30)
        if reply.status_code in (405, 501):
            # HEAD was refused: retry with GET, streamed so that only the
            # headers need to be read here.
            reply.close()
            reply = requests.get(url, stream=True, timeout=30)
        reply.raise_for_status()
    except Exception as exc:  # noqa: BLE001 - persisted as update-check status
        return {"status": "error", "error": str(exc)}
    finally:
        if reply is not None:
            reply.close()

    # Headers stay readable after the response has been closed above.
    headers = reply.headers
    mime = headers.get("Content-Type")
    raw_length = headers.get("Content-Length")
    if raw_length and raw_length.isdigit():
        size = int(raw_length)
    else:
        size = None

    return {
        "status": "checked",
        "etag": headers.get("ETag"),
        "last_modified": headers.get("Last-Modified"),
        "content_length": size,
        "content_type": mime,
        "final_url": reply.url,
        "update_artifact": _update_artifact(url, mime),
    }
|
|
|
|
|
|
def _local_metadata(path: Path) -> dict:
|
|
if not path.exists():
|
|
return {"status": "error", "error": f"Source file does not exist: {path}"}
|
|
stat = path.stat()
|
|
return {
|
|
"status": "checked",
|
|
"local_mtime": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc),
|
|
"local_size": stat.st_size,
|
|
"local_sha256": sha256_file(path),
|
|
"update_artifact": _update_artifact(str(path), None),
|
|
}
|
|
|
|
|
|
def _update_decision(active_dataset: Dataset | None, remote: dict) -> tuple[bool, str]:
|
|
if remote["status"] != "checked":
|
|
return False, remote.get("error") or "update check failed"
|
|
if active_dataset is None:
|
|
return True, "no active dataset imported"
|
|
if remote.get("local_sha256"):
|
|
if remote["local_sha256"] == active_dataset.sha256:
|
|
return False, "local file hash matches active dataset"
|
|
return True, "local file hash differs from active dataset"
|
|
|
|
previous = _dataset_update_metadata(active_dataset)
|
|
comparable = []
|
|
for key in ("etag", "last_modified", "content_length"):
|
|
current = remote.get(key)
|
|
old = previous.get(key)
|
|
if current is not None and old is not None:
|
|
comparable.append(key)
|
|
if str(current) != str(old):
|
|
return True, f"remote {key} changed"
|
|
if comparable:
|
|
return False, "remote metadata matches active dataset"
|
|
return True, "no previous remote metadata recorded"
|
|
|
|
|
|
def _dataset_update_metadata(dataset: Dataset) -> dict:
|
|
try:
|
|
metadata = json.loads(dataset.metadata_json or "{}")
|
|
except json.JSONDecodeError:
|
|
return {}
|
|
return metadata.get("source_update_check") or {}
|
|
|
|
|
|
def _json_default(value):
|
|
if isinstance(value, datetime):
|
|
return value.isoformat()
|
|
raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")
|
|
|
|
|
|
def _update_artifact(url_or_path: str, content_type: str | None) -> dict:
|
|
lower = url_or_path.lower()
|
|
is_osm_diff = lower.endswith(".osc") or lower.endswith(".osc.gz")
|
|
is_gtfs_zip = lower.endswith(".zip") or (content_type or "").lower() in {"application/zip", "application/x-zip-compressed"}
|
|
return {
|
|
"kind": "osm_diff" if is_osm_diff else "gtfs_or_archive" if is_gtfs_zip else "full_snapshot",
|
|
"is_diff": is_osm_diff,
|
|
"content_type": content_type,
|
|
}
|