Files
meubility-workbench/app/pipeline/utils.py
2026-07-01 23:29:51 +02:00

90 lines
2.9 KiB
Python

from __future__ import annotations
import hashlib
import json
import re
from pathlib import Path
from typing import Iterable, Optional
from shapely.geometry import shape
def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in 1 MiB pieces, so the
    whole content is never held in memory at once.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(chunk_size)
            if not piece:
                # An empty read marks end-of-file.
                break
            digest.update(piece)
    return digest.hexdigest()
def norm_text(value: object) -> str:
    """Normalize *value* into a lowercase, space-separated ASCII token string.

    ``None`` yields ``""``. Any other value is stringified, lowercased and
    stripped; "ß" is spelled out as "ss"; every run of characters outside
    ``a-z0-9`` becomes one space; leading/trailing spaces are removed.

    Note that non-ASCII letters other than "ß" (for example umlauts) are
    treated as separators, not transliterated.
    """
    if value is None:
        return ""
    lowered = str(value).lower().strip()
    ascii_ready = lowered.replace("ß", "ss")
    spaced = re.sub(r"[^a-z0-9]+", " ", ascii_ready)
    # After the substitution only single spaces separate the tokens, so
    # split/join just drops a possible leading or trailing space.
    return " ".join(spaced.split())
def norm_ref(value: object) -> str:
    """Reduce *value* to a compact lowercase reference key.

    ``None`` yields ``""``. Otherwise the value is stringified, lowercased,
    and every character outside ``a-z0-9`` is removed (no separators are
    kept).
    """
    lowered = "" if value is None else str(value).lower()
    return re.sub(r"[^a-z0-9]+", "", lowered)
def first_nonempty(*values: object) -> str:
    """Return the first argument whose stripped string form is non-empty.

    ``None`` arguments are skipped. Every other argument is converted with
    ``str()`` and stripped of surrounding whitespace. If nothing qualifies,
    ``""`` is returned.
    """
    # Lazy pipeline: conversion stops as soon as a usable text is found.
    stripped = (str(candidate).strip() for candidate in values if candidate is not None)
    return next((text for text in stripped if text), "")
def geometry_json_and_bbox(geometry: object) -> tuple[Optional[str], tuple[Optional[float], Optional[float], Optional[float], Optional[float]]]:
    """Serialize a geometry to compact GeoJSON text and report its bounds.

    A ``dict`` input is converted with ``shape()``; any other non-``None``
    input is used as-is and must expose ``is_empty``, ``bounds`` and
    ``__geo_interface__``.

    Returns:
        ``(geojson_text, bounds)`` where ``bounds`` is the four-value tuple
        from ``.bounds`` (treated here as min_lon, min_lat, max_lon,
        max_lat). When the input is ``None``, empty, or anything fails
        during conversion or serialization, the result is
        ``(None, (None, None, None, None))``.
    """
    nothing = (None, (None, None, None, None))
    if geometry is None:
        return nothing
    try:
        parsed = geometry if not isinstance(geometry, dict) else shape(geometry)
        if parsed.is_empty:
            return nothing
        lon_lo, lat_lo, lon_hi, lat_hi = parsed.bounds
        # Compact separators keep the stored JSON free of padding spaces.
        encoded = json.dumps(parsed.__geo_interface__, separators=(",", ":"))
    except Exception:
        # Best-effort by design: unusable geometries are reported as absent.
        return nothing
    return encoded, (lon_lo, lat_lo, lon_hi, lat_hi)
def bbox_overlap(a: tuple[float | None, float | None, float | None, float | None], b: tuple[float | None, float | None, float | None, float | None]) -> bool:
    """Tell whether two ``(minx, miny, maxx, maxy)`` boxes intersect.

    Boxes that merely touch along an edge or corner count as overlapping.
    If any coordinate of either box is ``None`` the result is ``False``.
    """
    for coordinate in (*a, *b):
        if coordinate is None:
            return False
    a_min_x, a_min_y, a_max_x, a_max_y = a  # type: ignore[misc]
    b_min_x, b_min_y, b_max_x, b_max_y = b  # type: ignore[misc]
    # The boxes are disjoint exactly when a gap exists along one axis.
    gap_on_x = a_max_x < b_min_x or b_max_x < a_min_x
    gap_on_y = a_max_y < b_min_y or b_max_y < a_min_y
    return not (gap_on_x or gap_on_y)
def bbox_center(b: tuple[float | None, float | None, float | None, float | None]) -> Optional[tuple[float, float]]:
    """Return the midpoint ``(x, y)`` of a ``(minx, miny, maxx, maxy)`` box.

    Returns ``None`` when any of the four coordinates is ``None``.
    """
    for coordinate in b:
        if coordinate is None:
            return None
    x_lo, y_lo, x_hi, y_hi = b  # type: ignore[misc]
    mid_x = (x_lo + x_hi) / 2
    mid_y = (y_lo + y_hi) / 2
    return (mid_x, mid_y)
def approx_bbox_center_distance_deg(a: tuple[float | None, float | None, float | None, float | None], b: tuple[float | None, float | None, float | None, float | None]) -> Optional[float]:
    """Return the straight-line distance between the centers of two boxes.

    The value is a plain Euclidean distance in the coordinates' own units
    (degrees, going by the function name); no spherical correction is
    applied. Returns ``None`` if either box has no center, i.e. contains a
    ``None`` coordinate.
    """
    center_a = bbox_center(a)
    center_b = bbox_center(b)
    if center_a is None or center_b is None:
        return None
    delta_x = center_a[0] - center_b[0]
    delta_y = center_a[1] - center_b[1]
    return (delta_x ** 2 + delta_y ** 2) ** 0.5
def batched(iterable: Iterable[dict], batch_size: int = 1000) -> Iterable[list[dict]]:
    """Yield consecutive lists of at most *batch_size* items from *iterable*.

    Items keep their original order. Every batch except possibly the last
    holds exactly *batch_size* items; an empty input yields nothing.

    Raises:
        ValueError: if *batch_size* is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts, not when
            the function is called.
    """
    # A non-positive size used to degrade silently into one-item batches
    # (the length check below is true after every append); reject it instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")
    batch: list[dict] = []
    for item in iterable:
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            # Start a fresh list so already-yielded batches are never mutated.
            batch = []
    if batch:
        # Trailing partial batch.
        yield batch