509 lines
18 KiB
Python
509 lines
18 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import math
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Callable
|
|
|
|
import osmium
|
|
from sqlalchemy import delete, func, select, text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.models import Dataset, OsmAddress
|
|
from app.pipeline.routing_layer import active_routing_dataset
|
|
from app.spatial import analyze_postgresql_tables, refresh_postgis_geometries
|
|
|
|
|
|
ProgressCallback = Callable[[str, str, int | None, int | None, dict[str, object] | None], None]
|
|
ADDRESS_INDEX_VERSION = "osm_addresses_v2_nodes_ways_area_geometry"
|
|
ADDRESS_TAGS = {
|
|
"addr:housenumber",
|
|
"addr:housename",
|
|
"addr:street",
|
|
"addr:place",
|
|
"addr:postcode",
|
|
"addr:city",
|
|
"addr:country",
|
|
"addr:unit",
|
|
"addr:suburb",
|
|
"addr:district",
|
|
"addr:municipality",
|
|
"entrance",
|
|
"name",
|
|
}
|
|
|
|
|
|
@dataclass
|
|
class AddressIndexResult:
|
|
dataset_id: int
|
|
input_path: str
|
|
addresses: int
|
|
node_addresses: int
|
|
way_addresses: int
|
|
skipped: int
|
|
version: str = ADDRESS_INDEX_VERSION
|
|
|
|
def as_dict(self) -> dict[str, object]:
|
|
return {
|
|
"version": self.version,
|
|
"dataset_id": self.dataset_id,
|
|
"input_path": self.input_path,
|
|
"addresses": self.addresses,
|
|
"node_addresses": self.node_addresses,
|
|
"way_addresses": self.way_addresses,
|
|
"skipped": self.skipped,
|
|
}
|
|
|
|
|
|
def rebuild_address_index(
|
|
session: Session,
|
|
*,
|
|
dataset_id: int | None = None,
|
|
input_path: str | Path | None = None,
|
|
reset: bool = True,
|
|
batch_size: int = 20_000,
|
|
progress_callback: ProgressCallback | None = None,
|
|
) -> dict[str, object]:
|
|
dataset = session.get(Dataset, dataset_id) if dataset_id is not None else active_routing_dataset(session)
|
|
if dataset is None:
|
|
raise ValueError("No OSM PBF dataset is available for address indexing.")
|
|
path = Path(input_path or dataset.local_path)
|
|
if not path.exists():
|
|
raise FileNotFoundError(f"Address index PBF does not exist: {path}")
|
|
|
|
if reset:
|
|
_emit(progress_callback, "address_index_clear_started", "Clearing existing OSM address index.", None, None, {"dataset_id": dataset.id})
|
|
_clear_address_rows(session, dataset_id=int(dataset.id))
|
|
session.commit()
|
|
|
|
if settings.is_postgresql_database:
|
|
_emit(progress_callback, "address_index_indexes_dropped", "Dropping address lookup indexes before bulk import.", None, None, {"dataset_id": dataset.id})
|
|
_drop_address_indexes(session)
|
|
session.commit()
|
|
|
|
_emit(progress_callback, "address_index_import_started", "Importing OSM address nodes and ways.", None, None, {"dataset_id": dataset.id, "path": str(path)})
|
|
handler = _AddressHandler(
|
|
session=session,
|
|
dataset_id=dataset.id,
|
|
batch_size=batch_size,
|
|
progress_callback=progress_callback,
|
|
)
|
|
if hasattr(osmium, "FileProcessor"):
|
|
_apply_address_file_processor(handler, path)
|
|
else:
|
|
handler.apply_file(str(path), locations=True)
|
|
handler.flush()
|
|
|
|
return finalize_address_index(
|
|
session,
|
|
dataset_id=dataset.id,
|
|
input_path=path,
|
|
node_addresses=handler.node_address_count,
|
|
way_addresses=handler.way_address_count,
|
|
skipped=handler.skipped_count,
|
|
progress_callback=progress_callback,
|
|
)
|
|
|
|
|
|
def finalize_address_index(
|
|
session: Session,
|
|
*,
|
|
dataset_id: int,
|
|
input_path: str | Path,
|
|
node_addresses: int = 0,
|
|
way_addresses: int = 0,
|
|
skipped: int = 0,
|
|
progress_callback: ProgressCallback | None = None,
|
|
) -> dict[str, object]:
|
|
dataset = session.get(Dataset, dataset_id)
|
|
if dataset is None:
|
|
raise ValueError("Address index dataset does not exist.")
|
|
if settings.is_postgresql_database:
|
|
_emit(progress_callback, "address_index_geometry_started", "Refreshing address point geometries.", None, None, {"dataset_id": dataset.id})
|
|
refresh_postgis_geometries(session, dataset_id=dataset.id, tables=["osm_addresses"], only_missing=False)
|
|
session.commit()
|
|
_emit(progress_callback, "address_index_indexes_started", "Rebuilding address lookup indexes.", None, None, {"dataset_id": dataset.id})
|
|
_create_address_indexes(session)
|
|
session.commit()
|
|
analyze_postgresql_tables(session, ["osm_addresses"])
|
|
address_count = int(session.scalar(select(func.count()).select_from(OsmAddress).where(OsmAddress.dataset_id == dataset.id)) or 0)
|
|
metadata = _metadata(dataset)
|
|
metadata["address_index"] = {
|
|
"version": ADDRESS_INDEX_VERSION,
|
|
"addresses": address_count,
|
|
"node_addresses": int(node_addresses),
|
|
"way_addresses": int(way_addresses),
|
|
"skipped": int(skipped),
|
|
"input_path": str(input_path),
|
|
}
|
|
dataset.metadata_json = json.dumps(metadata, indent=2)
|
|
session.commit()
|
|
result = AddressIndexResult(
|
|
dataset_id=dataset.id,
|
|
input_path=str(input_path),
|
|
addresses=address_count,
|
|
node_addresses=node_addresses,
|
|
way_addresses=way_addresses,
|
|
skipped=skipped,
|
|
).as_dict()
|
|
_emit(progress_callback, "address_index_import_completed", "OSM address index import completed.", address_count, address_count, result)
|
|
return result
|
|
|
|
|
|
def _clear_address_rows(session: Session, *, dataset_id: int) -> None:
|
|
if settings.is_postgresql_database:
|
|
other_dataset_count = int(
|
|
session.scalar(
|
|
select(func.count(func.distinct(OsmAddress.dataset_id))).where(OsmAddress.dataset_id != int(dataset_id))
|
|
)
|
|
or 0
|
|
)
|
|
if other_dataset_count == 0:
|
|
session.execute(text("TRUNCATE TABLE osm_addresses RESTART IDENTITY"))
|
|
return
|
|
session.execute(delete(OsmAddress).where(OsmAddress.dataset_id == int(dataset_id)))
|
|
|
|
|
|
def address_index_status(session: Session) -> dict[str, object]:
|
|
dataset = active_routing_dataset(session)
|
|
dataset_id = None if dataset is None else int(dataset.id)
|
|
address_count = 0
|
|
metadata: dict[str, object] = {}
|
|
if dataset is not None:
|
|
metadata = _metadata(dataset).get("address_index") or {}
|
|
if isinstance(metadata, dict):
|
|
try:
|
|
address_count = int(metadata.get("addresses") or 0)
|
|
except (TypeError, ValueError):
|
|
address_count = 0
|
|
if not address_count:
|
|
address_count = int(session.scalar(select(func.count()).select_from(OsmAddress).where(OsmAddress.dataset_id == dataset.id)) or 0)
|
|
installed_version = metadata.get("version") if isinstance(metadata, dict) else None
|
|
return {
|
|
"dataset_id": dataset_id,
|
|
"addresses": address_count,
|
|
"available": address_count > 0,
|
|
"version": installed_version,
|
|
"current_version": ADDRESS_INDEX_VERSION,
|
|
"stale": bool(address_count and installed_version != ADDRESS_INDEX_VERSION),
|
|
"input_path": metadata.get("input_path") if isinstance(metadata, dict) else None,
|
|
}
|
|
|
|
|
|
class _AddressHandler(osmium.SimpleHandler):
|
|
def __init__(
|
|
self,
|
|
*,
|
|
session: Session,
|
|
dataset_id: int,
|
|
batch_size: int,
|
|
progress_callback: ProgressCallback | None,
|
|
) -> None:
|
|
super().__init__()
|
|
self.session = session
|
|
self.dataset_id = int(dataset_id)
|
|
self.batch_size = max(1_000, int(batch_size))
|
|
self.progress_callback = progress_callback
|
|
self.rows: list[dict[str, object]] = []
|
|
self.address_count = 0
|
|
self.node_address_count = 0
|
|
self.way_address_count = 0
|
|
self.skipped_count = 0
|
|
self.processed_count = 0
|
|
|
|
def node(self, node) -> None:
|
|
self.process_node(node)
|
|
|
|
def way(self, way) -> None:
|
|
self.process_way(way)
|
|
|
|
def process_object(self, obj) -> None:
|
|
if hasattr(obj, "nodes"):
|
|
self.process_way(obj)
|
|
elif hasattr(obj, "location"):
|
|
self.process_node(obj)
|
|
|
|
def process_node(self, node) -> None:
|
|
self.processed_count += 1
|
|
tags = {tag.k: tag.v for tag in node.tags}
|
|
if not _has_address(tags):
|
|
return
|
|
if not node.location.valid():
|
|
self.skipped_count += 1
|
|
return
|
|
row = _address_row(
|
|
dataset_id=self.dataset_id,
|
|
osm_type="node",
|
|
osm_id=str(node.id),
|
|
tags=tags,
|
|
lon=float(node.location.lon),
|
|
lat=float(node.location.lat),
|
|
bounds=(float(node.location.lon), float(node.location.lat), float(node.location.lon), float(node.location.lat)),
|
|
geometry_geojson=None,
|
|
)
|
|
if row is None:
|
|
self.skipped_count += 1
|
|
return
|
|
self.rows.append(row)
|
|
self.node_address_count += 1
|
|
self._after_address()
|
|
|
|
def process_way(self, way) -> None:
|
|
self.processed_count += 1
|
|
tags = {tag.k: tag.v for tag in way.tags}
|
|
if not _has_address(tags):
|
|
return
|
|
coords = [
|
|
(float(node.location.lon), float(node.location.lat))
|
|
for node in way.nodes
|
|
if node.location.valid()
|
|
]
|
|
if not coords:
|
|
self.skipped_count += 1
|
|
return
|
|
lon, lat = _centroid(coords)
|
|
min_lon = min(coord[0] for coord in coords)
|
|
max_lon = max(coord[0] for coord in coords)
|
|
min_lat = min(coord[1] for coord in coords)
|
|
max_lat = max(coord[1] for coord in coords)
|
|
row = _address_row(
|
|
dataset_id=self.dataset_id,
|
|
osm_type="way",
|
|
osm_id=str(way.id),
|
|
tags=tags,
|
|
lon=lon,
|
|
lat=lat,
|
|
bounds=(min_lon, min_lat, max_lon, max_lat),
|
|
geometry_geojson=_address_area_geometry_geojson(coords, closed=_way_is_closed(way)),
|
|
)
|
|
if row is None:
|
|
self.skipped_count += 1
|
|
return
|
|
self.rows.append(row)
|
|
self.way_address_count += 1
|
|
self._after_address()
|
|
|
|
def _after_address(self) -> None:
|
|
self.address_count += 1
|
|
if len(self.rows) >= self.batch_size:
|
|
self.flush()
|
|
if self.address_count % 50_000 == 0:
|
|
_emit(
|
|
self.progress_callback,
|
|
"address_index_import_batch",
|
|
f"Imported {self.address_count:,} OSM addresses.",
|
|
self.address_count,
|
|
None,
|
|
{"processed": self.processed_count, "skipped": self.skipped_count},
|
|
)
|
|
|
|
def flush(self) -> None:
|
|
if not self.rows:
|
|
return
|
|
self.session.bulk_insert_mappings(OsmAddress, self.rows)
|
|
self.session.commit()
|
|
self.rows = []
|
|
|
|
|
|
def _apply_address_file_processor(handler: _AddressHandler, path: Path) -> None:
|
|
processor = (
|
|
osmium.FileProcessor(str(path), osmium.osm.NODE | osmium.osm.WAY)
|
|
.with_locations()
|
|
.with_filter(osmium.filter.KeyFilter("addr:housenumber", "addr:housename"))
|
|
)
|
|
for obj in processor:
|
|
handler.process_object(obj)
|
|
|
|
|
|
def _has_address(tags: dict[str, str]) -> bool:
|
|
housenumber = _clean(tags.get("addr:housenumber") or tags.get("addr:housename"))
|
|
if not housenumber:
|
|
return False
|
|
return any(_clean(tags.get(key)) for key in ("addr:street", "addr:place", "addr:city", "addr:postcode"))
|
|
|
|
|
|
def _address_row(
|
|
*,
|
|
dataset_id: int,
|
|
osm_type: str,
|
|
osm_id: str,
|
|
tags: dict[str, str],
|
|
lon: float,
|
|
lat: float,
|
|
bounds: tuple[float, float, float, float],
|
|
geometry_geojson: str | None = None,
|
|
) -> dict[str, object] | None:
|
|
housenumber = _clean(tags.get("addr:housenumber") or tags.get("addr:housename"))
|
|
street = _clean(tags.get("addr:street"))
|
|
place = _clean(tags.get("addr:place"))
|
|
postcode = _clean(tags.get("addr:postcode"))
|
|
city = _clean(tags.get("addr:city") or tags.get("addr:municipality"))
|
|
country = _clean(tags.get("addr:country"))
|
|
unit = _clean(tags.get("addr:unit"))
|
|
name = _clean(tags.get("name"))
|
|
display_name = _display_name(housenumber=housenumber, street=street, place=place, postcode=postcode, city=city, name=name)
|
|
if not display_name:
|
|
return None
|
|
search_text = _search_text(display_name, housenumber, street, place, postcode, city, country, unit, name)
|
|
selected_tags = {key: tags[key] for key in sorted(ADDRESS_TAGS) if key in tags}
|
|
min_lon, min_lat, max_lon, max_lat = bounds
|
|
return {
|
|
"dataset_id": dataset_id,
|
|
"osm_type": osm_type,
|
|
"osm_id": osm_id,
|
|
"housenumber": housenumber,
|
|
"street": street,
|
|
"place": place,
|
|
"postcode": postcode,
|
|
"city": city,
|
|
"country": country,
|
|
"unit": unit,
|
|
"name": name,
|
|
"display_name": display_name,
|
|
"search_text": search_text,
|
|
"lon": lon,
|
|
"lat": lat,
|
|
"min_lon": min_lon,
|
|
"min_lat": min_lat,
|
|
"max_lon": max_lon,
|
|
"max_lat": max_lat,
|
|
"geometry_geojson": geometry_geojson,
|
|
"tags_json": json.dumps(selected_tags, separators=(",", ":")) if selected_tags else None,
|
|
}
|
|
|
|
|
|
def _address_area_geometry_geojson(coords: list[tuple[float, float]], *, closed: bool | None = None) -> str | None:
|
|
if closed is False:
|
|
return None
|
|
if len(coords) < 3:
|
|
return None
|
|
ring_coords = list(coords)
|
|
first = ring_coords[0]
|
|
last = ring_coords[-1]
|
|
already_closed = abs(first[0] - last[0]) <= 1e-12 and abs(first[1] - last[1]) <= 1e-12
|
|
if not already_closed:
|
|
if closed is not True:
|
|
return None
|
|
ring_coords.append(first)
|
|
if len(ring_coords) < 4:
|
|
return None
|
|
ring = [[float(lon), float(lat)] for lon, lat in ring_coords]
|
|
if len({(round(lon, 12), round(lat, 12)) for lon, lat in ring_coords[:-1]}) < 3:
|
|
return None
|
|
return json.dumps({"type": "Polygon", "coordinates": [ring]}, separators=(",", ":"))
|
|
|
|
|
|
def _way_is_closed(way) -> bool:
|
|
try:
|
|
nodes = way.nodes
|
|
return len(nodes) >= 3 and nodes[0].ref == nodes[-1].ref
|
|
except (AttributeError, IndexError, TypeError):
|
|
return False
|
|
|
|
|
|
def _display_name(
|
|
*,
|
|
housenumber: str | None,
|
|
street: str | None,
|
|
place: str | None,
|
|
postcode: str | None,
|
|
city: str | None,
|
|
name: str | None,
|
|
) -> str | None:
|
|
road = street or place or name
|
|
if road and housenumber:
|
|
first = f"{road} {housenumber}"
|
|
else:
|
|
first = road or housenumber
|
|
locality = " ".join(part for part in [postcode, city] if part)
|
|
if first and locality:
|
|
return f"{first}, {locality}"
|
|
return first or locality
|
|
|
|
|
|
def _search_text(*parts: str | None) -> str:
|
|
return re.sub(r"\s+", " ", " ".join(part.casefold() for part in parts if part)).strip()
|
|
|
|
|
|
def _clean(value: object) -> str | None:
|
|
cleaned = re.sub(r"\s+", " ", str(value or "")).strip()
|
|
return cleaned or None
|
|
|
|
|
|
def _centroid(coords: list[tuple[float, float]]) -> tuple[float, float]:
|
|
if len(coords) >= 4 and coords[0] == coords[-1]:
|
|
area = 0.0
|
|
cx = 0.0
|
|
cy = 0.0
|
|
for (x1, y1), (x2, y2) in zip(coords, coords[1:]):
|
|
cross = x1 * y2 - x2 * y1
|
|
area += cross
|
|
cx += (x1 + x2) * cross
|
|
cy += (y1 + y2) * cross
|
|
if abs(area) > 1e-18:
|
|
factor = 1 / (3 * area)
|
|
return cx * factor, cy * factor
|
|
return (
|
|
math.fsum(coord[0] for coord in coords) / len(coords),
|
|
math.fsum(coord[1] for coord in coords) / len(coords),
|
|
)
|
|
|
|
|
|
def _drop_address_indexes(session: Session) -> None:
|
|
for name in [
|
|
"ix_osm_addresses_dataset_city_street",
|
|
"ix_osm_addresses_dataset_postcode",
|
|
"ix_osm_addresses_bbox",
|
|
"ix_osm_addresses_geom_gist",
|
|
"ix_osm_addresses_area_geom_gist",
|
|
"ix_osm_addresses_search_trgm",
|
|
"ix_osm_addresses_display_trgm",
|
|
"ix_osm_addresses_street_key_house",
|
|
"ix_osm_addresses_street_key_trgm",
|
|
]:
|
|
session.execute(text(f"DROP INDEX IF EXISTS {name}"))
|
|
|
|
|
|
def _create_address_indexes(session: Session) -> None:
|
|
statements = [
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_dataset_city_street ON osm_addresses (dataset_id, city, street, housenumber)",
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_dataset_postcode ON osm_addresses (dataset_id, postcode)",
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_bbox ON osm_addresses (dataset_id, min_lon, max_lon, min_lat, max_lat)",
|
|
]
|
|
if settings.is_postgresql_database:
|
|
statements.extend(
|
|
[
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_geom_gist ON osm_addresses USING GIST (geom)",
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_area_geom_gist ON osm_addresses USING GIST (area_geom)",
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_search_trgm ON osm_addresses USING GIN (LOWER(COALESCE(search_text, '')) gin_trgm_ops)",
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_display_trgm ON osm_addresses USING GIN (LOWER(COALESCE(display_name, '')) gin_trgm_ops)",
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_street_key_house ON osm_addresses (dataset_id, REPLACE(LOWER(COALESCE(NULLIF(street, ''), NULLIF(place, ''), '')), 'ß', 'ss'), housenumber)",
|
|
"CREATE INDEX IF NOT EXISTS ix_osm_addresses_street_key_trgm ON osm_addresses USING GIN (REPLACE(LOWER(COALESCE(NULLIF(street, ''), NULLIF(place, ''), '')), 'ß', 'ss') gin_trgm_ops)",
|
|
]
|
|
)
|
|
for statement in statements:
|
|
session.execute(text(statement))
|
|
|
|
|
|
def _metadata(dataset: Dataset) -> dict[str, object]:
|
|
try:
|
|
value = json.loads(dataset.metadata_json or "{}")
|
|
except json.JSONDecodeError:
|
|
return {}
|
|
return value if isinstance(value, dict) else {}
|
|
|
|
|
|
def _emit(
|
|
progress_callback: ProgressCallback | None,
|
|
event_type: str,
|
|
message: str,
|
|
progress_current: int | None,
|
|
progress_total: int | None,
|
|
metadata: dict[str, object] | None = None,
|
|
) -> None:
|
|
if progress_callback is not None:
|
|
progress_callback(event_type, message, progress_current, progress_total, metadata)
|