# File: multi-seal-mail/server/app/storage/services.py
# Listing metadata: 751 lines, 27 KiB, Python

from __future__ import annotations
import hashlib
import mimetypes
import re
import zipfile
from dataclasses import dataclass
from datetime import datetime, timezone
from io import BytesIO
from pathlib import PurePosixPath
from typing import Any, Iterable
from uuid import uuid4
from sqlalchemy import or_
from sqlalchemy.orm import Session
from app.db.models import (
Campaign,
CampaignAttachmentUse,
CampaignJob,
FileAsset,
FileBlob,
FileFolder,
FileShare,
FileVersion,
Group,
UserGroupMembership,
)
from app.settings import settings
from app.storage.backends import get_storage_backend
from app.storage.paths import filename_from_path, join_folder_filename, normalize_folder, normalize_logical_path, safe_storage_component
class FileStorageError(RuntimeError):
    """Error raised by the file storage services for lookup, access and validation failures."""
@dataclass(slots=True)
class UploadedStoredFile:
asset: FileAsset
version: FileVersion
blob: FileBlob
@dataclass(slots=True)
class ResolvedPattern:
pattern: str
matches: list[FileAsset]
def utcnow() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    return datetime.now(tz=timezone.utc)
def user_group_ids(session: Session, *, tenant_id: str, user_id: str, include_admin_groups: bool = False) -> list[str]:
    """Return the ids of the groups whose file spaces the user may act on.

    With ``include_admin_groups`` every group of the tenant is returned,
    ordered by name. Otherwise only the groups the user has a membership
    row for are returned.
    """
    if not include_admin_groups:
        memberships = (
            session.query(UserGroupMembership)
            .filter(UserGroupMembership.tenant_id == tenant_id, UserGroupMembership.user_id == user_id)
            .all()
        )
        return [membership.group_id for membership in memberships]

    tenant_groups = session.query(Group).filter(Group.tenant_id == tenant_id).order_by(Group.name.asc()).all()
    return [group.id for group in tenant_groups]
def ensure_group_access(session: Session, *, tenant_id: str, group_id: str, user_id: str, is_admin: bool = False) -> None:
    """Verify that the user may use the file space of a group.

    Raises:
        FileStorageError: if the group does not exist in this tenant, or the
            user is neither an admin nor a member of the group.
    """
    group = session.get(Group, group_id)
    if not group or group.tenant_id != tenant_id:
        raise FileStorageError("Group not found")

    if not is_admin:
        member_row = (
            session.query(UserGroupMembership)
            .filter(
                UserGroupMembership.tenant_id == tenant_id,
                UserGroupMembership.user_id == user_id,
                UserGroupMembership.group_id == group_id,
            )
            .one_or_none()
        )
        if member_row is None:
            raise FileStorageError("No access to this group file space")
def _owner_filter(query, owner_type: str, owner_id: str):
    """Restrict a ``FileFolder`` query to one owner.

    Raises:
        FileStorageError: if ``owner_type`` is neither ``"user"`` nor ``"group"``.
    """
    if owner_type == "user":
        owner_column = FileFolder.owner_user_id
    elif owner_type == "group":
        owner_column = FileFolder.owner_group_id
    else:
        raise FileStorageError("Unsupported owner type")
    return query.filter(owner_column == owner_id)
def ensure_owner_access(session: Session, *, tenant_id: str, owner_type: str, owner_id: str, user_id: str, is_admin: bool = False) -> None:
    """Verify that the user may act on the file space of the given owner.

    A user file space is accessible to that user and to admins. A group file
    space is checked through ``ensure_group_access``.

    Raises:
        FileStorageError: on missing access or an unsupported owner type.
    """
    kind = owner_type.lower().strip()
    if kind == "group":
        ensure_group_access(session, tenant_id=tenant_id, group_id=owner_id, user_id=user_id, is_admin=is_admin)
        return
    if kind != "user":
        raise FileStorageError("Files must be owned by a user or group")
    if not is_admin and owner_id != user_id:
        raise FileStorageError("No access to this user file space")
def create_folder(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    path: str,
    is_admin: bool = False,
) -> FileFolder:
    """Create a folder in an owner's file space, or return the existing one.

    If a folder row with the same normalized path already exists it is
    returned; a soft-deleted row is restored by clearing ``deleted_at``.

    Raises:
        FileStorageError: if access is denied or the normalized path is empty.
    """
    owner_type = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id, user_id=user_id, is_admin=is_admin)
    normalized = normalize_folder(path)
    if not normalized:
        raise FileStorageError("Folder path is required")

    query = session.query(FileFolder).filter(FileFolder.tenant_id == tenant_id, FileFolder.owner_type == owner_type, FileFolder.path == normalized)
    query = _owner_filter(query, owner_type, owner_id)
    candidates = query.order_by(FileFolder.deleted_at.asc()).all()

    # Where NULLs sort differs between database backends, so do not rely on
    # ORDER BY to put the live row (deleted_at IS NULL) first: pick it
    # explicitly and only fall back to a soft-deleted row when none is live.
    existing = next((row for row in candidates if row.deleted_at is None), None)
    if existing is None and candidates:
        existing = candidates[0]

    if existing is not None:
        if existing.deleted_at is not None:
            existing.deleted_at = None
            session.add(existing)
        return existing

    folder = FileFolder(
        tenant_id=tenant_id,
        owner_type=owner_type,
        owner_user_id=owner_id if owner_type == "user" else None,
        owner_group_id=owner_id if owner_type == "group" else None,
        path=normalized,
        created_by_user_id=user_id,
        metadata_={},
    )
    session.add(folder)
    session.flush()
    return folder
def list_folders_for_user(
    session: Session,
    *,
    tenant_id: str,
    user_id: str,
    owner_type: str,
    owner_id: str,
    include_deleted: bool = False,
    is_admin: bool = False,
) -> list[FileFolder]:
    """List the folders of one owner's file space, ordered by path.

    Soft-deleted folders are left out unless ``include_deleted`` is set.

    Raises:
        FileStorageError: if the user has no access to the owner's file space.
    """
    kind = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=kind, owner_id=owner_id, user_id=user_id, is_admin=is_admin)

    folders = session.query(FileFolder).filter(FileFolder.tenant_id == tenant_id, FileFolder.owner_type == kind)
    folders = _owner_filter(folders, kind, owner_id)
    if not include_deleted:
        folders = folders.filter(FileFolder.deleted_at.is_(None))
    return folders.order_by(FileFolder.path.asc()).all()
def soft_delete_folder(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    path: str,
    recursive: bool = True,
    is_admin: bool = False,
) -> tuple[int, int]:
    """Soft-delete a folder and, when recursive, everything below it.

    Returns:
        ``(folders_deleted, assets_deleted)``. Assets are only touched in
        recursive mode.

    Raises:
        FileStorageError: if access is denied, the normalized path is empty,
            or a non-recursive delete targets a folder that still has live
            sub-folders or live files.
    """
    owner_type = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id, user_id=user_id, is_admin=is_admin)
    normalized = normalize_folder(path)
    if not normalized:
        raise FileStorageError("Folder path is required")
    prefix = f"{normalized}/"
    now = utcnow()

    live_folders = session.query(FileFolder).filter(FileFolder.tenant_id == tenant_id, FileFolder.owner_type == owner_type, FileFolder.deleted_at.is_(None))
    live_folders = _owner_filter(live_folders, owner_type, owner_id)

    # autoescape keeps "%" and "_" inside the folder name literal; a plain
    # LIKE would treat them as wildcards and reach into sibling folders
    # (e.g. deleting "a_b" would also hit "axb/...").
    folder_below = FileFolder.path.startswith(prefix, autoescape=True)
    live_assets_below = _asset_query_for_owner(session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id).filter(
        FileAsset.deleted_at.is_(None),
        FileAsset.display_path.startswith(prefix, autoescape=True),
    )

    if recursive:
        folder_query = live_folders.filter(or_(FileFolder.path == normalized, folder_below))
    else:
        # Only live content blocks a non-recursive delete; soft-deleted files
        # are ignored, matching the sub-folder check.
        child_exists = live_folders.filter(folder_below).first() is not None
        file_exists = live_assets_below.first() is not None
        if child_exists or file_exists:
            raise FileStorageError("Folder is not empty")
        folder_query = live_folders.filter(FileFolder.path == normalized)

    folders = folder_query.all()
    for folder in folders:
        folder.deleted_at = now
        session.add(folder)

    assets = live_assets_below.all() if recursive else []
    for asset in assets:
        asset.deleted_at = now
        session.add(asset)
    return len(folders), len(assets)
def _asset_query_for_owner(session: Session, *, tenant_id: str, owner_type: str, owner_id: str):
    """Build a ``FileAsset`` query limited to one tenant and one owner.

    Raises:
        FileStorageError: if ``owner_type`` is neither ``"user"`` nor ``"group"``.
    """
    base = session.query(FileAsset).filter(FileAsset.tenant_id == tenant_id, FileAsset.owner_type == owner_type)
    if owner_type == "user":
        owner_column = FileAsset.owner_user_id
    elif owner_type == "group":
        owner_column = FileAsset.owner_group_id
    else:
        raise FileStorageError("Unsupported owner type")
    return base.filter(owner_column == owner_id)
def _storage_bucket_name() -> str:
    """Return the file-storage bucket setting, falling back to the general S3 bucket."""
    dedicated_bucket = settings.file_storage_s3_bucket
    return dedicated_bucket if dedicated_bucket else settings.s3_bucket
def _storage_backend_name() -> str:
    """Return the configured storage backend name, lower-cased and stripped."""
    configured = settings.file_storage_backend
    return configured.lower().strip()
def _storage_key(*, tenant_id: str, checksum: str, filename: str) -> str:
    """Build a fresh storage key for a blob.

    The key is namespaced by tenant, sharded by the first two checksum
    characters, and made unique with a random UUID in front of the sanitized
    filename.
    """
    shard = checksum[:2]
    unique = uuid4().hex
    safe_name = safe_storage_component(filename)
    return f"tenants/{tenant_id}/files/{shard}/{unique}-{safe_name}"
def _get_or_create_blob(
    session: Session,
    *,
    tenant_id: str,
    data: bytes,
    filename: str,
    content_type: str | None,
) -> FileBlob:
    """Return a blob record for ``data``, reusing identical tenant content.

    Content is identified by SHA-256 checksum plus size within the tenant.
    A matching blob has its ``ref_count`` incremented; otherwise the bytes
    are written to the storage backend and a new blob row is flushed.
    """
    digest = hashlib.sha256(data).hexdigest()
    byte_count = len(data)

    known = (
        session.query(FileBlob)
        .filter(FileBlob.tenant_id == tenant_id, FileBlob.checksum_sha256 == digest, FileBlob.size_bytes == byte_count)
        .one_or_none()
    )
    if not known:
        # First time this content is seen for the tenant: store it, then record it.
        key = _storage_key(tenant_id=tenant_id, checksum=digest, filename=filename)
        get_storage_backend().put_bytes(key, data, content_type=content_type)
        created = FileBlob(
            tenant_id=tenant_id,
            storage_backend=_storage_backend_name(),
            storage_bucket=_storage_bucket_name(),
            storage_key=key,
            checksum_sha256=digest,
            size_bytes=byte_count,
            content_type=content_type,
            ref_count=1,
        )
        session.add(created)
        session.flush()
        return created

    known.ref_count += 1
    session.add(known)
    return known
def create_file_asset(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    filename: str,
    data: bytes,
    folder: str | None = None,
    display_path: str | None = None,
    content_type: str | None = None,
    description: str | None = None,
    metadata: dict[str, Any] | None = None,
    campaign_id: str | None = None,
    is_admin: bool = False,
) -> UploadedStoredFile:
    """Store ``data`` as a new file asset with version 1.

    The logical path is ``display_path`` when given, otherwise ``folder``
    joined with the sanitized filename. A missing content type is guessed
    from the filename. When ``campaign_id`` is given the new asset is shared
    read-only with that campaign.

    Raises:
        FileStorageError: if owner access is denied, or sharing with the
            campaign fails.
    """
    kind = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=kind, owner_id=owner_id, user_id=user_id, is_admin=is_admin)

    upload_name = filename_from_path(normalize_logical_path(filename, fallback_filename="file"))
    if display_path:
        logical_path = normalize_logical_path(display_path)
    else:
        logical_path = join_folder_filename(folder, upload_name)
    if not content_type:
        guessed_type, _ = mimetypes.guess_type(upload_name)
        content_type = guessed_type or "application/octet-stream"

    blob = _get_or_create_blob(session, tenant_id=tenant_id, data=data, filename=upload_name, content_type=content_type)

    asset = FileAsset(
        tenant_id=tenant_id,
        owner_type=kind,
        owner_user_id=owner_id if kind == "user" else None,
        owner_group_id=owner_id if kind == "group" else None,
        display_path=logical_path,
        filename=filename_from_path(logical_path),
        description=description,
        created_by_user_id=user_id,
        metadata_=metadata or {},
    )
    session.add(asset)
    session.flush()  # the version row below needs asset.id

    version = FileVersion(
        tenant_id=tenant_id,
        file_asset_id=asset.id,
        blob_id=blob.id,
        version_number=1,
        filename_at_upload=upload_name,
        display_path_at_upload=logical_path,
        content_type=content_type,
        size_bytes=blob.size_bytes,
        checksum_sha256=blob.checksum_sha256,
        created_by_user_id=user_id,
    )
    session.add(version)
    session.flush()  # asset.current_version_id below needs version.id

    asset.current_version_id = version.id
    session.add(asset)

    if campaign_id:
        share_file(session, tenant_id=tenant_id, asset=asset, target_type="campaign", target_id=campaign_id, permission="read", user_id=user_id)
    return UploadedStoredFile(asset=asset, version=version, blob=blob)
def get_asset_for_user(session: Session, *, tenant_id: str, user_id: str, asset_id: str, require_write: bool = False, is_admin: bool = False) -> FileAsset:
    """Load a live asset of the tenant and check the user may access it.

    Admins, the owning user and members of the owning group always pass.
    Anyone else needs an unrevoked share aimed at them, one of their groups,
    or the whole tenant; with ``require_write`` only ``write``/``manage``
    shares count.

    Raises:
        FileStorageError: if the asset is missing, deleted, in another
            tenant, or not accessible to the user.
    """
    asset = session.get(FileAsset, asset_id)
    if not asset or asset.tenant_id != tenant_id or asset.deleted_at is not None:
        raise FileStorageError("File not found")
    if is_admin:
        return asset

    group_ids = user_group_ids(session, tenant_id=tenant_id, user_id=user_id)
    if asset.owner_type == "user":
        is_owner = asset.owner_user_id == user_id
    elif asset.owner_type == "group":
        is_owner = asset.owner_group_id in group_ids
    else:
        is_owner = False
    if is_owner:
        return asset

    accepted_permissions = ["write", "manage"] if require_write else ["read", "write", "manage"]
    shared_with_user = (FileShare.target_type == "user") & (FileShare.target_id == user_id)
    shared_with_group = (FileShare.target_type == "group") & (FileShare.target_id.in_(group_ids))
    shared_with_tenant = (FileShare.target_type == "tenant") & (FileShare.target_id == tenant_id)
    grant = (
        session.query(FileShare)
        .filter(
            FileShare.tenant_id == tenant_id,
            FileShare.file_asset_id == asset.id,
            FileShare.revoked_at.is_(None),
            FileShare.permission.in_(accepted_permissions),
            or_(shared_with_user, shared_with_group, shared_with_tenant),
        )
        .first()
    )
    if not grant:
        raise FileStorageError("No access to this file")
    return asset
def list_assets_for_user(
    session: Session,
    *,
    tenant_id: str,
    user_id: str,
    owner_type: str | None = None,
    owner_id: str | None = None,
    campaign_id: str | None = None,
    path_prefix: str | None = None,
    include_deleted: bool = False,
    is_admin: bool = False,
) -> list[FileAsset]:
    """List assets of a tenant, ordered by display path then newest update.

    Optional filters narrow by owner, by campaign (assets with an unrevoked
    campaign share), and by folder prefix. For non-admins, when neither a
    campaign nor an owner type is given, results are limited to assets the
    user owns, assets owned by their groups, and assets shared with them,
    their groups, or the tenant.

    NOTE(review): when ``owner_type`` or ``campaign_id`` is given no
    per-user visibility filter is applied here; presumably callers check
    access first (e.g. via ``ensure_owner_access``) - confirm.
    """
    # Normalize like the other owner-scoped helpers so "User" / " group "
    # behave the same as "user" / "group".
    if owner_type:
        owner_type = owner_type.lower().strip()

    query = session.query(FileAsset).filter(FileAsset.tenant_id == tenant_id)
    if not include_deleted:
        query = query.filter(FileAsset.deleted_at.is_(None))
    if owner_type:
        query = query.filter(FileAsset.owner_type == owner_type)
    if owner_type == "user" and owner_id:
        query = query.filter(FileAsset.owner_user_id == owner_id)
    if owner_type == "group" and owner_id:
        query = query.filter(FileAsset.owner_group_id == owner_id)

    if campaign_id:
        query = query.join(FileShare, FileShare.file_asset_id == FileAsset.id).filter(
            FileShare.tenant_id == tenant_id,
            FileShare.target_type == "campaign",
            FileShare.target_id == campaign_id,
            FileShare.revoked_at.is_(None),
        )
    elif not is_admin and not owner_type:
        group_ids = user_group_ids(session, tenant_id=tenant_id, user_id=user_id)
        query = query.outerjoin(FileShare, FileShare.file_asset_id == FileAsset.id).filter(
            or_(
                (FileAsset.owner_type == "user") & (FileAsset.owner_user_id == user_id),
                (FileAsset.owner_type == "group") & (FileAsset.owner_group_id.in_(group_ids)),
                (FileShare.revoked_at.is_(None)) & (FileShare.target_type == "user") & (FileShare.target_id == user_id),
                (FileShare.revoked_at.is_(None)) & (FileShare.target_type == "group") & (FileShare.target_id.in_(group_ids)),
                (FileShare.revoked_at.is_(None)) & (FileShare.target_type == "tenant") & (FileShare.target_id == tenant_id),
            )
        )

    if path_prefix:
        prefix = normalize_folder(path_prefix)
        if prefix:
            # autoescape keeps "%" and "_" in the folder name literal instead
            # of letting LIKE treat them as wildcards.
            query = query.filter(FileAsset.display_path.startswith(f"{prefix}/", autoescape=True))
    return query.order_by(FileAsset.display_path.asc(), FileAsset.updated_at.desc()).all()
def current_version_and_blob(session: Session, asset: FileAsset) -> tuple[FileVersion, FileBlob]:
    """Load the asset's current version row and the blob it points to.

    Raises:
        FileStorageError: if the asset has no current version, or the
            version or blob row cannot be loaded.
    """
    version_id = asset.current_version_id
    if not version_id:
        raise FileStorageError("File has no current version")

    version = session.get(FileVersion, version_id)
    if not version:
        raise FileStorageError("File version not found")

    blob = session.get(FileBlob, version.blob_id)
    if not blob:
        raise FileStorageError("File blob not found")

    return version, blob
def read_asset_bytes(session: Session, asset: FileAsset) -> tuple[bytes, FileVersion, FileBlob]:
    """Fetch the bytes of the asset's current version from the storage backend.

    Returns:
        ``(data, version, blob)`` for the current version.
    """
    version, blob = current_version_and_blob(session, asset)
    payload = get_storage_backend().get_bytes(blob.storage_key)
    return payload, version, blob
def share_file(
    session: Session,
    *,
    tenant_id: str,
    asset: FileAsset,
    target_type: str,
    target_id: str,
    permission: str,
    user_id: str,
) -> FileShare:
    """Share an asset with a user, group, campaign or the tenant.

    An unrevoked share for the same target is updated to the new permission
    instead of adding a second row. Campaign targets must exist in the
    tenant.

    Raises:
        FileStorageError: on an unsupported target type or permission, or an
            unknown campaign.
    """
    target_kind = target_type.lower().strip()
    level = permission.lower().strip()
    if target_kind not in {"user", "group", "campaign", "tenant"}:
        raise FileStorageError("Unsupported share target")
    if level not in {"read", "write", "manage"}:
        raise FileStorageError("Unsupported file permission")

    if target_kind == "campaign":
        campaign = session.get(Campaign, target_id)
        if not campaign or campaign.tenant_id != tenant_id:
            raise FileStorageError("Campaign not found")

    active_share = (
        session.query(FileShare)
        .filter(
            FileShare.tenant_id == tenant_id,
            FileShare.file_asset_id == asset.id,
            FileShare.target_type == target_kind,
            FileShare.target_id == target_id,
            FileShare.revoked_at.is_(None),
        )
        .one_or_none()
    )
    if not active_share:
        new_share = FileShare(
            tenant_id=tenant_id,
            file_asset_id=asset.id,
            target_type=target_kind,
            target_id=target_id,
            permission=level,
            created_by_user_id=user_id,
        )
        session.add(new_share)
        return new_share

    active_share.permission = level
    session.add(active_share)
    return active_share
def soft_delete_assets(session: Session, assets: Iterable[FileAsset]) -> int:
    """Mark every not-yet-deleted asset as deleted and return how many changed.

    All assets touched in one call share the same deletion timestamp.
    """
    changed = 0
    stamp = utcnow()
    for asset in assets:
        if asset.deleted_at is not None:
            continue  # already deleted; keep its original timestamp
        asset.deleted_at = stamp
        session.add(asset)
        changed += 1
    return changed
def asset_is_audit_relevant(session: Session, asset: FileAsset) -> bool:
    """Return True when at least one ``sent`` attachment-use row references the asset."""
    sent_use = (
        session.query(CampaignAttachmentUse)
        .filter(CampaignAttachmentUse.file_asset_id == asset.id, CampaignAttachmentUse.use_stage == "sent")
        .first()
    )
    return sent_use is not None
def _normalize_pattern(pattern: str) -> str:
    """Normalize a glob pattern; blank or ``*`` (ignoring whitespace) becomes ``*``."""
    stripped = pattern.strip()
    if stripped == "" or stripped == "*":
        return "*"
    return normalize_logical_path(pattern, fallback_filename="*")
def _logical_glob_regex(pattern: str) -> re.Pattern[str]:
    """Compile a Multi Seal Mail logical glob into an anchored regex.

    ``*`` and ``?`` never cross a ``/``. ``**`` matches across folder
    boundaries, and ``**/`` may also match nothing, so ``**/*.pdf`` finds
    PDF files both directly in the current folder and in nested folders.
    Every other character matches itself literally.
    """
    normalized = _normalize_pattern(pattern)
    # Longest alternatives first so "**/" wins over "**", and "**" over "*".
    tokens = re.findall(r"\*\*/|\*\*|\*|\?|.", normalized, flags=re.DOTALL)
    wildcard_regex = {
        "**/": "(?:.*/)?",
        "**": ".*",
        "*": "[^/]*",
        "?": "[^/]",
    }
    body = "".join(wildcard_regex[token] if token in wildcard_regex else re.escape(token) for token in tokens)
    return re.compile(f"^{body}$")
def _relative_display_path(asset: FileAsset, base_path: str | None) -> str:
    """Return the asset's normalized display path relative to ``base_path``.

    The path is returned unchanged when no base folder is given or the
    asset does not live below it.
    """
    full_path = normalize_logical_path(asset.display_path)
    base = normalize_folder(base_path)
    if not base:
        return full_path
    return full_path.removeprefix(f"{base}/")
def match_assets(assets: Iterable[FileAsset], pattern: str, *, base_path: str | None = None) -> list[FileAsset]:
    """Return the assets matching a logical glob pattern, in input order.

    When a base path is given, or the normalized pattern contains ``/`` or
    ``**``, each asset is matched on its display path relative to
    ``base_path``. Otherwise an asset matches if either its display path or
    its bare filename matches.
    """
    regex = _logical_glob_regex(pattern)
    normalized_pattern = _normalize_pattern(pattern)
    path_aware = base_path is not None or "/" in normalized_pattern or "**" in normalized_pattern

    def candidate_names(asset: FileAsset) -> list[str]:
        if path_aware:
            return [_relative_display_path(asset, base_path)]
        return [asset.display_path, asset.filename]

    return [asset for asset in assets if any(regex.match(name) for name in candidate_names(asset))]
def resolve_patterns(assets: list[FileAsset], patterns: list[str], *, base_path: str | None = None) -> tuple[list[ResolvedPattern], list[FileAsset]]:
    """Match each pattern against the assets.

    Returns:
        ``(resolved, unmatched)``: one ``ResolvedPattern`` per input
        pattern, and the assets that no pattern matched.
    """
    resolved: list[ResolvedPattern] = []
    matched_ids = set()
    for pattern in patterns:
        hits = match_assets(assets, pattern, base_path=base_path)
        resolved.append(ResolvedPattern(pattern=pattern, matches=hits))
        matched_ids.update(asset.id for asset in hits)

    unmatched = [asset for asset in assets if asset.id not in matched_ids]
    return resolved, unmatched
def rename_asset(asset: FileAsset, *, new_path: str) -> None:
    """Move the asset to ``new_path``, updating both display path and filename in place."""
    target_path = normalize_logical_path(new_path)
    asset.display_path = target_path
    # The filename always mirrors the last segment of the display path.
    asset.filename = filename_from_path(target_path)
def build_rename_preview(asset: FileAsset, *, mode: str, find: str | None = None, replacement: str = "", prefix: str = "", suffix: str = "") -> str:
    """Compute the display path a bulk rename would give the asset.

    Only the filename changes; the folder part is kept. Nothing is written
    to the asset.

    Modes:
        ``prefix``: put ``prefix`` in front of the filename.
        ``suffix``: insert ``suffix`` between the base name and the full
            extension chain (``archive.tar.gz`` -> ``archive_v2.tar.gz``).
        ``replace``: replace every occurrence of ``find`` in the filename
            with ``replacement``; an empty ``find`` leaves the name as is.

    Raises:
        FileStorageError: for any other mode.
    """
    path = PurePosixPath(asset.display_path)
    parent = str(path.parent)
    folder = "" if parent == "." else parent
    name = path.name

    if mode == "prefix":
        next_name = prefix + name
    elif mode == "suffix":
        ext = "".join(path.suffixes)
        # Strip the whole extension chain from the name. PurePosixPath.stem
        # drops only the last suffix, which used to duplicate the inner
        # suffixes ("archive.tar" + suffix + ".tar.gz").
        stem = name[: len(name) - len(ext)] if ext else name
        next_name = f"{stem}{suffix}{ext}"
    elif mode == "replace":
        next_name = name.replace(find, replacement) if find else name
    else:
        raise FileStorageError("Unsupported rename mode")

    return f"{folder}/{next_name}" if folder else next_name
def create_zip_bytes(session: Session, assets: Iterable[FileAsset]) -> bytes:
    """Pack the current content of each asset into a deflated ZIP archive.

    Each asset is stored under its display path. The finished archive is
    returned as bytes.
    """
    sink = BytesIO()
    with zipfile.ZipFile(sink, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for asset in assets:
            payload = read_asset_bytes(session, asset)[0]
            archive.writestr(asset.display_path, payload)
    # getvalue() returns the whole buffer regardless of the stream position.
    return sink.getvalue()
def extract_zip_upload(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    zip_data: bytes,
    folder: str | None,
    campaign_id: str | None,
    is_admin: bool = False,
    max_files: int = 1000,
    max_total_bytes: int = 250 * 1024 * 1024,
) -> list[UploadedStoredFile]:
    """Unpack a ZIP upload into individual file assets.

    Directory entries are skipped. Each member is stored under ``folder``
    (when given) plus its normalized path inside the archive, with a
    content type guessed from that path.

    All limits are checked against the archive's declared sizes before any
    member is stored, so a rejected archive leaves nothing behind in the
    storage backend.

    Raises:
        FileStorageError: if the archive has more than ``max_files`` files,
            a member declares a negative size, the declared total exceeds
            ``max_total_bytes``, or storing a member fails.
    """
    base_folder = normalize_folder(folder)
    with zipfile.ZipFile(BytesIO(zip_data)) as archive:
        infos = [info for info in archive.infolist() if not info.is_dir()]
        if len(infos) > max_files:
            raise FileStorageError(f"ZIP contains too many files (limit {max_files})")

        # Validation pass: reject the whole archive before writing anything.
        total = 0
        for info in infos:
            if info.file_size < 0:
                raise FileStorageError("Invalid ZIP member")
            total += info.file_size
            if total > max_total_bytes:
                raise FileStorageError("ZIP is too large after extraction")

        uploaded: list[UploadedStoredFile] = []
        for info in infos:
            inner_path = normalize_logical_path(info.filename)
            target_path = f"{base_folder}/{inner_path}" if base_folder else inner_path
            data = archive.read(info)
            uploaded.append(
                create_file_asset(
                    session,
                    tenant_id=tenant_id,
                    owner_type=owner_type,
                    owner_id=owner_id,
                    user_id=user_id,
                    filename=filename_from_path(inner_path),
                    data=data,
                    display_path=target_path,
                    content_type=mimetypes.guess_type(inner_path)[0] or "application/octet-stream",
                    campaign_id=campaign_id,
                    is_admin=is_admin,
                )
            )
    return uploaded
def _candidate_match_keys(raw_match: str) -> set[str]:
    """Return the lookup keys for a resolved attachment path.

    Backslashes become ``/`` and surrounding whitespace and slashes are
    dropped. The result holds the cleaned path and its final segment, minus
    any empty strings.
    """
    cleaned = raw_match.replace("\\", "/").strip().strip("/")
    if not cleaned:
        return set()
    return {key for key in (cleaned, PurePosixPath(cleaned).name) if key}
def record_campaign_attachment_uses_for_job(session: Session, job: CampaignJob, *, stage: str = "built") -> None:
    """Create best-effort immutable file-use records for matched managed files.

    Existing attachment resolution is still filesystem/path based. This bridge
    records uses when a resolved attachment match can be tied to a managed file
    by logical path or filename among files shared with the campaign.

    A match is tied to an asset by its full cleaned path first and by its
    bare filename second, and an asset's exact display path always takes
    priority over another asset's filename, so the chosen asset does not
    depend on set iteration order. At most one record is added per job,
    file version, filename and stage.
    """
    attachments = job.resolved_attachments or []
    if not isinstance(attachments, list):
        return

    assets = list_assets_for_user(
        session,
        tenant_id=job.tenant_id,
        user_id="",
        campaign_id=job.campaign_id,
        is_admin=True,
    )

    # Register filenames first and display paths second, so an exact path key
    # is never shadowed by a different asset that merely shares the filename.
    by_key: dict[str, FileAsset] = {}
    for asset in assets:
        by_key[asset.filename] = asset
    for asset in assets:
        by_key[asset.display_path.strip("/")] = asset

    for attachment in attachments:
        if not isinstance(attachment, dict):
            continue
        matches = attachment.get("matches")
        if not isinstance(matches, list):
            continue
        for raw in matches:
            if not isinstance(raw, str):
                continue
            # Longest key first: the full path is tried before the filename.
            ordered_keys = sorted(_candidate_match_keys(raw), key=len, reverse=True)
            asset = next((by_key[key] for key in ordered_keys if key in by_key), None)
            if not asset:
                continue
            version, blob = current_version_and_blob(session, asset)
            exists = (
                session.query(CampaignAttachmentUse)
                .filter(
                    CampaignAttachmentUse.campaign_job_id == job.id,
                    CampaignAttachmentUse.file_version_id == version.id,
                    CampaignAttachmentUse.filename_used == asset.filename,
                    CampaignAttachmentUse.use_stage == stage,
                )
                .one_or_none()
            )
            if exists:
                continue
            session.add(
                CampaignAttachmentUse(
                    tenant_id=job.tenant_id,
                    campaign_id=job.campaign_id,
                    campaign_version_id=job.campaign_version_id,
                    campaign_job_id=job.id,
                    entry_index=job.entry_index,
                    entry_id=job.entry_id,
                    file_asset_id=asset.id,
                    file_version_id=version.id,
                    file_blob_id=blob.id,
                    filename_used=asset.filename,
                    checksum_sha256=blob.checksum_sha256,
                    size_bytes=blob.size_bytes,
                    content_type=blob.content_type,
                    use_stage=stage,
                )
            )
def mark_job_attachment_uses_sent(session: Session, job: CampaignJob) -> None:
    """Record that the job's managed attachments were sent.

    First makes sure the ``built`` use records exist for the job. Then each
    ``built`` record is copied into a ``sent`` record stamped with the
    current time, unless a ``sent`` record already exists for the same job
    and file version.
    """
    record_campaign_attachment_uses_for_job(session, job, stage="built")
    sent_at = utcnow()

    built_uses = (
        session.query(CampaignAttachmentUse)
        .filter(
            CampaignAttachmentUse.tenant_id == job.tenant_id,
            CampaignAttachmentUse.campaign_job_id == job.id,
            CampaignAttachmentUse.use_stage == "built",
        )
        .all()
    )

    # Columns carried over unchanged from the "built" row to its "sent" copy.
    copied_fields = (
        "tenant_id",
        "campaign_id",
        "campaign_version_id",
        "campaign_job_id",
        "entry_index",
        "entry_id",
        "file_asset_id",
        "file_version_id",
        "file_blob_id",
        "filename_used",
        "checksum_sha256",
        "size_bytes",
        "content_type",
    )
    for built in built_uses:
        already_sent = (
            session.query(CampaignAttachmentUse)
            .filter(
                CampaignAttachmentUse.campaign_job_id == job.id,
                CampaignAttachmentUse.file_version_id == built.file_version_id,
                CampaignAttachmentUse.use_stage == "sent",
            )
            .one_or_none()
        )
        if already_sent:
            continue
        carried = {field: getattr(built, field) for field in copied_fields}
        session.add(CampaignAttachmentUse(**carried, use_stage="sent", used_at=sent_at))