from __future__ import annotations

import hashlib
import mimetypes
import re
import zipfile
from dataclasses import dataclass
from datetime import datetime, timezone
from io import BytesIO
from pathlib import PurePosixPath
from typing import Any, Iterable
from uuid import uuid4

from sqlalchemy import or_
from sqlalchemy.orm import Session

from app.db.models import (
    Campaign,
    CampaignAttachmentUse,
    CampaignJob,
    FileAsset,
    FileBlob,
    FileFolder,
    FileShare,
    FileVersion,
    Group,
    UserGroupMembership,
)
from app.settings import settings
from app.storage.backends import get_storage_backend
from app.storage.paths import (
    filename_from_path,
    join_folder_filename,
    normalize_folder,
    normalize_logical_path,
    safe_storage_component,
)


class FileStorageError(RuntimeError):
    """Raised by this module for access, lookup and validation failures."""


@dataclass(slots=True)
class UploadedStoredFile:
    """Bundle of the three records produced when a file is stored."""

    # Logical file entry.
    asset: FileAsset
    # Version row created for the upload.
    version: FileVersion
    # Stored content record the version points at.
    blob: FileBlob


@dataclass(slots=True)
class ResolvedPattern:
    """A glob pattern together with the assets it matched."""

    pattern: str
    matches: list[FileAsset]


def utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)


def user_group_ids(
    session: Session,
    *,
    tenant_id: str,
    user_id: str,
    include_admin_groups: bool = False,
) -> list[str]:
    """Return group ids relevant to a user within a tenant.

    With ``include_admin_groups`` set, every group id of the tenant is
    returned (ordered by group name); otherwise only the ids of groups the
    user has a membership row for.
    """
    if not include_admin_groups:
        memberships = session.query(UserGroupMembership).filter(
            UserGroupMembership.tenant_id == tenant_id,
            UserGroupMembership.user_id == user_id,
        )
        return [membership.group_id for membership in memberships.all()]

    tenant_groups = (
        session.query(Group)
        .filter(Group.tenant_id == tenant_id)
        .order_by(Group.name.asc())
    )
    return [group.id for group in tenant_groups.all()]


def ensure_group_access(
    session: Session,
    *,
    tenant_id: str,
    group_id: str,
    user_id: str,
    is_admin: bool = False,
) -> None:
    """Verify that a user may use a group's file space.

    Raises:
        FileStorageError: if the group does not exist in the tenant, or the
            user is neither an admin nor a member of the group.
    """
    group = session.get(Group, group_id)
    if not (group and group.tenant_id == tenant_id):
        raise FileStorageError("Group not found")

    # Admins may use any group of the tenant without a membership row.
    if is_admin:
        return

    link = (
        session.query(UserGroupMembership)
        .filter(
            UserGroupMembership.tenant_id == tenant_id,
            UserGroupMembership.user_id == user_id,
            UserGroupMembership.group_id == group_id,
        )
        .one_or_none()
    )
    if link is None:
        raise FileStorageError("No access to this group file space")
def _owner_filter(query, owner_type: str, owner_id: str):
    """Restrict a ``FileFolder`` query to one owning user or group.

    Raises:
        FileStorageError: if ``owner_type`` is neither ``"user"`` nor ``"group"``.
    """
    if owner_type == "user":
        return query.filter(FileFolder.owner_user_id == owner_id)
    if owner_type == "group":
        return query.filter(FileFolder.owner_group_id == owner_id)
    raise FileStorageError("Unsupported owner type")


def _under_folder(column, folder_prefix: str):
    """Build a SQL predicate for values of *column* starting with *folder_prefix*.

    ``autoescape=True`` makes ``%`` and ``_`` inside the prefix match
    literally, so a folder called ``my_docs`` does not also select
    ``myXdocs``.
    """
    return column.startswith(folder_prefix, autoescape=True)


def ensure_owner_access(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    is_admin: bool = False,
) -> None:
    """Verify that ``user_id`` may act on the given user or group file space.

    A user space is accessible to its owner and to admins; group spaces are
    delegated to ``ensure_group_access``.

    Raises:
        FileStorageError: on missing access or an unsupported owner type.
    """
    owner_type = owner_type.lower().strip()
    if owner_type == "user":
        if owner_id != user_id and not is_admin:
            raise FileStorageError("No access to this user file space")
        return
    if owner_type == "group":
        ensure_group_access(session, tenant_id=tenant_id, group_id=owner_id, user_id=user_id, is_admin=is_admin)
        return
    raise FileStorageError("Files must be owned by a user or group")


def create_folder(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    path: str,
    is_admin: bool = False,
) -> FileFolder:
    """Create a folder, or return (and undelete) the existing one at that path.

    Raises:
        FileStorageError: if access is denied or the normalized path is empty.
    """
    owner_type = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id, user_id=user_id, is_admin=is_admin)
    normalized = normalize_folder(path)
    if not normalized:
        raise FileStorageError("Folder path is required")

    query = session.query(FileFolder).filter(
        FileFolder.tenant_id == tenant_id,
        FileFolder.owner_type == owner_type,
        FileFolder.path == normalized,
    )
    query = _owner_filter(query, owner_type, owner_id)
    existing = query.order_by(FileFolder.deleted_at.asc()).first()
    if existing:
        # Re-creating a soft-deleted folder revives the old row.
        if existing.deleted_at is not None:
            existing.deleted_at = None
            session.add(existing)
        return existing

    folder = FileFolder(
        tenant_id=tenant_id,
        owner_type=owner_type,
        owner_user_id=owner_id if owner_type == "user" else None,
        owner_group_id=owner_id if owner_type == "group" else None,
        path=normalized,
        created_by_user_id=user_id,
        metadata_={},
    )
    session.add(folder)
    session.flush()
    return folder


def list_folders_for_user(
    session: Session,
    *,
    tenant_id: str,
    user_id: str,
    owner_type: str,
    owner_id: str,
    include_deleted: bool = False,
    is_admin: bool = False,
) -> list[FileFolder]:
    """List the folders of one owner space, ordered by path.

    Soft-deleted folders are included only when ``include_deleted`` is set.

    Raises:
        FileStorageError: if the user may not access the owner space.
    """
    owner_type = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id, user_id=user_id, is_admin=is_admin)
    query = session.query(FileFolder).filter(FileFolder.tenant_id == tenant_id, FileFolder.owner_type == owner_type)
    query = _owner_filter(query, owner_type, owner_id)
    if not include_deleted:
        query = query.filter(FileFolder.deleted_at.is_(None))
    return query.order_by(FileFolder.path.asc()).all()


def soft_delete_folder(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    path: str,
    recursive: bool = True,
    is_admin: bool = False,
) -> tuple[int, int]:
    """Soft-delete a folder and, when ``recursive``, everything below it.

    In non-recursive mode the folder must be empty: no live child folders
    and no live files below it. Already soft-deleted files do not count.

    Returns:
        ``(folders_deleted, files_deleted)``.

    Raises:
        FileStorageError: on missing access, an empty path, or (non-recursive
            mode) a folder that still has live content.
    """
    owner_type = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id, user_id=user_id, is_admin=is_admin)
    normalized = normalize_folder(path)
    if not normalized:
        raise FileStorageError("Folder path is required")
    prefix = f"{normalized}/"
    now = utcnow()

    folder_query = session.query(FileFolder).filter(
        FileFolder.tenant_id == tenant_id,
        FileFolder.owner_type == owner_type,
        FileFolder.deleted_at.is_(None),
    )
    folder_query = _owner_filter(folder_query, owner_type, owner_id)

    # Live (not yet deleted) files located below the folder.
    live_files_below = _asset_query_for_owner(
        session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id
    ).filter(FileAsset.deleted_at.is_(None), _under_folder(FileAsset.display_path, prefix))

    if recursive:
        folder_query = folder_query.filter(
            or_(FileFolder.path == normalized, _under_folder(FileFolder.path, prefix))
        )
    else:
        child_exists = folder_query.filter(_under_folder(FileFolder.path, prefix)).first() is not None
        file_exists = live_files_below.first() is not None
        if child_exists or file_exists:
            raise FileStorageError("Folder is not empty")
        folder_query = folder_query.filter(FileFolder.path == normalized)

    folders = folder_query.all()
    for folder in folders:
        folder.deleted_at = now
        session.add(folder)

    assets = live_files_below.all() if recursive else []
    for asset in assets:
        asset.deleted_at = now
        session.add(asset)
    return len(folders), len(assets)


def _asset_query_for_owner(session: Session, *, tenant_id: str, owner_type: str, owner_id: str):
    """Return a ``FileAsset`` query limited to one tenant and one owner.

    Raises:
        FileStorageError: if ``owner_type`` is neither ``"user"`` nor ``"group"``.
    """
    query = session.query(FileAsset).filter(FileAsset.tenant_id == tenant_id, FileAsset.owner_type == owner_type)
    if owner_type == "user":
        return query.filter(FileAsset.owner_user_id == owner_id)
    if owner_type == "group":
        return query.filter(FileAsset.owner_group_id == owner_id)
    raise FileStorageError("Unsupported owner type")


def _storage_bucket_name() -> str:
    """Return the file-storage bucket, falling back to the general S3 bucket."""
    return settings.file_storage_s3_bucket or settings.s3_bucket


def _storage_backend_name() -> str:
    """Return the configured storage backend name, lower-cased and stripped."""
    return settings.file_storage_backend.lower().strip()


def _storage_key(*, tenant_id: str, checksum: str, filename: str) -> str:
    """Build a fresh storage key for a blob.

    The key embeds a random UUID, so two calls never return the same key.
    """
    return f"tenants/{tenant_id}/files/{checksum[:2]}/{uuid4().hex}-{safe_storage_component(filename)}"


def _get_or_create_blob(
    session: Session,
    *,
    tenant_id: str,
    data: bytes,
    filename: str,
    content_type: str | None,
) -> FileBlob:
    """Return a blob row for ``data``, reusing an identical one when present.

    Blobs are matched per tenant by SHA-256 checksum and size. A reused blob
    has its ``ref_count`` incremented; otherwise the bytes are written to the
    storage backend and a new row is flushed.
    """
    checksum = hashlib.sha256(data).hexdigest()
    size = len(data)
    blob = (
        session.query(FileBlob)
        .filter(FileBlob.tenant_id == tenant_id, FileBlob.checksum_sha256 == checksum, FileBlob.size_bytes == size)
        .one_or_none()
    )
    if blob:
        blob.ref_count += 1
        session.add(blob)
        return blob

    storage_key = _storage_key(tenant_id=tenant_id, checksum=checksum, filename=filename)
    backend = get_storage_backend()
    backend.put_bytes(storage_key, data, content_type=content_type)
    blob = FileBlob(
        tenant_id=tenant_id,
        storage_backend=_storage_backend_name(),
        storage_bucket=_storage_bucket_name(),
        storage_key=storage_key,
        checksum_sha256=checksum,
        size_bytes=size,
        content_type=content_type,
        ref_count=1,
    )
    session.add(blob)
    session.flush()
    return blob


def create_file_asset(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    filename: str,
    data: bytes,
    folder: str | None = None,
    display_path: str | None = None,
    content_type: str | None = None,
    description: str | None = None,
    metadata: dict[str, Any] | None = None,
    campaign_id: str | None = None,
    is_admin: bool = False,
) -> UploadedStoredFile:
    """Store ``data`` as a new file asset with an initial version (number 1).

    The logical path is ``display_path`` when given, otherwise ``folder``
    joined with the sanitized filename. A missing content type is guessed
    from the filename. When ``campaign_id`` is given the asset is also
    shared read-only with that campaign.

    Raises:
        FileStorageError: if the user may not write to the owner space, or
            the campaign share cannot be created.
    """
    owner_type = owner_type.lower().strip()
    ensure_owner_access(session, tenant_id=tenant_id, owner_type=owner_type, owner_id=owner_id, user_id=user_id, is_admin=is_admin)
    safe_filename = filename_from_path(normalize_logical_path(filename, fallback_filename="file"))
    logical_path = normalize_logical_path(display_path) if display_path else join_folder_filename(folder, safe_filename)
    if not content_type:
        content_type = mimetypes.guess_type(safe_filename)[0] or "application/octet-stream"
    blob = _get_or_create_blob(session, tenant_id=tenant_id, data=data, filename=safe_filename, content_type=content_type)

    asset = FileAsset(
        tenant_id=tenant_id,
        owner_type=owner_type,
        owner_user_id=owner_id if owner_type == "user" else None,
        owner_group_id=owner_id if owner_type == "group" else None,
        display_path=logical_path,
        filename=filename_from_path(logical_path),
        description=description,
        created_by_user_id=user_id,
        metadata_=metadata or {},
    )
    session.add(asset)
    # Flush so asset.id is available for the version row.
    session.flush()

    version = FileVersion(
        tenant_id=tenant_id,
        file_asset_id=asset.id,
        blob_id=blob.id,
        version_number=1,
        filename_at_upload=safe_filename,
        display_path_at_upload=logical_path,
        content_type=content_type,
        size_bytes=blob.size_bytes,
        checksum_sha256=blob.checksum_sha256,
        created_by_user_id=user_id,
    )
    session.add(version)
    session.flush()

    asset.current_version_id = version.id
    session.add(asset)
    if campaign_id:
        share_file(session, tenant_id=tenant_id, asset=asset, target_type="campaign", target_id=campaign_id, permission="read", user_id=user_id)
    return UploadedStoredFile(asset=asset, version=version, blob=blob)


def get_asset_for_user(
    session: Session,
    *,
    tenant_id: str,
    user_id: str,
    asset_id: str,
    require_write: bool = False,
    is_admin: bool = False,
) -> FileAsset:
    """Load a live asset and verify the user may read (or write) it.

    Access is granted to admins, to the owning user, to members of the
    owning group, or through an active share targeting the user, one of the
    user's groups, or the tenant. With ``require_write`` only ``write`` and
    ``manage`` shares qualify.

    Raises:
        FileStorageError: if the asset is missing, deleted, in another
            tenant, or not accessible to the user.
    """
    asset = session.get(FileAsset, asset_id)
    if not asset or asset.tenant_id != tenant_id or asset.deleted_at is not None:
        raise FileStorageError("File not found")
    if is_admin:
        return asset

    group_ids = user_group_ids(session, tenant_id=tenant_id, user_id=user_id)
    owns = (asset.owner_type == "user" and asset.owner_user_id == user_id) or (
        asset.owner_type == "group" and asset.owner_group_id in group_ids
    )
    if owns:
        return asset

    permission_values = ["read", "write", "manage"] if not require_write else ["write", "manage"]
    share = (
        session.query(FileShare)
        .filter(
            FileShare.tenant_id == tenant_id,
            FileShare.file_asset_id == asset.id,
            FileShare.revoked_at.is_(None),
            FileShare.permission.in_(permission_values),
            or_(
                (FileShare.target_type == "user") & (FileShare.target_id == user_id),
                (FileShare.target_type == "group") & (FileShare.target_id.in_(group_ids)),
                (FileShare.target_type == "tenant") & (FileShare.target_id == tenant_id),
            ),
        )
        .first()
    )
    if not share:
        raise FileStorageError("No access to this file")
    return asset


def list_assets_for_user(
    session: Session,
    *,
    tenant_id: str,
    user_id: str,
    owner_type: str | None = None,
    owner_id: str | None = None,
    campaign_id: str | None = None,
    path_prefix: str | None = None,
    include_deleted: bool = False,
    is_admin: bool = False,
) -> list[FileAsset]:
    """List assets of a tenant, optionally narrowed by owner, campaign or folder.

    Results are ordered by display path, then most recently updated first.
    ``path_prefix`` is matched literally (``%`` and ``_`` are not wildcards).
    """
    query = session.query(FileAsset).filter(FileAsset.tenant_id == tenant_id)
    if not include_deleted:
        query = query.filter(FileAsset.deleted_at.is_(None))
    if owner_type:
        query = query.filter(FileAsset.owner_type == owner_type)
    if owner_type == "user" and owner_id:
        query = query.filter(FileAsset.owner_user_id == owner_id)
    if owner_type == "group" and owner_id:
        query = query.filter(FileAsset.owner_group_id == owner_id)

    # NOTE(review): the per-user visibility filter below only runs when neither
    # campaign_id nor owner_type is given; callers passing either are presumably
    # expected to authorise the request themselves -- confirm.
    if campaign_id:
        query = query.join(FileShare, FileShare.file_asset_id == FileAsset.id).filter(
            FileShare.tenant_id == tenant_id,
            FileShare.target_type == "campaign",
            FileShare.target_id == campaign_id,
            FileShare.revoked_at.is_(None),
        )
    elif not is_admin and not owner_type:
        group_ids = user_group_ids(session, tenant_id=tenant_id, user_id=user_id)
        query = query.outerjoin(FileShare, FileShare.file_asset_id == FileAsset.id).filter(
            or_(
                (FileAsset.owner_type == "user") & (FileAsset.owner_user_id == user_id),
                (FileAsset.owner_type == "group") & (FileAsset.owner_group_id.in_(group_ids)),
                (FileShare.revoked_at.is_(None)) & (FileShare.target_type == "user") & (FileShare.target_id == user_id),
                (FileShare.revoked_at.is_(None)) & (FileShare.target_type == "group") & (FileShare.target_id.in_(group_ids)),
                (FileShare.revoked_at.is_(None)) & (FileShare.target_type == "tenant") & (FileShare.target_id == tenant_id),
            )
        )

    if path_prefix:
        prefix = normalize_folder(path_prefix)
        if prefix:
            query = query.filter(_under_folder(FileAsset.display_path, f"{prefix}/"))
    return query.order_by(FileAsset.display_path.asc(), FileAsset.updated_at.desc()).all()


def current_version_and_blob(session: Session, asset: FileAsset) -> tuple[FileVersion, FileBlob]:
    """Return the asset's current version row and the blob it points at.

    Raises:
        FileStorageError: if the asset has no current version, or the
            version or blob row cannot be loaded.
    """
    if not asset.current_version_id:
        raise FileStorageError("File has no current version")
    version = session.get(FileVersion, asset.current_version_id)
    if not version:
        raise FileStorageError("File version not found")
    blob = session.get(FileBlob, version.blob_id)
    if not blob:
        raise FileStorageError("File blob not found")
    return version, blob


def read_asset_bytes(session: Session, asset: FileAsset) -> tuple[bytes, FileVersion, FileBlob]:
    """Fetch the current content of an asset from the storage backend.

    Returns:
        ``(data, version, blob)`` for the asset's current version.
    """
    version, blob = current_version_and_blob(session, asset)
    backend = get_storage_backend()
    return backend.get_bytes(blob.storage_key), version, blob


def share_file(
    session: Session,
    *,
    tenant_id: str,
    asset: FileAsset,
    target_type: str,
    target_id: str,
    permission: str,
    user_id: str,
) -> FileShare:
    """Share an asset with a user, group, campaign or tenant.

    An active share for the same target is updated in place with the new
    permission instead of creating a duplicate.

    Raises:
        FileStorageError: on an unsupported target type or permission, or a
            campaign that does not exist in the tenant.
    """
    target_type = target_type.lower().strip()
    permission = permission.lower().strip()
    if target_type not in {"user", "group", "campaign", "tenant"}:
        raise FileStorageError("Unsupported share target")
    if permission not in {"read", "write", "manage"}:
        raise FileStorageError("Unsupported file permission")
    if target_type == "campaign":
        campaign = session.get(Campaign, target_id)
        if not campaign or campaign.tenant_id != tenant_id:
            raise FileStorageError("Campaign not found")

    existing = (
        session.query(FileShare)
        .filter(
            FileShare.tenant_id == tenant_id,
            FileShare.file_asset_id == asset.id,
            FileShare.target_type == target_type,
            FileShare.target_id == target_id,
            FileShare.revoked_at.is_(None),
        )
        .one_or_none()
    )
    if existing:
        existing.permission = permission
        session.add(existing)
        return existing

    share = FileShare(
        tenant_id=tenant_id,
        file_asset_id=asset.id,
        target_type=target_type,
        target_id=target_id,
        permission=permission,
        created_by_user_id=user_id,
    )
    session.add(share)
    return share


def soft_delete_assets(session: Session, assets: Iterable[FileAsset]) -> int:
    """Mark every not-yet-deleted asset as deleted and return how many changed."""
    count = 0
    now = utcnow()
    for asset in assets:
        if asset.deleted_at is None:
            asset.deleted_at = now
            session.add(asset)
            count += 1
    return count


def asset_is_audit_relevant(session: Session, asset: FileAsset) -> bool:
    """Return True if the asset has at least one attachment use in stage ``sent``."""
    return (
        session.query(CampaignAttachmentUse)
        .filter(CampaignAttachmentUse.file_asset_id == asset.id, CampaignAttachmentUse.use_stage == "sent")
        .first()
        is not None
    )


def _normalize_pattern(pattern: str) -> str:
    """Normalize a glob pattern; blank and ``*`` both mean "match everything"."""
    if pattern.strip() in {"", "*"}:
        return "*"
    return normalize_logical_path(pattern, fallback_filename="*")


def _logical_glob_regex(pattern: str) -> re.Pattern[str]:
    """Compile Multi Seal Mail logical globs.

    `*` and `?` stay within one folder segment. `**` crosses folder boundaries,
    and `**/` also matches the current folder so `**/*.pdf` returns direct and
    nested PDF files.
    """
    pattern = _normalize_pattern(pattern)
    pieces = ["^"]
    index = 0
    while index < len(pattern):
        char = pattern[index]
        if char == "*":
            if index + 1 < len(pattern) and pattern[index + 1] == "*":
                index += 2
                if index < len(pattern) and pattern[index] == "/":
                    # "**/" : zero or more whole folder segments.
                    pieces.append("(?:.*/)?")
                    index += 1
                else:
                    pieces.append(".*")
                continue
            pieces.append("[^/]*")
        elif char == "?":
            pieces.append("[^/]")
        else:
            pieces.append(re.escape(char))
        index += 1
    pieces.append("$")
    return re.compile("".join(pieces))


def _relative_display_path(asset: FileAsset, base_path: str | None) -> str:
    """Return the asset's display path relative to ``base_path`` when it lies below it."""
    path = normalize_logical_path(asset.display_path)
    base = normalize_folder(base_path)
    if not base:
        return path
    prefix = f"{base}/"
    if path.startswith(prefix):
        return path[len(prefix) :]
    return path


def match_assets(assets: Iterable[FileAsset], pattern: str, *, base_path: str | None = None) -> list[FileAsset]:
    """Return the assets whose path matches the logical glob ``pattern``.

    With path context (a ``base_path``, or a pattern containing ``/`` or
    ``**``) the path relative to ``base_path`` is matched. Otherwise the
    pattern may match either the full display path or the bare filename.
    """
    regex = _logical_glob_regex(pattern)
    normalized_pattern = _normalize_pattern(pattern)
    has_path_context = base_path is not None or "/" in normalized_pattern or "**" in normalized_pattern
    matches: list[FileAsset] = []
    for asset in assets:
        candidates = [_relative_display_path(asset, base_path)] if has_path_context else [asset.display_path, asset.filename]
        if any(regex.match(candidate) for candidate in candidates):
            matches.append(asset)
    return matches


def resolve_patterns(
    assets: list[FileAsset],
    patterns: list[str],
    *,
    base_path: str | None = None,
) -> tuple[list[ResolvedPattern], list[FileAsset]]:
    """Match every pattern against ``assets``.

    Returns:
        ``(resolved, unmatched)``: one ``ResolvedPattern`` per input pattern,
        and the assets no pattern matched.
    """
    resolved = [ResolvedPattern(pattern=pattern, matches=match_assets(assets, pattern, base_path=base_path)) for pattern in patterns]
    matched_ids = {asset.id for item in resolved for asset in item.matches}
    unmatched = [asset for asset in assets if asset.id not in matched_ids]
    return resolved, unmatched


def rename_asset(asset: FileAsset, *, new_path: str) -> None:
    """Set the asset's display path and derived filename from ``new_path``."""
    normalized = normalize_logical_path(new_path)
    asset.display_path = normalized
    asset.filename = filename_from_path(normalized)


def build_rename_preview(
    asset: FileAsset,
    *,
    mode: str,
    find: str | None = None,
    replacement: str = "",
    prefix: str = "",
    suffix: str = "",
) -> str:
    """Compute the display path an asset would get from a bulk rename.

    Modes:
        ``prefix``  - prepend ``prefix`` to the filename.
        ``suffix``  - insert ``suffix`` between the base name and the full
                      extension (all dotted suffixes, e.g. ``.tar.gz``).
        ``replace`` - replace every ``find`` occurrence in the filename; a
                      missing or empty ``find`` leaves the name unchanged.

    The folder part of the path is never modified.

    Raises:
        FileStorageError: for any other ``mode``.
    """
    path = PurePosixPath(asset.display_path)
    folder = "" if str(path.parent) == "." else str(path.parent)
    name = path.name
    ext = "".join(PurePosixPath(name).suffixes)
    # Strip the *whole* extension. PurePosixPath.stem only drops the last
    # suffix, which duplicated the inner ones ("a.tar" + suffix + ".tar.gz").
    stem = name[: len(name) - len(ext)]

    if mode == "prefix":
        next_name = prefix + name
    elif mode == "suffix":
        next_name = f"{stem}{suffix}{ext}"
    elif mode == "replace":
        next_name = name.replace(find, replacement) if find else name
    else:
        raise FileStorageError("Unsupported rename mode")
    return f"{folder}/{next_name}" if folder else next_name


def create_zip_bytes(session: Session, assets: Iterable[FileAsset]) -> bytes:
    """Pack the current content of ``assets`` into a deflate-compressed ZIP.

    Each asset is stored under its display path.
    """
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for asset in assets:
            data, _, _ = read_asset_bytes(session, asset)
            archive.writestr(asset.display_path, data)
    # getvalue() returns the whole buffer regardless of the stream position.
    return buffer.getvalue()


def extract_zip_upload(
    session: Session,
    *,
    tenant_id: str,
    owner_type: str,
    owner_id: str,
    user_id: str,
    zip_data: bytes,
    folder: str | None,
    campaign_id: str | None,
    is_admin: bool = False,
    max_files: int = 1000,
    max_total_bytes: int = 250 * 1024 * 1024,
) -> list[UploadedStoredFile]:
    """Extract a ZIP upload into individual file assets below ``folder``.

    All limits are validated against the archive's directory before any
    member is stored, so a rejected archive does not leave part of its
    content behind in the storage backend.

    Raises:
        FileStorageError: if the archive has more than ``max_files`` files,
            declares more than ``max_total_bytes`` of uncompressed data, or
            contains a member with a negative size.
        zipfile.BadZipFile: propagated unchanged for data that is not a
            valid ZIP archive.
    """
    uploaded: list[UploadedStoredFile] = []
    base_folder = normalize_folder(folder)
    with zipfile.ZipFile(BytesIO(zip_data)) as archive:
        infos = [info for info in archive.infolist() if not info.is_dir()]
        if len(infos) > max_files:
            raise FileStorageError(f"ZIP contains too many files (limit {max_files})")

        # Pass 1: validate declared sizes without touching storage.
        total = 0
        for info in infos:
            if info.file_size < 0:
                raise FileStorageError("Invalid ZIP member")
            total += info.file_size
            if total > max_total_bytes:
                raise FileStorageError("ZIP is too large after extraction")

        # Pass 2: store each member as its own asset.
        for info in infos:
            inner_path = normalize_logical_path(info.filename)
            target_path = f"{base_folder}/{inner_path}" if base_folder else inner_path
            data = archive.read(info)
            uploaded.append(
                create_file_asset(
                    session,
                    tenant_id=tenant_id,
                    owner_type=owner_type,
                    owner_id=owner_id,
                    user_id=user_id,
                    filename=filename_from_path(inner_path),
                    data=data,
                    display_path=target_path,
                    content_type=mimetypes.guess_type(inner_path)[0] or "application/octet-stream",
                    campaign_id=campaign_id,
                    is_admin=is_admin,
                )
            )
    return uploaded


def _candidate_match_keys(raw_match: str) -> set[str]:
    """Return the lookup keys for a resolved attachment path.

    The keys are the slash-normalized path (without leading/trailing
    slashes) and its final component; empty strings are dropped.
    """
    cleaned = raw_match.replace("\\", "/").strip().strip("/")
    result = {cleaned}
    if cleaned:
        result.add(PurePosixPath(cleaned).name)
    return {item for item in result if item}


def record_campaign_attachment_uses_for_job(session: Session, job: CampaignJob, *, stage: str = "built") -> None:
    """Create best-effort immutable file-use records for matched managed files.

    Existing attachment resolution is still filesystem/path based. This bridge
    records uses when a resolved attachment match can be tied to a managed file
    by logical path or filename among files shared with the campaign.

    An exact logical-path match is preferred over a bare filename match, and
    a use already recorded for the same job, version, filename and stage is
    not duplicated.
    """
    attachments = job.resolved_attachments or []
    if not isinstance(attachments, list):
        return
    assets = list_assets_for_user(
        session,
        tenant_id=job.tenant_id,
        user_id="",
        campaign_id=job.campaign_id,
        is_admin=True,
    )

    # Register filenames first and full paths second, so a key that is some
    # asset's exact logical path always resolves to that asset.
    by_key: dict[str, FileAsset] = {asset.filename: asset for asset in assets}
    by_key.update({asset.display_path.strip("/"): asset for asset in assets})

    for attachment in attachments:
        if not isinstance(attachment, dict):
            continue
        matches = attachment.get("matches") if isinstance(attachment.get("matches"), list) else []
        for raw in matches:
            if not isinstance(raw, str):
                continue
            # Longest key first: the full path is tried before its basename,
            # which keeps the choice independent of set iteration order.
            ordered_keys = sorted(_candidate_match_keys(raw), key=len, reverse=True)
            asset = next((by_key[key] for key in ordered_keys if key in by_key), None)
            if not asset:
                continue
            version, blob = current_version_and_blob(session, asset)
            exists = (
                session.query(CampaignAttachmentUse)
                .filter(
                    CampaignAttachmentUse.campaign_job_id == job.id,
                    CampaignAttachmentUse.file_version_id == version.id,
                    CampaignAttachmentUse.filename_used == asset.filename,
                    CampaignAttachmentUse.use_stage == stage,
                )
                .one_or_none()
            )
            if exists:
                continue
            session.add(
                CampaignAttachmentUse(
                    tenant_id=job.tenant_id,
                    campaign_id=job.campaign_id,
                    campaign_version_id=job.campaign_version_id,
                    campaign_job_id=job.id,
                    entry_index=job.entry_index,
                    entry_id=job.entry_id,
                    file_asset_id=asset.id,
                    file_version_id=version.id,
                    file_blob_id=blob.id,
                    filename_used=asset.filename,
                    checksum_sha256=blob.checksum_sha256,
                    size_bytes=blob.size_bytes,
                    content_type=blob.content_type,
                    use_stage=stage,
                )
            )


def mark_job_attachment_uses_sent(session: Session, job: CampaignJob) -> None:
    """Record a ``sent`` use for every ``built`` attachment use of the job.

    The ``built`` records are (re)created first, then each one is copied
    into a ``sent`` record unless one already exists for the same job and
    file version.
    """
    record_campaign_attachment_uses_for_job(session, job, stage="built")
    now = utcnow()
    uses = (
        session.query(CampaignAttachmentUse)
        .filter(
            CampaignAttachmentUse.tenant_id == job.tenant_id,
            CampaignAttachmentUse.campaign_job_id == job.id,
            CampaignAttachmentUse.use_stage == "built",
        )
        .all()
    )
    for use in uses:
        sent = (
            session.query(CampaignAttachmentUse)
            .filter(
                CampaignAttachmentUse.campaign_job_id == job.id,
                CampaignAttachmentUse.file_version_id == use.file_version_id,
                CampaignAttachmentUse.use_stage == "sent",
            )
            .one_or_none()
        )
        if sent:
            continue
        session.add(
            CampaignAttachmentUse(
                tenant_id=use.tenant_id,
                campaign_id=use.campaign_id,
                campaign_version_id=use.campaign_version_id,
                campaign_job_id=use.campaign_job_id,
                entry_index=use.entry_index,
                entry_id=use.entry_id,
                file_asset_id=use.file_asset_id,
                file_version_id=use.file_version_id,
                file_blob_id=use.file_blob_id,
                filename_used=use.filename_used,
                checksum_sha256=use.checksum_sha256,
                size_bytes=use.size_bytes,
                content_type=use.content_type,
                use_stage="sent",
                used_at=now,
            )
        )