diff --git a/backend/app/config.py b/backend/app/config.py
index b3dfb49..c346d4a 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -125,6 +125,9 @@ class Settings(BaseSettings):
     PASSWORD_RESET_TOKEN_EXPIRE_MINUTES: int = 30
     APP_BASE_URL: str = "http://localhost:3000"
 
+    # Retention cleanup — delete config snapshots older than N days
+    CONFIG_RETENTION_DAYS: int = 90
+
     # App settings
     APP_NAME: str = "TOD - The Other Dude"
     APP_VERSION: str = "0.1.0"
diff --git a/backend/app/services/retention_service.py b/backend/app/services/retention_service.py
new file mode 100644
index 0000000..68e7649
--- /dev/null
+++ b/backend/app/services/retention_service.py
@@ -0,0 +1,117 @@
+"""Retention cleanup service — deletes config snapshots older than CONFIG_RETENTION_DAYS.
+
+Runs as an APScheduler IntervalTrigger job (every 24h). CASCADE FK constraints
+on router_config_diffs and router_config_changes handle associated data automatically.
+"""
+
+import logging
+from typing import Optional
+
+from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from apscheduler.triggers.interval import IntervalTrigger
+from prometheus_client import Counter, Histogram
+from sqlalchemy import text
+
+from app.config import settings
+from app.database import AdminAsyncSessionLocal
+
+logger = logging.getLogger(__name__)
+
+# Module-level handle so start/stop operate on the same scheduler instance.
+_scheduler: Optional[AsyncIOScheduler] = None
+
+# Prometheus metrics
+config_snapshots_cleaned_total = Counter(
+    "config_snapshots_cleaned_total",
+    "Cumulative count of expired config snapshots deleted by retention cleanup",
+)
+config_retention_cleanup_duration_seconds = Histogram(
+    "config_retention_cleanup_duration_seconds",
+    "Duration of retention cleanup execution",
+)
+
+
+async def cleanup_expired_snapshots() -> int:
+    """Delete config snapshots older than CONFIG_RETENTION_DAYS.
+
+    CASCADE FK constraints on router_config_diffs and router_config_changes
+    automatically remove associated rows.
+
+    A non-positive CONFIG_RETENTION_DAYS is treated as "retention disabled":
+    the cutoff would be NOW() or later, which would match every snapshot, so
+    the run is skipped instead of wiping the table.
+
+    Returns the number of deleted snapshots (0 when the run is skipped).
+    """
+    days = settings.CONFIG_RETENTION_DAYS
+    if days <= 0:
+        logger.warning(
+            "retention cleanup skipped: CONFIG_RETENTION_DAYS must be positive",
+            extra={"retention_days": days},
+        )
+        return 0
+
+    with config_retention_cleanup_duration_seconds.time():
+        async with AdminAsyncSessionLocal() as session:
+            result = await session.execute(
+                text(
+                    "DELETE FROM router_config_snapshots "
+                    "WHERE collected_at < NOW() - make_interval(days => :days)"
+                ),
+                {"days": days},
+            )
+            await session.commit()
+            # DBAPI drivers report rowcount as -1 when the count is unknown;
+            # clamp it because Counter.inc() rejects negative amounts.
+            deleted = max(result.rowcount, 0)
+
+    config_snapshots_cleaned_total.inc(deleted)
+    logger.info(
+        "retention cleanup complete",
+        extra={"deleted_snapshots": deleted, "retention_days": days},
+    )
+    return deleted
+
+
+async def start_retention_scheduler() -> None:
+    """Start APScheduler with a 24-hour interval job for retention cleanup.
+
+    Calling this again while a scheduler is already registered is a no-op, so a
+    repeated startup hook cannot leak a second scheduler running the same job.
+    """
+    global _scheduler
+    if _scheduler is not None:
+        logger.warning("retention scheduler already started; ignoring start request")
+        return
+
+    scheduler = AsyncIOScheduler(timezone="UTC")
+    scheduler.add_job(
+        cleanup_expired_snapshots,
+        # NOTE(review): with no start_date the first run is one interval after
+        # startup — confirm that deployments stay up long enough to reach it.
+        trigger=IntervalTrigger(hours=24, jitter=3600),
+        id="retention_cleanup",
+        name="Config snapshot retention cleanup",
+        max_instances=1,
+        replace_existing=True,
+    )
+    scheduler.start()
+    # Publish the handle only after a successful start so a failed start
+    # can be retried.
+    _scheduler = scheduler
+    logger.info(
+        "retention scheduler started (every 24h, retention_days=%d)",
+        settings.CONFIG_RETENTION_DAYS,
+    )
+
+
+async def stop_retention_scheduler() -> None:
+    """Shut down the retention scheduler without waiting for a running job.
+
+    Safe to call when the scheduler was never started.
+    """
+    global _scheduler
+    if _scheduler is not None:
+        _scheduler.shutdown(wait=False)
+        _scheduler = None
+        logger.info("retention scheduler stopped")