feat(09-01): implement retention cleanup service with configurable retention period

- Add CONFIG_RETENTION_DAYS setting (default 90) to config.py
- Create retention_service.py with cleanup_expired_snapshots (parameterized SQL via make_interval)
- APScheduler IntervalTrigger runs cleanup every 24h with 1h jitter
- Prometheus counter and histogram for observability
- CASCADE FKs handle diff/change deletion automatically
- All 4 unit tests pass

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jason Staack
2026-03-12 23:33:27 -05:00
parent 00bdde9975
commit a9f7a45a9b
2 changed files with 88 additions and 0 deletions

View File

@@ -125,6 +125,9 @@ class Settings(BaseSettings):
PASSWORD_RESET_TOKEN_EXPIRE_MINUTES: int = 30
APP_BASE_URL: str = "http://localhost:3000"
# Retention cleanup — delete config snapshots older than N days
CONFIG_RETENTION_DAYS: int = 90
# App settings
APP_NAME: str = "TOD - The Other Dude"
APP_VERSION: str = "0.1.0"

View File

@@ -0,0 +1,85 @@
"""Retention cleanup service — deletes config snapshots older than CONFIG_RETENTION_DAYS.
Runs as an APScheduler IntervalTrigger job (every 24h, with up to 1h of jitter). CASCADE FK
constraints on router_config_diffs and router_config_changes delete the associated rows automatically.
"""
import logging
from typing import Optional
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger
from prometheus_client import Counter, Histogram
from sqlalchemy import text
from app.config import settings
from app.database import AdminAsyncSessionLocal
# Module-level logger and the single scheduler instance owned by this module
# (stays None until start_retention_scheduler() assigns it).
logger = logging.getLogger(__name__)
_scheduler: Optional[AsyncIOScheduler] = None

# Prometheus metrics
# Counter: incremented by the number of snapshots each cleanup run deletes.
config_snapshots_cleaned_total = Counter(
    name="config_snapshots_cleaned_total",
    documentation="Cumulative count of expired config snapshots deleted by retention cleanup",
)
# Histogram: times the database portion of each cleanup run.
config_retention_cleanup_duration_seconds = Histogram(
    name="config_retention_cleanup_duration_seconds",
    documentation="Duration of retention cleanup execution",
)
async def cleanup_expired_snapshots() -> int:
    """Delete config snapshots older than ``CONFIG_RETENTION_DAYS``.

    Issues one parameterized ``DELETE`` against ``router_config_snapshots``
    and commits it. Rows in router_config_diffs and router_config_changes are
    expected to be removed by CASCADE FK constraints (the schema is not
    visible from this module).

    The database work is timed with
    ``config_retention_cleanup_duration_seconds`` and the number of deleted
    rows is added to ``config_snapshots_cleaned_total``.

    Returns:
        The number of deleted snapshots. ``0`` when the run is skipped
        because the configured retention is negative, or when the driver
        does not report a row count.
    """
    days = settings.CONFIG_RETENTION_DAYS

    if days < 0:
        # A negative interval pushes the cutoff into the future, so the
        # DELETE would wipe every snapshot. Treat it as a misconfiguration
        # and leave the data alone.
        logger.warning(
            "retention cleanup skipped: CONFIG_RETENTION_DAYS must not be negative",
            extra={"retention_days": days},
        )
        return 0

    with config_retention_cleanup_duration_seconds.time():
        async with AdminAsyncSessionLocal() as session:
            result = await session.execute(
                text(
                    "DELETE FROM router_config_snapshots "
                    "WHERE collected_at < NOW() - make_interval(days => :days)"
                ),
                {"days": days},
            )
            await session.commit()

    # DBAPI drivers may report -1 (or nothing) when the affected-row count is
    # unknown; Counter.inc() raises ValueError for negative amounts, which
    # would fail the job after the delete has already been committed.
    deleted = max(result.rowcount or 0, 0)
    config_snapshots_cleaned_total.inc(deleted)
    logger.info(
        "retention cleanup complete",
        extra={"deleted_snapshots": deleted, "retention_days": days},
    )
    return deleted
async def start_retention_scheduler() -> None:
    """Create and start the scheduler that runs retention cleanup.

    Builds a UTC ``AsyncIOScheduler``, stores it in the module-level
    ``_scheduler``, registers ``cleanup_expired_snapshots`` on a 24-hour
    interval with up to 3600 seconds of jitter, and starts the scheduler.
    """
    global _scheduler

    _scheduler = AsyncIOScheduler(timezone="UTC")

    # One run per day; the jitter spreads the actual start time by up to 1h.
    daily_trigger = IntervalTrigger(hours=24, jitter=3600)
    job_options = {
        "trigger": daily_trigger,
        "id": "retention_cleanup",
        "name": "Config snapshot retention cleanup",
        # Never let two cleanup runs overlap.
        "max_instances": 1,
        "replace_existing": True,
    }
    _scheduler.add_job(cleanup_expired_snapshots, **job_options)
    _scheduler.start()

    retention_days = settings.CONFIG_RETENTION_DAYS
    logger.info(
        "retention scheduler started (every 24h, retention_days=%d)",
        retention_days,
    )
async def stop_retention_scheduler() -> None:
    """Shut down the retention scheduler if one is active.

    Does nothing when no scheduler is stored in ``_scheduler``. Otherwise it
    calls ``shutdown(wait=False)`` on it, clears the module-level reference,
    and logs that the scheduler stopped.
    """
    global _scheduler

    if not _scheduler:
        return

    active = _scheduler
    active.shutdown(wait=False)
    _scheduler = None
    logger.info("retention scheduler stopped")