Files
the-other-dude/backend/app/services/backup_service.py
Jason Staack 06a41ca9bf fix(lint): resolve all ruff lint errors
Add ruff config to exclude alembic E402, SQLAlchemy F821, and pre-existing
E501 line-length issues. Auto-fix 69 unused imports and 2 f-strings without
placeholders. Manually fix 8 unused variables. Apply ruff format to 127 files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-14 22:17:50 -05:00

379 lines
14 KiB
Python

"""SSH-based config capture service for RouterOS devices.
This service handles:
1. capture_export() — SSH to device, run /export compact, return stdout text
2. capture_binary_backup() — SSH to device, trigger /system backup save, SFTP-download result
3. run_backup() — Orchestrate a full backup: capture + git commit + DB record
All functions are async (asyncssh is asyncio-native).
Security policy:
known_hosts=None is intentional — RouterOS devices use self-signed SSH host keys
that change on reset or key regeneration. This mirrors InsecureSkipVerify=true
used in the poller's TLS connection. The threat model accepts device impersonation
risk in exchange for operational simplicity (no pre-enrollment of host keys needed).
See Pitfall 2 in 04-RESEARCH.md.
pygit2 calls are synchronous C bindings and MUST be wrapped in run_in_executor.
See Pitfall 3 in 04-RESEARCH.md.
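Phase 30: all backups (manual, scheduled, pre-restore) are encrypted via OpenBao
Transit (Tier 2) before git commit whenever Transit encryption succeeds. The
server retains decrypt capability for on-demand viewing. Raw files in git are
then ciphertext; the API decrypts on GET. If Transit encryption fails,
run_backup() logs a warning and commits the plaintext with encryption_tier=None
instead of failing the backup.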
"""
import asyncio
import base64
import io
import json
import logging
from datetime import datetime, timezone
import asyncssh
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import AdminAsyncSessionLocal, set_tenant_context
from app.models.config_backup import ConfigBackupRun
from app.models.device import Device
from app.services import git_store
from app.services.crypto import decrypt_credentials_hybrid
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Fixed backup file name on device flash — overwrites on each run so files
# don't accumulate. See Pitfall 4 in 04-RESEARCH.md.
# capture_binary_backup() passes this bare name to "/system backup save" and
# then downloads and removes the file as f"{_BACKUP_NAME}.backup".
_BACKUP_NAME = "portal-backup"
async def capture_export(
    ip: str,
    port: int = 22,
    username: str = "",
    password: str = "",
) -> str:
    """Run ``/export compact`` on a RouterOS device over SSH and return its output.

    Args:
        ip: Device IP address to connect to.
        port: SSH port (default 22; RouterOS default is 22).
        username: SSH login username.
        password: SSH login password.

    Returns:
        The stdout of ``/export compact`` — the raw RSC text (may include the
        RouterOS header line).

    Raises:
        asyncssh.Error: On SSH connection or command execution failure
            (the command is run with ``check=True``).
    """
    # Host-key checking is deliberately disabled (known_hosts=None): RouterOS
    # devices use self-signed host keys — see the module docstring.
    ssh_options = {
        "port": port,
        "username": username,
        "password": password,
        "known_hosts": None,
        "connect_timeout": 30,
    }
    async with asyncssh.connect(ip, **ssh_options) as ssh:
        completed = await ssh.run("/export compact", check=True)
    return completed.stdout
async def capture_binary_backup(
    ip: str,
    port: int = 22,
    username: str = "",
    password: str = "",
) -> bytes:
    """Create a binary backup on a RouterOS device, download it via SFTP, then clean up.

    The backup is saved under the fixed name held in ``_BACKUP_NAME`` and read
    back as ``<_BACKUP_NAME>.backup``, so each run overwrites the previous file
    instead of accumulating files on device flash.

    Removal of the file from the device happens in a ``finally`` clause. A
    failure during that removal is logged as a warning and swallowed, so it
    never masks an error raised by the download itself.
    See Pitfall 4 in 04-RESEARCH.md.

    Args:
        ip: Device IP address.
        port: SSH port (default 22).
        username: SSH login username.
        password: SSH login password.

    Returns:
        Raw bytes of the binary backup file.

    Raises:
        asyncssh.Error: On SSH connection, command, or SFTP failure.
    """
    async with asyncssh.connect(
        ip,
        port=port,
        username=username,
        password=password,
        known_hosts=None,  # self-signed RouterOS host keys — see module docstring
        connect_timeout=30,
    ) as ssh:
        # 1) Ask the device to write the backup file to its flash.
        save_command = f"/system backup save name={_BACKUP_NAME} dont-encrypt=yes"
        await ssh.run(save_command, check=True)

        payload = b""
        try:
            # 2) Pull the file contents over SFTP.
            async with (
                ssh.start_sftp_client() as sftp,
                sftp.open(f"{_BACKUP_NAME}.backup", "rb") as remote,
            ):
                payload = payload + await remote.read()
        finally:
            # 3) Best-effort removal of the file from device flash.
            try:
                await ssh.run(f"/file remove {_BACKUP_NAME}.backup", check=True)
            except Exception as cleanup_err:
                logger.warning(
                    "Failed to remove backup file from device %s: %s",
                    ip,
                    cleanup_err,
                )
    return payload
async def run_backup(
    device_id: str,
    tenant_id: str,
    trigger_type: str,
    db_session: AsyncSession | None = None,
) -> dict:
    """Orchestrate a full config backup for a device.

    Steps:
        1. Load the device from the DB (ip_address, encrypted credentials).
        2. Decrypt credentials with decrypt_credentials_hybrid()
           (Transit ciphertext preferred, legacy ciphertext as fallback).
        3. Capture /export compact and the binary backup concurrently via
           asyncio.gather().
        4. Compute the line delta vs the most recent export.rsc in git
           (for the first backup every line counts as added).
        5. Encrypt both artifacts via Transit. If that fails, a warning is
           logged and the plaintext is committed with encryption_tier=None.
        6. Commit both files to the tenant's git repo (run_in_executor, since
           pygit2 is synchronous).
        7. Insert a ConfigBackupRun record with commit SHA, trigger type,
           line deltas and encryption tier.

    Args:
        device_id: Device UUID as string.
        tenant_id: Tenant UUID as string.
        trigger_type: 'scheduled' | 'manual' | 'pre-restore'
        db_session: Optional AsyncSession with RLS context already set.
            If None, uses AdminAsyncSessionLocal (for scheduler context).

    Returns:
        Dict: {"commit_sha": str, "trigger_type": str,
               "lines_added": int|None, "lines_removed": int|None}

    Raises:
        ValueError: If device not found or missing credentials.
        asyncssh.Error: On SSH/SFTP failure.
    """
    # Inside a coroutine get_running_loop() is the preferred, unambiguous way
    # to obtain the current loop (get_event_loop() has legacy fallback behavior).
    loop = asyncio.get_running_loop()
    ts = datetime.now(timezone.utc).isoformat()

    # -----------------------------------------------------------------------
    # Step 1: Load device from DB
    # -----------------------------------------------------------------------
    # Without a caller-supplied session we are in scheduler context: open an
    # admin session (cross-tenant; RLS bypassed) and remember to close it.
    should_close = db_session is None
    session = AdminAsyncSessionLocal() if should_close else db_session

    try:
        from sqlalchemy import select

        query = select(Device).where(Device.id == device_id)  # type: ignore[arg-type]
        if should_close:
            # Admin session has no RLS context — filter by tenant explicitly.
            query = query.where(Device.tenant_id == tenant_id)  # type: ignore[arg-type]
        result = await session.execute(query)

        device = result.scalar_one_or_none()
        if device is None:
            raise ValueError(f"Device {device_id!r} not found for tenant {tenant_id!r}")
        if not device.encrypted_credentials_transit and not device.encrypted_credentials:
            raise ValueError(
                f"Device {device_id!r} has no stored credentials — cannot perform backup"
            )

        # -------------------------------------------------------------------
        # Step 2: Decrypt credentials (dual-read: Transit preferred, legacy fallback)
        # -------------------------------------------------------------------
        key = settings.get_encryption_key_bytes()
        creds_json = await decrypt_credentials_hybrid(
            device.encrypted_credentials_transit,
            device.encrypted_credentials,
            str(device.tenant_id),
            key,
        )
        creds = json.loads(creds_json)
        ssh_username = creds.get("username", "")
        ssh_password = creds.get("password", "")
        ip = device.ip_address
        hostname = device.hostname or ip

        # -------------------------------------------------------------------
        # Step 3: Capture export and binary backup concurrently
        # -------------------------------------------------------------------
        logger.info(
            "Starting %s backup for device %s (%s) tenant %s",
            trigger_type,
            hostname,
            ip,
            tenant_id,
        )
        # NOTE(review): no port is passed, so both captures use their default
        # SSH port (22) — confirm devices never use a custom SSH port.
        export_text, binary_backup = await asyncio.gather(
            capture_export(ip, username=ssh_username, password=ssh_password),
            capture_binary_backup(ip, username=ssh_username, password=ssh_password),
        )

        # -------------------------------------------------------------------
        # Step 4: Compute line delta vs prior version
        # -------------------------------------------------------------------
        lines_added: int | None = None
        lines_removed: int | None = None
        prior_commits = await loop.run_in_executor(
            None, git_store.list_device_commits, tenant_id, device_id
        )
        if prior_commits:
            try:
                # NOTE(review): Step 5 commits Transit ciphertext to git. If
                # git_store.read_file returns the stored bytes unmodified, this
                # delta compares ciphertext against plaintext — confirm.
                prior_export_bytes = await loop.run_in_executor(
                    None,
                    git_store.read_file,
                    tenant_id,
                    prior_commits[0]["sha"],
                    device_id,
                    "export.rsc",
                )
                prior_text = prior_export_bytes.decode("utf-8", errors="replace")
                lines_added, lines_removed = await loop.run_in_executor(
                    None, git_store.compute_line_delta, prior_text, export_text
                )
            except Exception as delta_err:
                # Non-fatal: keep lines_added/lines_removed as None.
                logger.warning(
                    "Failed to compute line delta for device %s: %s",
                    device_id,
                    delta_err,
                )
        else:
            # First backup: all lines are "added", none removed.
            lines_added = len(export_text.splitlines())
            lines_removed = 0

        # -------------------------------------------------------------------
        # Step 5: Encrypt ALL backups via Transit (Tier 2: OpenBao Transit)
        # -------------------------------------------------------------------
        encryption_tier: int | None = None
        git_export_content = export_text
        git_binary_content = binary_backup
        try:
            # Imported inside the try so an import failure also takes the
            # plaintext fallback path below.
            from app.services.crypto import encrypt_data_transit

            encrypted_export = await encrypt_data_transit(export_text, tenant_id)
            encrypted_binary = await encrypt_data_transit(
                base64.b64encode(binary_backup).decode(), tenant_id
            )
            # Only swap in ciphertext once BOTH encryptions succeeded, so the
            # commit is never half-encrypted. Transit ciphertext is text.
            git_export_content = encrypted_export
            git_binary_content = encrypted_binary.encode("utf-8")
            encryption_tier = 2
            logger.info(
                "Tier 2 Transit encryption applied for %s backup of device %s",
                trigger_type,
                device_id,
            )
        except Exception as enc_err:
            # Transit unavailable — fall back to plaintext (non-fatal);
            # encryption_tier stays None to record that.
            logger.warning(
                "Transit encryption failed for %s backup of device %s, storing plaintext: %s",
                trigger_type,
                device_id,
                enc_err,
            )

        # -------------------------------------------------------------------
        # Step 6: Commit to git (run_in_executor — pygit2 is sync C bindings)
        # -------------------------------------------------------------------
        commit_message = f"{trigger_type}: {hostname} ({ip}) at {ts}"
        commit_sha = await loop.run_in_executor(
            None,
            git_store.commit_backup,
            tenant_id,
            device_id,
            git_export_content,
            git_binary_content,
            commit_message,
        )
        logger.info(
            "Committed backup for device %s to git SHA %s (tier=%s)",
            device_id,
            commit_sha[:8],
            encryption_tier,
        )

        # -------------------------------------------------------------------
        # Step 7: Insert ConfigBackupRun record
        # -------------------------------------------------------------------
        backup_run = ConfigBackupRun(
            device_id=device.id,
            tenant_id=device.tenant_id,
            commit_sha=commit_sha,
            trigger_type=trigger_type,
            lines_added=lines_added,
            lines_removed=lines_removed,
            encryption_tier=encryption_tier,
        )
        if not should_close:
            # RLS-scoped session from API context — record directly; the
            # caller owns the transaction, so only flush here.
            session.add(backup_run)
            await session.flush()
        else:
            # Scheduler context — use a fresh admin session and set the tenant
            # context before the insert so the RLS policy is satisfied.
            async with AdminAsyncSessionLocal() as admin_session:
                await set_tenant_context(admin_session, str(device.tenant_id))
                admin_session.add(backup_run)
                await admin_session.commit()

        return {
            "commit_sha": commit_sha,
            "trigger_type": trigger_type,
            "lines_added": lines_added,
            "lines_removed": lines_removed,
        }
    finally:
        # Only close the session this function opened itself.
        if should_close:
            await session.close()