Add ruff config to exclude alembic E402, SQLAlchemy F821, and pre-existing E501 line-length issues. Auto-fix 69 unused imports and 2 f-strings without placeholders. Manually fix 8 unused variables. Apply ruff format to 127 files. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
623 lines
23 KiB
Python
623 lines
23 KiB
Python
"""Two-phase config push with panic-revert safety for RouterOS devices.
|
|
|
|
This module implements the critical safety mechanism for config restoration:
|
|
|
|
Phase 1 — Push:
|
|
1. Pre-backup (mandatory) — snapshot current config before any changes
|
|
2. Install panic-revert RouterOS scheduler — auto-reverts if device becomes
|
|
unreachable (the scheduler fires after 90s and loads the pre-push backup)
|
|
3. Push the target config via SSH /import
|
|
|
|
Phase 2 — Verification (60s settle window):
|
|
4. Wait 60s for config to settle (scheduled processes restart, etc.)
|
|
5. Reachability check via asyncssh
|
|
6a. Reachable — remove panic-revert scheduler; mark operation committed
|
|
6b. Unreachable — RouterOS is auto-reverting; mark operation reverted
|
|
|
|
Pitfall 6 handling:
|
|
If the API pod restarts during the 60s window, the config_push_operations
|
|
row with status='pending_verification' serves as the recovery signal.
|
|
On startup, recover_stale_push_operations() resolves any stale rows.
|
|
|
|
Security policy:
|
|
known_hosts=None — RouterOS self-signed host keys; mirrors InsecureSkipVerify
|
|
used in the poller's TLS connection. See Pitfall 2 in 04-RESEARCH.md.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timedelta, timezone
|
|
|
|
import asyncssh
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.config import settings
|
|
from app.models.config_backup import ConfigPushOperation
|
|
from app.models.device import Device
|
|
from app.services import backup_service, git_store
|
|
from app.services.event_publisher import publish_event
|
|
from app.services.push_tracker import record_push, clear_push
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Name of the panic-revert scheduler installed on the RouterOS device.
# The same name is stored on each ConfigPushOperation row (scheduler_name)
# so that startup recovery knows which scheduler to look for and remove.
_PANIC_REVERT_SCHEDULER = "the-other-dude-panic-revert"
# Base name of the pre-push binary backup saved on device flash.
# The code below refers to the resulting file as "<name>.backup".
_PRE_PUSH_BACKUP = "portal-pre-push"
# Name of the RSC file uploaded to the device and applied with /import.
_RESTORE_RSC = "portal-restore.rsc"
|
|
|
|
|
|
async def _publish_push_progress(
    tenant_id: str,
    device_id: str,
    stage: str,
    message: str,
    push_op_id: str | None = None,
    error: str | None = None,
) -> None:
    """Publish a ``config_push`` progress event for one device.

    The event is sent through ``publish_event`` on the subject
    ``config.push.<tenant_id>.<device_id>``.

    Args:
        tenant_id: Tenant identifier, used in the subject and the payload.
        device_id: Device identifier, used in the subject and the payload.
        stage: Progress stage label (e.g. "started", "pushing", "committed").
        message: Human-readable progress message.
        push_op_id: Optional push operation id; included as
            ``push_operation_id`` (``None`` when not yet known).
        error: Optional error text; the ``error`` key is only added to the
            payload when this is truthy.
    """
    subject = f"config.push.{tenant_id}.{device_id}"
    event = dict(
        event_type="config_push",
        tenant_id=tenant_id,
        device_id=device_id,
        stage=stage,
        message=message,
        timestamp=datetime.now(timezone.utc).isoformat(),
        push_operation_id=push_op_id,
    )
    if error:
        event["error"] = error
    await publish_event(subject, event)
|
|
|
|
|
|
async def restore_config(
    device_id: str,
    tenant_id: str,
    commit_sha: str,
    db_session: AsyncSession,
) -> dict:
    """Restore a device config to a specific backup version via two-phase push.

    Flow:
        1. Load the device and decrypt its SSH credentials.
        2. Read ``export.rsc`` for ``commit_sha`` from the git store.
        3. Take a mandatory pre-restore backup via ``backup_service.run_backup``.
        4. Record a ``ConfigPushOperation`` row in ``pending_verification``.
        5. Over SSH: save a binary backup on the device, install the
           panic-revert scheduler, upload the RSC file via SFTP and ``/import`` it.
        6. Wait 60s for the config to settle.
        7. Check SSH reachability.
        8. Reachable: remove the scheduler and on-device backup, mark
           ``committed``. Unreachable: mark ``reverted`` and leave the
           scheduler in place on the device.

    Args:
        device_id: Device UUID as string.
        tenant_id: Tenant UUID as string.
        commit_sha: Git commit SHA of the backup version to restore.
        db_session: AsyncSession with RLS context already set (from API endpoint).
            This function flushes but never commits; the caller owns the
            transaction.

    Returns:
        {
            "status": "committed" | "reverted" | "failed",
            "message": str,
            "pre_backup_sha": str,
        }

        An SSH failure during the push phase is *not* raised: it is logged,
        the operation is marked ``failed`` and a ``"failed"`` result is returned.

    Raises:
        ValueError: If the device is not found, has no stored credentials,
            or the requested backup version cannot be read from the git store.
        Exception: Errors from credential decryption, the pre-restore backup
            or the database are not caught here and propagate to the caller.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # recommended accessor and avoids get_event_loop()'s policy-dependent behavior.
    loop = asyncio.get_running_loop()

    # ------------------------------------------------------------------
    # Step 1: Load device from DB and decrypt credentials
    # ------------------------------------------------------------------
    from sqlalchemy import select

    result = await db_session.execute(
        select(Device).where(Device.id == device_id)  # type: ignore[arg-type]
    )
    device = result.scalar_one_or_none()
    if device is None:
        raise ValueError(f"Device {device_id!r} not found")

    if not device.encrypted_credentials_transit and not device.encrypted_credentials:
        raise ValueError(f"Device {device_id!r} has no stored credentials — cannot perform restore")

    key = settings.get_encryption_key_bytes()
    from app.services.crypto import decrypt_credentials_hybrid

    creds_json = await decrypt_credentials_hybrid(
        device.encrypted_credentials_transit,
        device.encrypted_credentials,
        str(device.tenant_id),
        key,
    )
    creds = json.loads(creds_json)
    ssh_username = creds.get("username", "")
    ssh_password = creds.get("password", "")
    ip = device.ip_address

    # Display name for logs and progress events; falls back to the IP.
    hostname = device.hostname or ip

    # Publish "started" progress event
    await _publish_push_progress(
        tenant_id, device_id, "started", f"Config restore started for {hostname}"
    )

    # ------------------------------------------------------------------
    # Step 2: Read the target export.rsc from the backup commit
    # ------------------------------------------------------------------
    try:
        # git_store.read_file is a blocking call, so run it in the default executor.
        export_bytes = await loop.run_in_executor(
            None,
            git_store.read_file,
            tenant_id,
            commit_sha,
            device_id,
            "export.rsc",
        )
    except Exception as exc:
        # Any read failure is surfaced to the caller as "version not found".
        raise ValueError(
            f"Backup version {commit_sha!r} not found for device {device_id!r}: {exc}"
        ) from exc

    export_text = export_bytes.decode("utf-8", errors="replace")

    # ------------------------------------------------------------------
    # Step 3: Mandatory pre-backup before push
    # ------------------------------------------------------------------
    await _publish_push_progress(
        tenant_id, device_id, "backing_up", f"Creating pre-restore backup for {hostname}"
    )

    logger.info(
        "Starting pre-restore backup for device %s (%s) before pushing commit %s",
        hostname,
        ip,
        commit_sha[:8],
    )
    pre_backup_result = await backup_service.run_backup(
        device_id=device_id,
        tenant_id=tenant_id,
        trigger_type="pre-restore",
        db_session=db_session,
    )
    pre_backup_sha = pre_backup_result["commit_sha"]
    logger.info("Pre-restore backup complete: %s", pre_backup_sha[:8])

    # ------------------------------------------------------------------
    # Step 4: Record push operation (pending_verification for recovery)
    # ------------------------------------------------------------------
    push_op = ConfigPushOperation(
        device_id=device.id,
        tenant_id=device.tenant_id,
        pre_push_commit_sha=pre_backup_sha,
        scheduler_name=_PANIC_REVERT_SCHEDULER,
        status="pending_verification",
    )
    db_session.add(push_op)
    # Flush (not commit) so the row gets its id while the caller keeps
    # ownership of the transaction.
    await db_session.flush()
    push_op_id = push_op.id

    logger.info(
        "Push op %s in pending_verification — if API restarts, "
        "recover_stale_push_operations() will resolve on next startup",
        push_op.id,
    )

    # ------------------------------------------------------------------
    # Step 5: SSH to device — install panic-revert, push config
    # ------------------------------------------------------------------
    push_op_id_str = str(push_op_id)
    await _publish_push_progress(
        tenant_id, device_id, "pushing", f"Pushing config to {hostname}", push_op_id=push_op_id_str
    )

    logger.info(
        "Pushing config to device %s (%s): installing panic-revert scheduler and uploading config",
        hostname,
        ip,
    )

    try:
        async with asyncssh.connect(
            ip,
            port=22,
            username=ssh_username,
            password=ssh_password,
            known_hosts=None,  # RouterOS self-signed host keys — see module docstring
            connect_timeout=30,
        ) as conn:
            # 5a: Create binary backup on device as revert point
            await conn.run(
                f"/system backup save name={_PRE_PUSH_BACKUP} dont-encrypt=yes",
                check=True,
            )
            logger.debug("Pre-push binary backup saved on device as %s.backup", _PRE_PUSH_BACKUP)

            # 5b: Install panic-revert RouterOS scheduler.
            # Its on-event loads the pre-push backup saved in 5a; it is
            # configured with interval=90s and start-time=startup. This is
            # the safety net: if the device becomes unreachable after the
            # push, nobody can remove the scheduler and the device loads the
            # known-good config on its own.
            await conn.run(
                f"/system scheduler add "
                f'name="{_PANIC_REVERT_SCHEDULER}" '
                f"interval=90s "
                f'on-event=":delay 0; /system backup load name={_PRE_PUSH_BACKUP}" '
                f"start-time=startup",
                check=True,
            )
            logger.debug("Panic-revert scheduler installed on device")

            # 5c: Upload export.rsc over SFTP, then /import it.
            # The uploaded file is removed again afterwards (best-effort).
            async with conn.start_sftp_client() as sftp:
                async with sftp.open(_RESTORE_RSC, "wb") as f:
                    await f.write(export_text.encode("utf-8"))
            logger.debug("Uploaded %s to device flash", _RESTORE_RSC)

            # /import the config file
            import_result = await conn.run(
                f"/import file={_RESTORE_RSC}",
                check=False,  # Don't raise on non-zero exit — import may succeed with warnings
            )
            logger.info(
                "Config import result for device %s: exit_status=%s stdout=%r",
                hostname,
                import_result.exit_status,
                (import_result.stdout or "")[:200],
            )

            # Clean up the uploaded RSC file (best-effort)
            try:
                await conn.run(f"/file remove {_RESTORE_RSC}", check=True)
            except Exception as cleanup_err:
                logger.warning(
                    "Failed to clean up %s from device %s: %s",
                    _RESTORE_RSC,
                    ip,
                    cleanup_err,
                )

    except Exception as push_err:
        logger.error(
            "SSH push phase failed for device %s (%s): %s",
            hostname,
            ip,
            push_err,
        )
        # Update push operation to failed
        await _update_push_op_status(push_op_id, "failed", db_session)
        await _publish_push_progress(
            tenant_id,
            device_id,
            "failed",
            f"Config push failed for {hostname}: {push_err}",
            push_op_id=push_op_id_str,
            error=str(push_err),
        )
        return {
            "status": "failed",
            "message": f"Config push failed during SSH phase: {push_err}",
            "pre_backup_sha": pre_backup_sha,
        }

    # Record push in Redis so the poller can detect post-push offline events
    await record_push(
        device_id=device_id,
        tenant_id=tenant_id,
        push_type="restore",
        push_operation_id=push_op_id_str,
        pre_push_commit_sha=pre_backup_sha,
    )

    # ------------------------------------------------------------------
    # Step 6: Wait 60s for config to settle
    # ------------------------------------------------------------------
    await _publish_push_progress(
        tenant_id,
        device_id,
        "settling",
        f"Config pushed to {hostname} — waiting 60s for settle",
        push_op_id=push_op_id_str,
    )

    logger.info(
        "Config pushed to device %s — waiting 60s for config to settle",
        hostname,
    )
    await asyncio.sleep(60)

    # ------------------------------------------------------------------
    # Step 7: Reachability check
    # ------------------------------------------------------------------
    await _publish_push_progress(
        tenant_id,
        device_id,
        "verifying",
        f"Verifying device {hostname} reachability",
        push_op_id=push_op_id_str,
    )

    reachable = await _check_reachability(ip, ssh_username, ssh_password)

    if reachable:
        # ------------------------------------------------------------------
        # Step 8a: Device is reachable — remove panic-revert scheduler + cleanup
        # ------------------------------------------------------------------
        logger.info("Device %s (%s) is reachable after push — committing", hostname, ip)
        try:
            async with asyncssh.connect(
                ip,
                port=22,
                username=ssh_username,
                password=ssh_password,
                known_hosts=None,
                connect_timeout=30,
            ) as conn:
                # Remove the panic-revert scheduler
                await conn.run(
                    f'/system scheduler remove "{_PANIC_REVERT_SCHEDULER}"',
                    check=False,  # Non-fatal if already removed
                )
                # Clean up the pre-push binary backup from device flash
                await conn.run(
                    f"/file remove {_PRE_PUSH_BACKUP}.backup",
                    check=False,  # Non-fatal if already removed
                )
        except Exception as cleanup_err:
            # Cleanup failure is treated as non-fatal: the operation is still
            # marked committed below.
            # NOTE(review): the scheduler's on-event loads the *pre-push*
            # backup, so if the scheduler was not actually removed the device
            # may still revert even though this op is recorded as committed —
            # confirm this is acceptable.
            logger.warning(
                "Failed to clean up panic-revert scheduler/backup on device %s: %s",
                hostname,
                cleanup_err,
            )

        await _update_push_op_status(push_op_id, "committed", db_session)
        await clear_push(device_id)
        await _publish_push_progress(
            tenant_id,
            device_id,
            "committed",
            f"Config restored successfully on {hostname}",
            push_op_id=push_op_id_str,
        )

        return {
            "status": "committed",
            "message": "Config restored successfully",
            "pre_backup_sha": pre_backup_sha,
        }

    else:
        # ------------------------------------------------------------------
        # Step 8b: Device unreachable — RouterOS is auto-reverting via scheduler
        # ------------------------------------------------------------------
        logger.warning(
            "Device %s (%s) is unreachable after push — RouterOS panic-revert scheduler "
            "will auto-revert to %s.backup",
            hostname,
            ip,
            _PRE_PUSH_BACKUP,
        )

        await _update_push_op_status(push_op_id, "reverted", db_session)
        await _publish_push_progress(
            tenant_id,
            device_id,
            "reverted",
            f"Device {hostname} unreachable — auto-reverting via panic-revert scheduler",
            push_op_id=push_op_id_str,
        )

        return {
            "status": "reverted",
            "message": (
                "Device unreachable after push; RouterOS is auto-reverting "
                "via panic-revert scheduler"
            ),
            "pre_backup_sha": pre_backup_sha,
        }
|
|
|
|
|
|
async def _check_reachability(ip: str, username: str, password: str) -> bool:
    """Check if a RouterOS device is reachable via SSH.

    Attempts to connect and run a simple command (/system identity print).
    Returns True if successful, False if the connection or the command
    fails or times out.

    Uses asyncssh (not the poller's binary API) to avoid a circular import.
    Both the connection attempt and the command itself are bounded by a
    30-second timeout, so a device that accepts the TCP/SSH session but then
    hangs is still reported as unreachable instead of blocking the caller
    indefinitely (panic-revert will handle it).

    Args:
        ip: Device IP address.
        username: SSH username.
        password: SSH password.

    Returns:
        True if reachable, False if unreachable.
    """
    try:
        async with asyncssh.connect(
            ip,
            port=22,
            username=username,
            password=password,
            known_hosts=None,
            connect_timeout=30,
        ) as conn:
            # connect_timeout only covers session setup; bound the command too.
            result = await conn.run("/system identity print", check=True, timeout=30)
            # Guard against a missing stdout so a logging detail can never
            # turn a successful check into an "unreachable" verdict.
            logger.debug("Reachability check OK for %s: %r", ip, (result.stdout or "")[:50])
            return True
    except Exception as exc:
        # Any failure (connect error, auth error, non-zero exit, timeout)
        # counts as unreachable.
        logger.info("Device %s unreachable after push: %s", ip, exc)
        return False
|
|
|
|
|
|
async def _update_push_op_status(
    push_op_id,
    new_status: str,
    db_session: AsyncSession,
) -> None:
    """Set ``status`` and ``completed_at`` on one ConfigPushOperation row.

    ``completed_at`` is stamped with the current UTC time. The statement is
    executed on ``db_session`` but not committed.

    Args:
        push_op_id: UUID of the ConfigPushOperation row.
        new_status: New status value ('committed' | 'reverted' | 'failed').
        db_session: Database session (must already have tenant context set).
    """
    from sqlalchemy import update

    finished_at = datetime.now(timezone.utc)
    statement = (
        update(ConfigPushOperation)
        .where(ConfigPushOperation.id == push_op_id)  # type: ignore[arg-type]
        .values(status=new_status, completed_at=finished_at)
    )
    # No commit here on purpose — the caller (endpoint) owns the transaction.
    await db_session.execute(statement)
|
|
|
|
|
|
async def _remove_panic_scheduler(
    ip: str, username: str, password: str, scheduler_name: str
) -> bool:
    """SSH to the device and remove the panic-revert scheduler if present.

    The scheduler is looked up by name first. When its name appears in the
    listing output, removal commands are issued for the scheduler and for the
    pre-push backup file (both with ``check=False``, so their exit status is
    not verified).

    Args:
        ip: Device IP address.
        username: SSH username.
        password: SSH password.
        scheduler_name: Name of the scheduler to look for and remove.

    Returns:
        True if the scheduler was found and the removal commands were sent.
        False if it was not listed, or if any SSH error occurred (logged).
    """
    lookup_cmd = f'/system scheduler print where name="{scheduler_name}"'
    cleanup_cmds = (
        f'/system scheduler remove [find name="{scheduler_name}"]',
        # Also clean up pre-push backup file
        f'/file remove [find name="{_PRE_PUSH_BACKUP}.backup"]',
    )
    try:
        async with asyncssh.connect(
            ip,
            username=username,
            password=password,
            known_hosts=None,
            connect_timeout=30,
        ) as conn:
            listing = await conn.run(lookup_cmd, check=False)
            if scheduler_name not in listing.stdout:
                return False  # Scheduler already gone (device reverted itself)
            for command in cleanup_cmds:
                await conn.run(command, check=False)
            return True
    except Exception as e:
        logger.error("Failed to remove panic scheduler from %s: %s", ip, e)
        return False
|
|
|
|
|
|
async def recover_stale_push_operations(db_session: AsyncSession) -> None:
    """Resolve push operations left in ``pending_verification`` (API startup).

    Selects operations whose ``started_at`` is more than 5 minutes old and
    that are still pending. For each one the device is loaded, its
    credentials are decrypted and SSH reachability is checked:

    * reachable: the panic-revert scheduler is removed if still present and
      the operation is marked ``committed`` (also when the scheduler was
      already gone — see the warning logged in that case);
    * unreachable, device row missing, or any error: marked ``failed``.

    A progress event is published for the reachable/unreachable outcomes.
    All status changes are committed once, after the loop. Nothing is
    committed when there are no stale operations.

    Args:
        db_session: Database session used for the queries, updates and the
            final commit.
    """
    from sqlalchemy import select

    from app.models.config_backup import ConfigPushOperation
    from app.models.device import Device
    from app.services.crypto import decrypt_credentials_hybrid

    stale_before = datetime.now(timezone.utc) - timedelta(minutes=5)
    pending_query = select(ConfigPushOperation).where(
        ConfigPushOperation.status == "pending_verification",
        ConfigPushOperation.started_at < stale_before,
    )
    stale_ops = (await db_session.execute(pending_query)).scalars().all()

    if not stale_ops:
        logger.info("No stale push operations to recover")
        return

    logger.warning("Found %d stale push operations to recover", len(stale_ops))

    key = settings.get_encryption_key_bytes()

    for op in stale_ops:
        try:
            # Load device
            device_rows = await db_session.execute(select(Device).where(Device.id == op.device_id))
            device = device_rows.scalar_one_or_none()
            if not device:
                logger.error("Device %s not found for stale op %s", op.device_id, op.id)
                await _update_push_op_status(op.id, "failed", db_session)
                continue

            # Decrypt credentials
            creds = json.loads(
                await decrypt_credentials_hybrid(
                    device.encrypted_credentials_transit,
                    device.encrypted_credentials,
                    str(op.tenant_id),
                    key,
                )
            )
            ssh_username = creds.get("username", "admin")
            ssh_password = creds.get("password", "")

            # Check reachability, then decide the final status and event text
            if await _check_reachability(device.ip_address, ssh_username, ssh_password):
                # Try to remove scheduler (if still there, push was good)
                scheduler_removed = await _remove_panic_scheduler(
                    device.ip_address,
                    ssh_username,
                    ssh_password,
                    op.scheduler_name,
                )
                if scheduler_removed:
                    logger.info("Recovery: committed op %s (scheduler removed)", op.id)
                else:
                    # Scheduler already gone — device may have reverted
                    logger.warning(
                        "Recovery: op %s — scheduler gone, device may have reverted. "
                        "Marking committed (device is reachable).",
                        op.id,
                    )
                outcome = "committed"
                event_text = "Recovered after API restart"
            else:
                logger.warning(
                    "Recovery: device %s unreachable, marking op %s failed",
                    op.device_id,
                    op.id,
                )
                outcome = "failed"
                event_text = "Device unreachable during recovery after API restart"

            # Shared tail for both outcomes: persist the status, then notify.
            await _update_push_op_status(op.id, outcome, db_session)
            await _publish_push_progress(
                str(op.tenant_id),
                str(op.device_id),
                outcome,
                event_text,
                push_op_id=str(op.id),
            )

        except Exception as e:
            # One bad operation must not stop recovery of the others.
            logger.error("Recovery failed for op %s: %s", op.id, e)
            await _update_push_op_status(op.id, "failed", db_session)

    await db_session.commit()
|