Files
the-other-dude/backend/app/services/nats_subscriber.py
Jason Staack 06a41ca9bf fix(lint): resolve all ruff lint errors
Add ruff config to exclude alembic E402, SQLAlchemy F821, and pre-existing
E501 line-length issues. Auto-fix 69 unused imports and 2 f-strings without
placeholders. Manually fix 8 unused variables. Apply ruff format to 127 files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-14 22:17:50 -05:00

237 lines
8.1 KiB
Python

"""NATS JetStream subscriber for device status events from the Go poller.
Subscribes to device.status.> and updates device records in PostgreSQL.
This is a system-level process that needs to update devices across all tenants,
so it uses the admin engine (bypasses RLS).
"""
import asyncio
import json
import logging
import re
from datetime import datetime, timezone
from typing import Optional
import nats
from nats.js import JetStreamContext
from nats.aio.client import Client as NATSClient
from sqlalchemy import text
from app.config import settings
from app.database import AdminAsyncSessionLocal
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# NATS connection recorded by start_nats_subscriber() once the subscription
# step has finished; stays None until then.
_nats_client: Optional[NATSClient] = None
# Regex for RouterOS uptime strings like "42d14h23m15s", "14h23m15s", "23m15s", "3w2d"
_UPTIME_RE = re.compile(r"(?:(\d+)w)?(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?")
def _parse_uptime(raw: str) -> int | None:
"""Parse a RouterOS uptime string into total seconds."""
if not raw:
return None
m = _UPTIME_RE.fullmatch(raw)
if not m:
return None
weeks = int(m.group(1) or 0)
days = int(m.group(2) or 0)
hours = int(m.group(3) or 0)
minutes = int(m.group(4) or 0)
seconds = int(m.group(5) or 0)
total = weeks * 604800 + days * 86400 + hours * 3600 + minutes * 60 + seconds
return total if total > 0 else None
async def on_device_status(msg) -> None:
    """Handle a device.status event published by the Go poller.

    Payload (JSON object):
        device_id (str) — UUID of the device
        tenant_id (str) — UUID of the owning tenant
        status (str) — "online" or "offline"
        routeros_version (str | None) — e.g. "7.16.2"
        major_version (int | None) — e.g. 7
        board_name (str | None) — e.g. "RB4011iGS+5HacQ2HnD"
        last_seen (str | None) — ISO-8601 timestamp
        serial_number (optional) — stored when non-empty
        firmware_version (optional) — stored when non-empty
        uptime (optional) — RouterOS uptime string such as "3w2d"; stored as seconds

    Optional fields that are absent or null leave the existing column value
    untouched (the UPDATE uses COALESCE).

    Acknowledgement policy:
        * Payloads that can never be processed (invalid JSON, not a JSON
          object, missing device_id or status) are logged and ACKed, because
          redelivering the same bytes cannot succeed.
        * Alert-evaluation failures are logged and do not prevent the ACK.
        * Any other failure (e.g. a database error) is logged and NAKed.
    """
    try:
        try:
            data = json.loads(msg.data)
        except (ValueError, TypeError) as exc:
            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            logger.warning(
                "Received device.status event with invalid JSON payload (%s) — skipping",
                exc,
            )
            await msg.ack()
            return

        if not isinstance(data, dict):
            logger.warning(
                "Received device.status event whose payload is not a JSON object — skipping"
            )
            await msg.ack()
            return

        device_id = data.get("device_id")
        status = data.get("status")
        if not device_id or not status:
            logger.warning(
                "Received device.status event with missing device_id or status — skipping"
            )
            await msg.ack()
            return

        tenant_id = data.get("tenant_id", "")
        routeros_version = data.get("routeros_version")
        major_version = data.get("major_version")
        board_name = data.get("board_name")
        last_seen_raw = data.get("last_seen")
        # Empty strings are normalised to None so COALESCE keeps the stored value.
        serial_number = data.get("serial_number") or None
        firmware_version = data.get("firmware_version") or None
        # Only strings are handed to the parser; any other JSON type means "unknown".
        uptime_raw = data.get("uptime")
        uptime_seconds = _parse_uptime(uptime_raw) if isinstance(uptime_raw, str) else None

        # Parse timestamp in Python — asyncpg needs datetime objects, not strings
        last_seen_dt = None
        if last_seen_raw:
            try:
                last_seen_dt = datetime.fromisoformat(last_seen_raw.replace("Z", "+00:00"))
            except (ValueError, AttributeError):
                # Unparseable (or non-string) timestamp: fall back to "now" in UTC.
                last_seen_dt = datetime.now(timezone.utc)

        async with AdminAsyncSessionLocal() as session:
            await session.execute(
                text(
                    """
                    UPDATE devices SET
                    status = :status,
                    routeros_version = COALESCE(:routeros_version, routeros_version),
                    routeros_major_version = COALESCE(:major_version, routeros_major_version),
                    model = COALESCE(:board_name, model),
                    serial_number = COALESCE(:serial_number, serial_number),
                    firmware_version = COALESCE(:firmware_version, firmware_version),
                    uptime_seconds = COALESCE(:uptime_seconds, uptime_seconds),
                    last_seen = COALESCE(:last_seen, last_seen),
                    updated_at = NOW()
                    WHERE id = CAST(:device_id AS uuid)
                    """
                ),
                {
                    "status": status,
                    "routeros_version": routeros_version,
                    "major_version": major_version,
                    "board_name": board_name,
                    "serial_number": serial_number,
                    "firmware_version": firmware_version,
                    "uptime_seconds": uptime_seconds,
                    "last_seen": last_seen_dt,
                    "device_id": device_id,
                },
            )
            await session.commit()

        # Alert evaluation for offline/online status changes — non-fatal
        try:
            from app.services import alert_evaluator

            if status == "offline":
                await alert_evaluator.evaluate_offline(device_id, tenant_id)
            elif status == "online":
                await alert_evaluator.evaluate_online(device_id, tenant_id)
        except Exception as e:
            logger.warning(
                "Alert evaluation failed for device %s status=%s: %s", device_id, status, e
            )

        logger.info(
            "Device status updated",
            extra={
                "device_id": device_id,
                "status": status,
                "routeros_version": routeros_version,
            },
        )
        await msg.ack()
    except Exception as exc:
        logger.error(
            "Failed to process device.status event: %s",
            exc,
            exc_info=True,
        )
        try:
            await msg.nak()
        except Exception as nak_exc:
            # If NAK also fails, NATS will redeliver after ack_wait
            logger.debug("NAK of device.status event failed: %s", nak_exc)
async def _subscribe_with_retry(js: JetStreamContext) -> None:
    """Attach the durable device.status.> consumer, tolerating a late stream.

    Makes up to six subscribe attempts against the DEVICE_EVENTS stream and
    sleeps five seconds after every failed attempt except the last one.
    Subscribe failures never propagate: once the attempts are used up a
    warning is logged and the function returns without a subscription.
    """
    attempt_limit = 6  # ~30 seconds at 5s intervals
    attempt_no = 0
    while attempt_no < attempt_limit:
        attempt_no += 1
        try:
            await js.subscribe(
                "device.status.>",
                cb=on_device_status,
                durable="api-status-consumer",
                stream="DEVICE_EVENTS",
            )
            logger.info("NATS: subscribed to device.status.> (durable: api-status-consumer)")
            return
        except Exception as err:
            if attempt_no == attempt_limit:
                # Out of attempts: degrade gracefully instead of failing startup.
                logger.warning(
                    "NATS: giving up on device.status.> after %d attempts: %s — API will run without real-time status updates",
                    attempt_limit,
                    err,
                )
                return
            logger.warning(
                "NATS: stream DEVICE_EVENTS not ready (attempt %d/%d): %s — retrying in 5s",
                attempt_no,
                attempt_limit,
                err,
            )
            await asyncio.sleep(5)
async def start_nats_subscriber() -> Optional[NATSClient]:
    """Open the NATS connection and attach the device.status.> consumer.

    The connection is created with unlimited reconnect attempts and this
    module's logging callbacks for errors, reconnects and disconnects.
    Exceptions raised by ``nats.connect`` are not caught here and propagate
    to the caller.

    Returns:
        The connected client, which is also stored in the module-level
        ``_nats_client``. Pass it to stop_nats_subscriber on shutdown.
    """
    global _nats_client

    server_url = settings.NATS_URL
    logger.info("NATS: connecting to %s", server_url)

    connect_options = {
        "max_reconnect_attempts": -1,  # reconnect forever (pod-to-pod transient failures)
        "reconnect_time_wait": 2,
        "error_cb": _on_error,
        "reconnected_cb": _on_reconnected,
        "disconnected_cb": _on_disconnected,
    }
    client = await nats.connect(server_url, **connect_options)
    logger.info("NATS: connected to %s", server_url)

    await _subscribe_with_retry(client.jetstream())

    # Recorded only after the subscription step has returned.
    _nats_client = client
    return client
async def stop_nats_subscriber(nc: Optional[NATSClient]) -> None:
    """Shut the NATS connection down, preferring a graceful drain.

    Does nothing when ``nc`` is None. Otherwise the connection is drained; if
    that raises, the error is logged as a warning and a plain ``close()`` is
    attempted, whose own failure is ignored.
    """
    if nc is not None:
        try:
            logger.info("NATS: draining connection...")
            await nc.drain()
            logger.info("NATS: connection closed")
        except Exception as drain_error:
            logger.warning("NATS: error during drain: %s", drain_error)
            try:
                await nc.close()
            except Exception:
                # Best effort only: nothing more to do if close() fails as well.
                pass
async def _on_error(exc: Exception) -> None:
    """Log an error reported by the NATS client (registered as ``error_cb``)."""
    logger.error("NATS error: %s", exc)
async def _on_reconnected() -> None:
    """Log that the NATS client reconnected (registered as ``reconnected_cb``)."""
    logger.info("NATS: reconnected")
async def _on_disconnected() -> None:
    """Log that the NATS client lost its connection (registered as ``disconnected_cb``)."""
    logger.warning("NATS: disconnected")