feat: implement Remote WinBox worker, API, frontend integration, OpenBao persistence, and supporting docs

This commit is contained in:
Jason Staack
2026-03-14 09:05:14 -05:00
parent 7af08276ea
commit 970501e453
86 changed files with 3440 additions and 3764 deletions

View File

@@ -22,7 +22,7 @@ KNOWN_INSECURE_DEFAULTS: dict[str, list[str]] = {
],
"OPENBAO_TOKEN": [
"dev-openbao-token",
"CHANGE_ME_IN_PRODUCTION",
"",
],
}
@@ -43,7 +43,8 @@ def validate_production_settings(settings: "Settings") -> None:
f"FATAL: {field} uses a known insecure default in '{settings.ENVIRONMENT}' environment.\n"
f"Generate a secure value and set it in your .env.prod file.\n"
f"For JWT_SECRET_KEY: python -c \"import secrets; print(secrets.token_urlsafe(64))\"\n"
f"For CREDENTIAL_ENCRYPTION_KEY: python -c \"import secrets, base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\"",
f"For CREDENTIAL_ENCRYPTION_KEY: python -c \"import secrets, base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\"\n"
f"For OPENBAO_TOKEN: use the token from your OpenBao server (not the dev token)",
file=sys.stderr,
)
sys.exit(1)
@@ -92,7 +93,7 @@ class Settings(BaseSettings):
# OpenBao Transit (KMS for per-tenant credential encryption)
OPENBAO_ADDR: str = "http://localhost:8200"
OPENBAO_TOKEN: str = "dev-openbao-token"
OPENBAO_TOKEN: str = ""
# First admin bootstrap
FIRST_ADMIN_EMAIL: Optional[str] = None
@@ -119,7 +120,7 @@ class Settings(BaseSettings):
SMTP_USER: Optional[str] = None
SMTP_PASSWORD: Optional[str] = None
SMTP_USE_TLS: bool = False
SMTP_FROM_ADDRESS: str = "noreply@mikrotik-portal.local"
SMTP_FROM_ADDRESS: str = "noreply@the-other-dude.local"
# Password reset
PASSWORD_RESET_TOKEN_EXPIRE_MINUTES: int = 30

View File

@@ -1,7 +1,8 @@
"""FastAPI application entry point."""
import asyncio
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from typing import AsyncGenerator, Optional
import structlog
from fastapi import FastAPI
@@ -232,11 +233,80 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
except Exception as exc:
logger.warning("retention scheduler could not start (API will run without it)", error=str(exc))
# Start Remote WinBox session reconciliation loop (60s interval).
# Detects orphaned sessions (worker lost them) and cleans up Redis + tunnels.
winbox_reconcile_task: Optional[asyncio.Task] = None # type: ignore[type-arg]
try:
from app.routers.winbox_remote import _get_redis as _wb_get_redis, _close_tunnel
from app.services.winbox_remote import get_session as _wb_worker_get, health_check as _wb_health
async def _winbox_reconcile_loop() -> None:
    """Reconcile Redis ``winbox-remote:*`` records with the worker every 60s.

    On each pass every matching key is inspected:

    * unparseable records and records without a ``session_id`` are deleted;
    * records whose status is not ``creating``/``active``/``grace`` are left alone;
    * for the rest the worker is queried, and when it reports no session the
      tunnel (if one is recorded) is closed and the key is deleted.

    Cancellation ends the loop; any other error is logged and the loop keeps
    running.
    """
    import json as _json

    live_states = ("creating", "active", "grace")

    async def _reconcile_key(rd, key) -> None:
        """Apply the reconciliation rules to a single Redis key."""
        raw = await rd.get(key)
        if raw is None:
            return
        try:
            sess = _json.loads(raw)
        except Exception:
            await rd.delete(key)
            return
        if sess.get("status") not in live_states:
            return
        session_id = sess.get("session_id")
        if not session_id:
            await rd.delete(key)
            return
        # Health-check against worker
        if await _wb_worker_get(session_id) is not None:
            return
        # Worker lost the session — clean up
        logger.warning(
            "reconcile: worker lost session %s, cleaning up",
            session_id,
        )
        tunnel_id = sess.get("tunnel_id")
        if tunnel_id:
            await _close_tunnel(tunnel_id)
        await rd.delete(key)

    while True:
        try:
            await asyncio.sleep(60)
            rd = await _wb_get_redis()
            cursor = "0"
            scanning = True
            while scanning:
                cursor, keys = await rd.scan(
                    cursor=cursor, match="winbox-remote:*", count=100
                )
                for key in keys:
                    await _reconcile_key(rd, key)
                # SCAN is finished once the server hands back cursor 0.
                scanning = not (cursor == "0" or cursor == 0)
        except asyncio.CancelledError:
            break
        except Exception as exc:
            logger.warning("winbox reconcile loop error: %s", exc)
winbox_reconcile_task = asyncio.create_task(_winbox_reconcile_loop())
except Exception as exc:
logger.warning("winbox reconcile loop could not start (non-fatal)", error=str(exc))
logger.info("startup complete, ready to serve requests")
yield
# Shutdown
logger.info("shutting down TOD API")
if winbox_reconcile_task and not winbox_reconcile_task.done():
winbox_reconcile_task.cancel()
try:
await winbox_reconcile_task
except asyncio.CancelledError:
pass
await stop_backup_scheduler()
await stop_nats_subscriber(nats_connection)
await stop_metrics_subscriber(metrics_nc)
@@ -311,6 +381,7 @@ def create_app() -> FastAPI:
from app.routers.transparency import router as transparency_router
from app.routers.settings import router as settings_router
from app.routers.remote_access import router as remote_access_router
from app.routers.winbox_remote import router as winbox_remote_router
app.include_router(auth_router, prefix="/api")
app.include_router(tenants_router, prefix="/api")
@@ -339,6 +410,7 @@ def create_app() -> FastAPI:
app.include_router(transparency_router, prefix="/api")
app.include_router(settings_router, prefix="/api")
app.include_router(remote_access_router, prefix="/api")
app.include_router(winbox_remote_router, prefix="/api")
# Health check endpoints
@app.get("/health", tags=["health"])

View File

@@ -164,6 +164,67 @@ async def get_current_user(
)
async def get_current_user_ws(
    websocket: "WebSocket",
) -> CurrentUser:
    """
    WebSocket authentication helper.

    Extracts the JWT from the ``access_token`` cookie or, failing that, the
    ``token`` query parameter, decodes it, and returns a :class:`CurrentUser`.
    Unlike :func:`get_current_user` this does **not** touch the database (no
    RLS tenant context) because WebSocket handlers typically manage their own
    DB sessions.

    Args:
        websocket: The incoming WebSocket connection.

    Returns:
        The authenticated user built from the token claims.

    Raises:
        WebSocketException 1008: If no token is provided, the token is invalid
            or expired, the payload lacks ``sub``/``role``, ``sub`` is not a
            UUID, or a non-super_admin token carries no usable tenant id.
    """
    from fastapi import WebSocketException

    # Cookie takes precedence; the query parameter is the fallback.
    token: Optional[str] = websocket.cookies.get("access_token")
    if not token:
        token = websocket.query_params.get("token")
    if not token:
        raise WebSocketException(code=1008, reason="Not authenticated")

    try:
        payload = verify_token(token, expected_type="access")
    except HTTPException:
        # The HTTP error is an implementation detail; surface a WS close code only.
        raise WebSocketException(code=1008, reason="Invalid or expired token") from None

    user_id_str = payload.get("sub")
    tenant_id_str = payload.get("tenant_id")
    role = payload.get("role")
    if not user_id_str or not role:
        raise WebSocketException(code=1008, reason="Invalid token payload")

    try:
        user_id = uuid.UUID(user_id_str)
    except ValueError:
        raise WebSocketException(code=1008, reason="Invalid token payload") from None

    tenant_id: Optional[uuid.UUID] = None
    if tenant_id_str:
        try:
            tenant_id = uuid.UUID(tenant_id_str)
        except ValueError:
            # A malformed tenant id is treated as "no tenant"; the role check
            # below rejects it for everyone except super_admin.
            tenant_id = None

    if role != "super_admin" and tenant_id is None:
        raise WebSocketException(code=1008, reason="Invalid token: no tenant context")

    return CurrentUser(
        user_id=user_id,
        tenant_id=tenant_id,
        role=role,
    )
async def get_optional_current_user(
request: Request,
credentials: Annotated[Optional[HTTPAuthorizationCredentials], Depends(bearer_scheme)] = None,

View File

@@ -817,7 +817,7 @@ async def get_emergency_kit_template(
io.BytesIO(pdf_bytes),
media_type="application/pdf",
headers={
"Content-Disposition": 'attachment; filename="MikroTik-Portal-Emergency-Kit.pdf"',
"Content-Disposition": 'attachment; filename="The-Other-Dude-Emergency-Kit.pdf"',
},
)

View File

@@ -29,6 +29,7 @@ from app.schemas.remote_access import (
TunnelStatusItem,
WinboxSessionResponse,
)
from app.schemas.winbox_remote import RemoteWinboxSessionItem
from app.middleware.rate_limit import limiter
from app.services.audit_service import log_action
from sqlalchemy import select
@@ -329,4 +330,26 @@ async def list_sessions(
logger.warning("tunnel.status.list NATS request failed: %s", exc)
# Return empty list rather than error — poller may be unavailable
return ActiveSessionsResponse(winbox_tunnels=tunnels, ssh_sessions=[])
# Query Redis for remote winbox (browser) sessions for this device
remote_winbox: list[RemoteWinboxSessionItem] = []
try:
rd = await _get_redis()
pattern = f"winbox-remote:{device_id}:*"
cursor, keys = await rd.scan(0, match=pattern, count=100)
while keys or cursor:
for key in keys:
raw = await rd.get(key)
if raw:
data = json.loads(raw)
remote_winbox.append(RemoteWinboxSessionItem(**data))
if not cursor:
break
cursor, keys = await rd.scan(cursor, match=pattern, count=100)
except Exception as exc:
logger.warning("Redis winbox-remote scan failed: %s", exc)
return ActiveSessionsResponse(
winbox_tunnels=tunnels,
ssh_sessions=[],
remote_winbox_sessions=remote_winbox,
)

View File

@@ -7,6 +7,7 @@ Transit encryption for passwords. Falls back to .env values.
import logging
from typing import Optional
import redis.asyncio as aioredis
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy import text
@@ -153,3 +154,20 @@ async def test_smtp(
return await send_test_email(data.to, config)
return conn_result
@router.delete("/winbox-sessions")
async def clear_winbox_sessions(user=Depends(require_role("super_admin"))):
    """Delete every WinBox remote session key and rate-limit key from Redis.

    Restricted to ``super_admin``. Returns ``{"status": "ok", "deleted": N}``
    where ``N`` is the number of keys Redis reports as removed.
    """
    rd = aioredis.from_url(settings.REDIS_URL, decode_responses=True)
    try:
        removed = 0
        for pattern in ("winbox-remote:*", "winbox-remote-rate:*"):
            # Collect all matches for this pattern, then drop them in one DEL.
            matched = [key async for key in rd.scan_iter(match=pattern)]
            if matched:
                removed += await rd.delete(*matched)
        return {"status": "ok", "deleted": removed}
    finally:
        await rd.aclose()

View File

@@ -0,0 +1,781 @@
"""
Remote WinBox (Browser) endpoints — Xpra-based in-browser WinBox sessions.
All routes are tenant-scoped under /api/tenants/{tenant_id}/devices/{device_id}.
RBAC: operator+ required for all endpoints.
"""
import asyncio
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Optional
import httpx
import nats
import redis.asyncio as aioredis
from fastapi import (
APIRouter,
Depends,
HTTPException,
Request,
WebSocket,
WebSocketDisconnect,
status,
)
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import get_db
from app.middleware.rbac import require_operator_or_above
from app.middleware.rate_limit import limiter
from app.middleware.tenant_context import CurrentUser, get_current_user
from app.models.device import Device
from app.schemas.winbox_remote import (
RemoteWinboxCreateRequest,
RemoteWinboxSessionResponse,
RemoteWinboxState,
RemoteWinboxStatusResponse,
RemoteWinboxTerminateResponse,
)
from app.services.audit_service import log_action
from app.services.winbox_remote import (
WorkerCapacityError,
WorkerLaunchError,
create_session as worker_create_session,
get_session as worker_get_session,
terminate_session as worker_terminate_session,
)
logger = logging.getLogger(__name__)
router = APIRouter(tags=["winbox-remote"])
REDIS_PREFIX = "winbox-remote:"
RATE_PREFIX = "winbox-remote-rate:"
# ---------------------------------------------------------------------------
# Lazy NATS and Redis clients (same pattern as remote_access.py)
# ---------------------------------------------------------------------------
_nc: Optional[nats.aio.client.Client] = None
_redis: Optional[aioredis.Redis] = None
async def _get_nats() -> nats.aio.client.Client:
    """Return the module-wide NATS client, connecting if none is open."""
    global _nc
    needs_connect = _nc is None or _nc.is_closed
    if needs_connect:
        _nc = await nats.connect(settings.NATS_URL)
    return _nc
async def _get_redis() -> aioredis.Redis:
    """Return the module-wide Redis client, creating it on first use."""
    global _redis
    if _redis is not None:
        return _redis
    _redis = aioredis.from_url(settings.REDIS_URL, decode_responses=True)
    return _redis
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _source_ip(request: Request) -> Optional[str]:
    """Return the caller's IP: ``x-real-ip`` header first, else the socket peer."""
    forwarded = request.headers.get("x-real-ip")
    if forwarded:
        return forwarded
    return request.client.host if request.client else None
async def _get_device(db: AsyncSession, tenant_id: uuid.UUID, device_id: uuid.UUID) -> Device:
    """Load the device with the given id within the tenant, or raise 404."""
    query = select(Device).where(Device.id == device_id, Device.tenant_id == tenant_id)
    device = (await db.execute(query)).scalar_one_or_none()
    if device:
        return device
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Device not found")
async def _check_tenant_access(
    current_user: CurrentUser, tenant_id: uuid.UUID, db: AsyncSession
) -> None:
    """Authorize the user for ``tenant_id``.

    Regular users must belong to the tenant, otherwise a 403 is raised. For
    super admins the DB session's tenant context is switched to ``tenant_id``.
    """
    if not current_user.is_super_admin:
        if current_user.tenant_id != tenant_id:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Access denied: you do not belong to this tenant.",
            )
        return

    from app.database import set_tenant_context

    await set_tenant_context(db, str(tenant_id))
async def _check_rate_limit(user_id: uuid.UUID) -> None:
    """Allow max 3 session creates per 5 minutes per user.

    A per-user Redis counter is incremented on every call and expires 300
    seconds after the first hit of a window.

    Raises:
        HTTPException 429: When the counter exceeds 3 within the window.
    """
    rd = await _get_redis()
    key = f"{RATE_PREFIX}{user_id}"
    count = await rd.incr(key)
    # INCR and EXPIRE are separate commands. If the EXPIRE after the first hit
    # was ever lost, the counter would have no TTL and the user would stay
    # locked out forever — so re-arm the window whenever the key has no expiry
    # (TTL reports -1 for "exists, no expiry").
    if count == 1 or await rd.ttl(key) == -1:
        await rd.expire(key, 300)
    if count > 3:
        raise HTTPException(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            detail="Too many session requests. Try again later.",
        )
async def _get_session_from_redis(session_id: str) -> Optional[dict]:
    """Fetch and JSON-decode the stored session record, or ``None`` if absent."""
    rd = await _get_redis()
    stored = await rd.get(f"{REDIS_PREFIX}{session_id}")
    return None if stored is None else json.loads(stored)
async def _save_session_to_redis(session_id: str, data: dict, ttl: int = 14400) -> None:
    """Store the session record as JSON under its key, expiring after ``ttl`` seconds."""
    rd = await _get_redis()
    redis_key = f"{REDIS_PREFIX}{session_id}"
    # default=str: values json cannot encode natively are stored as their str().
    encoded = json.dumps(data, default=str)
    await rd.setex(redis_key, ttl, encoded)
async def _delete_session_from_redis(session_id: str) -> None:
    """Remove the session record for ``session_id`` from Redis."""
    redis_key = f"{REDIS_PREFIX}{session_id}"
    client = await _get_redis()
    await client.delete(redis_key)
async def _open_tunnel(
    device_id: uuid.UUID, tenant_id: uuid.UUID, user_id: uuid.UUID
) -> dict:
    """Open a TCP tunnel to device port 8291 via NATS request-reply.

    Sends a ``tunnel.open`` request and returns the decoded JSON reply.

    Raises:
        HTTPException 503: If the NATS request fails or times out (10s), the
            reply is not a JSON object, or the reply carries an ``error`` key.
    """
    payload = json.dumps({
        "device_id": str(device_id),
        "tenant_id": str(tenant_id),
        "user_id": str(user_id),
        "target_port": 8291,
    }).encode()

    try:
        nc = await _get_nats()
        msg = await nc.request("tunnel.open", payload, timeout=10)
    except Exception as exc:
        logger.error("NATS tunnel.open failed: %s", exc)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Tunnel service unavailable",
        ) from exc

    try:
        data = json.loads(msg.data)
    except Exception as exc:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Invalid response from tunnel service",
        ) from exc

    # Valid JSON that is not an object (null, number, list...) would otherwise
    # blow up on the membership test / callers' .get() and surface as a 500.
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Invalid response from tunnel service",
        )

    if "error" in data:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=data["error"]
        )
    return data
async def _close_tunnel(tunnel_id: str) -> None:
    """Close a tunnel via NATS — idempotent and best-effort.

    Sends a ``tunnel.close`` request. Failures never propagate (the tunnel may
    already be closed); they are recorded at debug level for troubleshooting.
    """
    try:
        nc = await _get_nats()
        payload = json.dumps({"tunnel_id": tunnel_id}).encode()
        await nc.request("tunnel.close", payload, timeout=10)
    except Exception as exc:
        # Idempotent — tunnel may already be closed
        logger.debug("tunnel.close for %s ignored: %s", tunnel_id, exc)
# ---------------------------------------------------------------------------
# POST — Create a Remote WinBox (Browser) session
# ---------------------------------------------------------------------------
async def _ensure_no_active_session(device_id: uuid.UUID, user_id: uuid.UUID) -> None:
    """Raise 409 if the user already has a live session on the device.

    Scans all session records in Redis. A matching record in state
    ``creating``/``active``/``grace`` blocks creation only when the worker
    still knows the session; otherwise the stale record is cleaned up (tunnel
    closed, Redis key deleted) and the scan continues.
    """
    rd = await _get_redis()
    cursor = "0"
    while True:
        cursor, keys = await rd.scan(cursor=cursor, match=f"{REDIS_PREFIX}*", count=100)
        for key in keys:
            raw = await rd.get(key)
            if raw is None:
                continue
            try:
                sess = json.loads(raw)
            except Exception:
                continue
            if not isinstance(sess, dict):
                continue
            is_live_duplicate = (
                sess.get("device_id") == str(device_id)
                and sess.get("user_id") == str(user_id)
                and sess.get("status") in ("creating", "active", "grace")
            )
            if not is_live_duplicate:
                continue

            # Verify the worker actually has this session — if not, clean up
            # the stale Redis entry instead of blocking the user.
            stale_sid = sess.get("session_id", "")
            try:
                worker_info = await worker_get_session(stale_sid)
            except Exception:
                worker_info = None
            if worker_info is not None:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail="Active session already exists for this device",
                )
            logger.warning(
                "Cleaning stale Redis session %s (worker 404)", stale_sid
            )
            tunnel_id = sess.get("tunnel_id")
            if tunnel_id:
                await _close_tunnel(tunnel_id)
            await _delete_session_from_redis(stale_sid)
        if cursor == "0" or cursor == 0:
            break


@router.post(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions",
    response_model=RemoteWinboxSessionResponse,
    summary="Create a Remote WinBox browser session",
    dependencies=[Depends(require_operator_or_above)],
)
@limiter.limit("10/minute")
async def create_winbox_remote_session(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    request: Request,
    body: RemoteWinboxCreateRequest = RemoteWinboxCreateRequest(),
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> RemoteWinboxSessionResponse:
    """
    Create an Xpra-based WinBox session accessible via WebSocket in the browser.

    Flow: auth -> tenant check -> device exists -> duplicate check -> rate limit ->
    credential decrypt -> tunnel open -> worker create -> Redis save -> audit log.

    Any failure after the tunnel is opened rolls back everything acquired so
    far (worker session, tunnel, Redis record) before the error is returned.

    Raises:
        HTTPException 403/404: Tenant access denied / device not found.
        HTTPException 409: An active session already exists for this user+device.
        HTTPException 429: Per-user creation rate limit exceeded.
        HTTPException 503: Credentials, tunnel, or worker unavailable, or any
            unexpected failure during creation.
    """
    await _check_tenant_access(current_user, tenant_id, db)
    device = await _get_device(db, tenant_id, device_id)
    source_ip = _source_ip(request)

    # Check for duplicate active session for this user+device
    await _ensure_no_active_session(device_id, current_user.user_id)

    # Rate limit
    await _check_rate_limit(current_user.user_id)

    # Decrypt device credentials
    try:
        from app.services.crypto import decrypt_credentials_hybrid

        creds_json = await decrypt_credentials_hybrid(
            transit_ciphertext=device.encrypted_credentials_transit,
            legacy_ciphertext=device.encrypted_credentials,
            tenant_id=str(tenant_id),
            legacy_key=settings.get_encryption_key_bytes(),
        )
        creds = json.loads(creds_json)
        username = creds.get("username", "")
        password = creds.get("password", "")
    except Exception as exc:
        logger.error("Failed to decrypt credentials for device %s: %s", device_id, exc)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Unable to retrieve device credentials",
        )

    session_id = str(uuid.uuid4())
    now = datetime.now(timezone.utc)
    # Track what has been acquired so the rollback below releases exactly that.
    tunnel_id = ""
    worker_started = False

    try:
        # Open tunnel to device
        tunnel_data = await _open_tunnel(device_id, tenant_id, current_user.user_id)
        tunnel_id = tunnel_data.get("tunnel_id", "")
        tunnel_port = tunnel_data.get("local_port")

        if not isinstance(tunnel_port, int) or not (49000 <= tunnel_port <= 49100):
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail="Invalid port allocation from tunnel service",
            )

        # Create session on worker
        # Tunnel listener runs on the poller container, reachable via Docker DNS
        try:
            worker_resp = await worker_create_session(
                session_id=session_id,
                device_ip="tod_poller",
                device_port=tunnel_port,
                username=username,
                password=password,
                idle_timeout_seconds=body.idle_timeout_seconds,
                max_lifetime_seconds=body.max_lifetime_seconds,
            )
        except WorkerCapacityError:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail="No capacity for new sessions",
            )
        except WorkerLaunchError as exc:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail=f"Session launch failed: {exc}",
            )
        finally:
            # Drop local references to the plaintext credentials
            username = ""  # noqa: F841
            password = ""  # noqa: F841
        worker_started = True

        expires_at = datetime.fromisoformat(
            worker_resp.get("expires_at", now.isoformat())
        )
        max_expires_at = datetime.fromisoformat(
            worker_resp.get("max_expires_at", now.isoformat())
        )

        # Save session to Redis
        session_data = {
            "session_id": session_id,
            "tenant_id": str(tenant_id),
            "device_id": str(device_id),
            "user_id": str(current_user.user_id),
            "tunnel_id": tunnel_id,
            "tunnel_port": tunnel_port,
            "status": RemoteWinboxState.active.value,
            "created_at": now.isoformat(),
            "expires_at": expires_at.isoformat(),
            "max_expires_at": max_expires_at.isoformat(),
            "idle_timeout_seconds": body.idle_timeout_seconds,
            "max_lifetime_seconds": body.max_lifetime_seconds,
            "xpra_ws_port": worker_resp.get("xpra_ws_port"),
        }
        await _save_session_to_redis(session_id, session_data, ttl=body.max_lifetime_seconds + 60)

        # Audit log (best-effort: never fails the request)
        try:
            await log_action(
                db,
                tenant_id,
                current_user.user_id,
                "winbox_remote_session_create",
                resource_type="device",
                resource_id=str(device_id),
                device_id=device_id,
                details={"session_id": session_id, "source_ip": source_ip},
                ip_address=source_ip,
            )
        except Exception as exc:
            logger.debug("audit log for session %s failed: %s", session_id, exc)

        ws_path = (
            f"/api/tenants/{tenant_id}/devices/{device_id}"
            f"/winbox-remote-sessions/{session_id}/ws"
        )

        return RemoteWinboxSessionResponse(
            session_id=uuid.UUID(session_id),
            websocket_path=ws_path,
            expires_at=expires_at,
            max_expires_at=max_expires_at,
            idle_timeout_seconds=body.idle_timeout_seconds,
            max_lifetime_seconds=body.max_lifetime_seconds,
            xpra_ws_port=worker_resp.get("xpra_ws_port"),
        )

    except Exception as exc:
        # Full rollback, in order worker -> tunnel -> redis. Each step is
        # best-effort so a failing cleanup cannot mask the original error.
        if worker_started:
            try:
                await worker_terminate_session(session_id)
            except Exception as cleanup_exc:
                logger.warning(
                    "rollback: worker terminate failed for session %s: %s",
                    session_id,
                    cleanup_exc,
                )
        if tunnel_id:
            await _close_tunnel(tunnel_id)
        try:
            await _delete_session_from_redis(session_id)
        except Exception as cleanup_exc:
            logger.warning(
                "rollback: Redis delete failed for session %s: %s",
                session_id,
                cleanup_exc,
            )

        if isinstance(exc, HTTPException):
            # Already carries the intended status/detail for the client.
            raise
        logger.error("Unexpected error creating winbox remote session: %s", exc)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Session creation failed",
        ) from exc
# ---------------------------------------------------------------------------
# GET — Session status
# ---------------------------------------------------------------------------
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}",
    response_model=RemoteWinboxStatusResponse,
    summary="Get Remote WinBox session status",
    dependencies=[Depends(require_operator_or_above)],
)
async def get_winbox_remote_session(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> RemoteWinboxStatusResponse:
    """Return the stored status of one Remote WinBox session.

    Responds 404 when the session is unknown or belongs to a different
    tenant/device than the one in the path.
    """
    await _check_tenant_access(current_user, tenant_id, db)

    record = await _get_session_from_redis(str(session_id))
    in_scope = (
        record is not None
        and record.get("tenant_id") == str(tenant_id)
        and record.get("device_id") == str(device_id)
    )
    if not in_scope:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="Session not found"
        )

    state = RemoteWinboxState(record.get("status", "active"))
    return RemoteWinboxStatusResponse(
        session_id=uuid.UUID(record["session_id"]),
        status=state,
        created_at=datetime.fromisoformat(record["created_at"]),
        expires_at=datetime.fromisoformat(record["expires_at"]),
        max_expires_at=datetime.fromisoformat(record["max_expires_at"]),
        idle_timeout_seconds=record.get("idle_timeout_seconds", 600),
        max_lifetime_seconds=record.get("max_lifetime_seconds", 7200),
        xpra_ws_port=record.get("xpra_ws_port"),
    )
# ---------------------------------------------------------------------------
# GET — List sessions for a device
# ---------------------------------------------------------------------------
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions",
    response_model=list[RemoteWinboxStatusResponse],
    summary="List Remote WinBox sessions for a device",
    dependencies=[Depends(require_operator_or_above)],
)
async def list_winbox_remote_sessions(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> list[RemoteWinboxStatusResponse]:
    """List every stored Remote WinBox session for the given tenant and device.

    Walks all session keys in Redis, skips missing or unparseable records, and
    keeps those whose tenant and device match the path.
    """
    await _check_tenant_access(current_user, tenant_id, db)

    wanted_tenant = str(tenant_id)
    wanted_device = str(device_id)
    found = []
    rd = await _get_redis()
    cursor = "0"
    finished = False
    while not finished:
        cursor, keys = await rd.scan(cursor=cursor, match=f"{REDIS_PREFIX}*", count=100)
        for key in keys:
            raw = await rd.get(key)
            if raw is None:
                continue
            try:
                sess = json.loads(raw)
            except Exception:
                continue
            if sess.get("tenant_id") != wanted_tenant:
                continue
            if sess.get("device_id") != wanted_device:
                continue
            item = RemoteWinboxStatusResponse(
                session_id=uuid.UUID(sess["session_id"]),
                status=RemoteWinboxState(sess.get("status", "active")),
                created_at=datetime.fromisoformat(sess["created_at"]),
                expires_at=datetime.fromisoformat(sess["expires_at"]),
                max_expires_at=datetime.fromisoformat(sess["max_expires_at"]),
                idle_timeout_seconds=sess.get("idle_timeout_seconds", 600),
                max_lifetime_seconds=sess.get("max_lifetime_seconds", 7200),
                xpra_ws_port=sess.get("xpra_ws_port"),
            )
            found.append(item)
        # Cursor 0 from the server marks the end of the SCAN iteration.
        finished = cursor == "0" or cursor == 0
    return found
# ---------------------------------------------------------------------------
# DELETE — Terminate session (idempotent)
# ---------------------------------------------------------------------------
@router.delete(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}",
    response_model=RemoteWinboxTerminateResponse,
    summary="Terminate a Remote WinBox session",
    dependencies=[Depends(require_operator_or_above)],
)
async def terminate_winbox_remote_session(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
    request: Request,
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> RemoteWinboxTerminateResponse:
    """Terminate a Remote WinBox session (idempotent).

    An unknown session is reported as already terminated. Otherwise the worker
    session is terminated, the tunnel closed, the Redis record removed, and an
    audit entry written (best-effort).

    Raises:
        HTTPException 404: The session belongs to a different tenant or device
            than the one addressed in the path.
    """
    await _check_tenant_access(current_user, tenant_id, db)
    source_ip = _source_ip(request)

    sess = await _get_session_from_redis(str(session_id))

    # Idempotent — if already gone, return terminated
    if sess is None:
        return RemoteWinboxTerminateResponse(
            session_id=session_id,
            status=RemoteWinboxState.terminated,
            reason="Session already terminated or not found",
        )

    # Same scoping rule as the status, proxy and WebSocket handlers: the session
    # must match both the tenant and the device in the URL.
    if sess.get("tenant_id") != str(tenant_id) or sess.get("device_id") != str(device_id):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="Session not found"
        )

    # Rollback order: worker -> tunnel -> redis -> audit
    await worker_terminate_session(str(session_id))

    tunnel_id = sess.get("tunnel_id")
    if tunnel_id:
        await _close_tunnel(tunnel_id)

    await _delete_session_from_redis(str(session_id))

    try:
        await log_action(
            db,
            tenant_id,
            current_user.user_id,
            "winbox_remote_session_terminate",
            resource_type="device",
            resource_id=str(device_id),
            device_id=device_id,
            details={"session_id": str(session_id), "source_ip": source_ip},
            ip_address=source_ip,
        )
    except Exception as exc:
        # Auditing must not fail the termination itself.
        logger.debug("audit log for session %s failed: %s", session_id, exc)

    return RemoteWinboxTerminateResponse(
        session_id=session_id,
        status=RemoteWinboxState.terminated,
        reason="Terminated by user",
    )
# ---------------------------------------------------------------------------
# HTTP Proxy — Serve Xpra HTML5 client files from worker
# ---------------------------------------------------------------------------
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}"
    "/winbox-remote-sessions/{session_id}/xpra/{path:path}",
    summary="Proxy Xpra HTML5 client files",
    dependencies=[Depends(require_operator_or_above)],
)
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}"
    "/winbox-remote-sessions/{session_id}/xpra",
    summary="Proxy Xpra HTML5 client (root)",
    dependencies=[Depends(require_operator_or_above)],
)
async def proxy_xpra_html(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
    request: Request,
    path: str = "",
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Forward a GET for Xpra HTML5 client files to the worker and relay the reply.

    The session must exist in Redis and match the tenant and device in the
    path (404 otherwise) and must have an ``xpra_ws_port`` recorded (503
    otherwise). An unreachable worker yields 502. Only the content-type,
    cache-control and content-encoding headers of the upstream reply are
    passed through.
    """
    from starlette.responses import Response

    await _check_tenant_access(current_user, tenant_id, db)

    sess = await _get_session_from_redis(str(session_id))
    session_matches = (
        sess is not None
        and sess.get("tenant_id") == str(tenant_id)
        and sess.get("device_id") == str(device_id)
    )
    if not session_matches:
        raise HTTPException(status_code=404, detail="Session not found")

    xpra_ws_port = sess.get("xpra_ws_port")
    if not xpra_ws_port:
        raise HTTPException(status_code=503, detail="Xpra port unavailable")

    # Proxy the request to Xpra's built-in HTTP server
    target_url = f"http://tod_winbox_worker:{xpra_ws_port}/{path}"
    forwarded_query = dict(request.query_params)
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0)) as client:
            proxy_resp = await client.get(target_url, params=forwarded_query)
    except Exception as exc:
        logger.error("Xpra HTTP proxy error: %s", exc)
        raise HTTPException(status_code=502, detail="Xpra server unreachable")

    # Forward the response with correct content type
    passthrough = ("content-type", "cache-control", "content-encoding")
    relayed_headers = {}
    for header_name, header_value in proxy_resp.headers.items():
        if header_name.lower() in passthrough:
            relayed_headers[header_name] = header_value
    return Response(
        content=proxy_resp.content,
        status_code=proxy_resp.status_code,
        headers=relayed_headers,
    )
# ---------------------------------------------------------------------------
# WebSocket — Proxy browser <-> Xpra worker
# ---------------------------------------------------------------------------
@router.websocket(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}/ws"
)
async def winbox_remote_ws_proxy(
    websocket: WebSocket,
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
) -> None:
    """
    Bidirectional WebSocket proxy between the browser and the worker's Xpra
    WebSocket. Authentication via access_token cookie or query param.

    1. Authenticate via cookie/query param token
    2. Validate session in Redis (ownership, status, expiry)
    3. Resolve Xpra WebSocket port (stored value, else ask the worker)
    4. Accept browser WebSocket upgrade
    5. Proxy bidirectionally until either side closes

    Rejections happen before the upgrade is accepted, using close codes
    4001 (authentication), 4003 (authorization) and 4004 (session problems).
    """
    # --- Auth: extract token from cookie or query param ---
    token = websocket.cookies.get("access_token") or websocket.query_params.get("token")
    if not token:
        await websocket.close(code=4001, reason="Authentication required")
        return

    from app.services.auth import verify_token

    try:
        payload = verify_token(token, expected_type="access")
    except Exception:
        await websocket.close(code=4001, reason="Invalid token")
        return

    user_id_str = payload.get("sub")
    user_tenant_str = payload.get("tenant_id")
    role = payload.get("role")

    if not user_id_str or not role:
        await websocket.close(code=4001, reason="Invalid token payload")
        return

    # Tenant access check
    if role != "super_admin":
        if user_tenant_str != str(tenant_id):
            await websocket.close(code=4003, reason="Tenant access denied")
            return

    # --- Session validation ---
    sess = await _get_session_from_redis(str(session_id))
    if sess is None:
        await websocket.close(code=4004, reason="Session not found")
        return

    if sess.get("tenant_id") != str(tenant_id) or sess.get("device_id") != str(device_id):
        await websocket.close(code=4004, reason="Session not found")
        return

    # Ownership check: user must own the session (or be super_admin)
    if role != "super_admin" and sess.get("user_id") != user_id_str:
        await websocket.close(code=4003, reason="Not your session")
        return

    sess_status = sess.get("status")
    if sess_status not in ("active", "grace"):
        await websocket.close(code=4004, reason=f"Session not active (status={sess_status})")
        return

    # Check max expiry
    max_expires = datetime.fromisoformat(sess["max_expires_at"])
    if datetime.now(timezone.utc) > max_expires:
        await websocket.close(code=4004, reason="Session expired")
        return

    # Resolve Xpra WebSocket port from worker
    xpra_ws_port = sess.get("xpra_ws_port")
    if not xpra_ws_port:
        worker_info = await worker_get_session(str(session_id))
        if not worker_info:
            await websocket.close(code=4004, reason="Worker session not found")
            return
        xpra_ws_port = worker_info.get("xpra_ws_port") or worker_info.get("ws_port")

    if not xpra_ws_port:
        await websocket.close(code=4004, reason="Xpra port unavailable")
        return

    # Update last_client_connect_at in Redis. The record is rewritten with the
    # time left until max_expires_at (+60s, the same slack used at creation);
    # relying on the helper's default TTL would let the key outlive the session.
    sess["last_client_connect_at"] = datetime.now(timezone.utc).isoformat()
    seconds_left = int((max_expires - datetime.now(timezone.utc)).total_seconds())
    try:
        await _save_session_to_redis(
            str(session_id), sess, ttl=max(seconds_left + 60, 1)
        )
    except Exception as exc:
        # Bookkeeping only — never block the connection on it.
        logger.debug("could not record client connect for %s: %s", session_id, exc)

    # Accept browser WebSocket
    await websocket.accept()

    # Connect to worker Xpra WebSocket
    import websockets

    worker_ws_url = f"ws://tod_winbox_worker:{xpra_ws_port}"

    try:
        async with websockets.connect(worker_ws_url) as worker_ws:

            async def browser_to_worker() -> None:
                """Forward browser frames (binary or text) to the worker."""
                try:
                    while True:
                        message = await websocket.receive()
                        if message["type"] == "websocket.disconnect":
                            break
                        frame = message.get("bytes")
                        if frame is None:
                            frame = message.get("text")
                        if frame is not None:
                            await worker_ws.send(frame)
                except WebSocketDisconnect:
                    pass
                except Exception:
                    pass

            async def worker_to_browser() -> None:
                """Forward worker frames to the browser, preserving frame type."""
                try:
                    async for message in worker_ws:
                        if isinstance(message, bytes):
                            await websocket.send_bytes(message)
                        else:
                            await websocket.send_text(message)
                except Exception:
                    pass

            # Run both directions concurrently; stop as soon as one side ends.
            pumps = [
                asyncio.create_task(browser_to_worker()),
                asyncio.create_task(worker_to_browser()),
            ]
            try:
                await asyncio.wait(pumps, return_when=asyncio.FIRST_COMPLETED)
            finally:
                for task in pumps:
                    task.cancel()
                # Await the cancelled task so it is fully unwound before the
                # worker connection is closed.
                await asyncio.gather(*pumps, return_exceptions=True)

    except Exception as exc:
        logger.warning("WebSocket proxy error for session %s: %s", session_id, exc)
    finally:
        try:
            await websocket.close()
        except Exception:
            pass

View File

@@ -1,5 +1,7 @@
from pydantic import BaseModel, Field
from app.schemas.winbox_remote import RemoteWinboxSessionItem
class WinboxSessionResponse(BaseModel):
tunnel_id: str
@@ -37,3 +39,4 @@ class SSHSessionStatusItem(BaseModel):
class ActiveSessionsResponse(BaseModel):
    """Response model grouping session listings by session kind.

    Every list defaults to empty, so a response can be built with any
    subset of the three kinds populated.
    """

    # WinBox tunnel entries.
    winbox_tunnels: list[TunnelStatusItem] = []
    # SSH session entries.
    ssh_sessions: list[SSHSessionStatusItem] = []
    # Remote WinBox (browser) session entries.
    remote_winbox_sessions: list[RemoteWinboxSessionItem] = []

View File

@@ -0,0 +1,63 @@
"""Request/response schemas for Remote WinBox (Browser) sessions."""
import uuid
from datetime import datetime
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class RemoteWinboxState(str, Enum):
    """States a Remote WinBox session can be reported in.

    The ``str`` mixin makes every member a string instance that compares
    equal to its value (``RemoteWinboxState.active == "active"``).
    """

    creating = "creating"
    active = "active"
    # NOTE(review): exact semantics of "grace" are not visible in this
    # module — confirm against the session lifecycle code.
    grace = "grace"
    terminating = "terminating"
    terminated = "terminated"
    failed = "failed"
class RemoteWinboxCreateRequest(BaseModel):
    """Request body for creating a Remote WinBox session.

    Both fields are optional and validated against inclusive bounds.
    """

    # Defaults to 600; accepted range is 60..3600.
    idle_timeout_seconds: int = Field(default=600, ge=60, le=3600)
    # Defaults to 7200; accepted range is 300..14400.
    max_lifetime_seconds: int = Field(default=7200, ge=300, le=14400)
class RemoteWinboxSessionResponse(BaseModel):
    """Response model describing a Remote WinBox session.

    NOTE(review): presumably returned from the create endpoint — confirm
    against the router.
    """

    session_id: uuid.UUID
    # Reported as ``active`` unless the caller sets another state.
    status: RemoteWinboxState = RemoteWinboxState.active
    # NOTE(review): presumably the path the browser opens its WebSocket
    # on — confirm against the router.
    websocket_path: str
    expires_at: datetime
    max_expires_at: datetime
    idle_timeout_seconds: int
    max_lifetime_seconds: int
    # Optional; None when no port is supplied.
    xpra_ws_port: Optional[int] = None
class RemoteWinboxStatusResponse(BaseModel):
    """Status snapshot of a Remote WinBox session.

    Unlike ``RemoteWinboxSessionResponse`` this model has no
    ``websocket_path``, requires ``status`` explicitly, and carries
    ``created_at``.
    """

    session_id: uuid.UUID
    status: RemoteWinboxState
    created_at: datetime
    expires_at: datetime
    max_expires_at: datetime
    idle_timeout_seconds: int
    max_lifetime_seconds: int
    # Optional; None when no port is supplied.
    xpra_ws_port: Optional[int] = None
class RemoteWinboxTerminateResponse(BaseModel):
    """Response model for a session termination."""

    session_id: uuid.UUID
    # State reported for the session in this response.
    status: RemoteWinboxState
    # Free-form text explaining the termination.
    reason: str
class RemoteWinboxDuplicateDetail(BaseModel):
    """Payload pairing a fixed message with an existing session's status.

    NOTE(review): presumably used as the error body when a create request
    collides with a session that already exists — confirm against the
    router.
    """

    # Message text; defaults to "Active session exists".
    detail: str = "Active session exists"
    # Status of the session the message refers to.
    session: RemoteWinboxStatusResponse
class RemoteWinboxSessionItem(BaseModel):
    """Compact Remote WinBox session entry for the combined active-sessions list.

    Carries only the identifier, state, and the creation/expiry timestamps.
    """

    session_id: uuid.UUID
    status: RemoteWinboxState
    created_at: datetime
    expires_at: datetime

View File

@@ -120,7 +120,7 @@ async def _send_email(channel: dict, alert_event: dict, device_hostname: str) ->
user=channel.get("smtp_user"),
password=smtp_password,
use_tls=channel.get("smtp_use_tls", False),
from_address=channel.get("from_address") or "alerts@mikrotik-portal.local",
from_address=channel.get("from_address") or "alerts@the-other-dude.local",
)
to = channel.get("to_address")

View File

@@ -43,7 +43,7 @@ from app.services.push_tracker import record_push, clear_push
logger = logging.getLogger(__name__)
# Name of the panic-revert scheduler installed on the RouterOS device
_PANIC_REVERT_SCHEDULER = "mikrotik-portal-panic-revert"
_PANIC_REVERT_SCHEDULER = "the-other-dude-panic-revert"
# Name of the pre-push binary backup saved on device flash
_PRE_PUSH_BACKUP = "portal-pre-push"
# Name of the RSC file used for /import on device

View File

@@ -35,7 +35,7 @@ logger = logging.getLogger(__name__)
_env = SandboxedEnvironment()
# Names used on the RouterOS device during template push
_PANIC_REVERT_SCHEDULER = "mikrotik-portal-template-revert"
_PANIC_REVERT_SCHEDULER = "the-other-dude-template-revert"
_PRE_PUSH_BACKUP = "portal-template-pre-push"
_TEMPLATE_RSC = "portal-template.rsc"

View File

@@ -0,0 +1,126 @@
"""HTTP client for the winbox-worker container.
Provides async helpers to create, terminate, query, and health-check
Remote WinBox (Xpra) sessions running inside the worker container.
All communication uses the internal Docker network.
"""
import logging
from typing import Any, Optional
import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Base URL of the winbox-worker HTTP API (host "tod_winbox_worker", port 9090).
WORKER_BASE_URL = "http://tod_winbox_worker:9090"
# Header attached to every request this client sends to the worker.
_HEADERS = {"X-Internal-Service": "api"}
# Shared timeout for the session helpers: 15 s for each phase, except
# connection establishment, which is limited to 5 s.
_TIMEOUT = httpx.Timeout(15.0, connect=5.0)
class WorkerCapacityError(Exception):
    """Worker has no capacity for new sessions.

    Raised by ``create_session`` when the worker answers HTTP 503; the
    exception message is the worker's response text.
    """
class WorkerLaunchError(Exception):
    """Worker failed to launch a session.

    Raised by ``create_session`` when the worker answers with a status of
    400 or above other than 503; the message includes the status code and
    the worker's response text.
    """
async def create_session(
    session_id: str,
    device_ip: str,
    device_port: int,
    username: str,
    password: str,
    idle_timeout_seconds: int,
    max_lifetime_seconds: int,
) -> dict[str, Any]:
    """Ask the worker to launch an Xpra+WinBox session via POST /sessions.

    ``device_ip`` and ``device_port`` are forwarded to the worker as
    ``tunnel_host`` and ``tunnel_port``; all other arguments are sent under
    their own names.

    Once the request attempt has finished (successfully or not) the
    credential entries of the request dict are overwritten with empty
    strings and the local ``username``/``password`` names are unbound.
    Python strings are immutable, so this only drops references held here;
    it does not scrub the values from memory.

    Returns:
        The worker's JSON response body, decoded.

    Raises:
        WorkerCapacityError: the worker answered 503.
        WorkerLaunchError: the worker answered any other status >= 400.

    Transport-level ``httpx`` exceptions (connection failure, timeout) are
    not caught here and propagate to the caller unchanged.
    """
    request_body: dict[str, Any] = {
        "session_id": session_id,
        "tunnel_host": device_ip,
        "tunnel_port": device_port,
        "username": username,
        "password": password,
        "idle_timeout_seconds": idle_timeout_seconds,
        "max_lifetime_seconds": max_lifetime_seconds,
    }
    try:
        async with httpx.AsyncClient(
            base_url=WORKER_BASE_URL,
            headers=_HEADERS,
            timeout=_TIMEOUT,
        ) as http:
            resp = await http.post("/sessions", json=request_body)
    finally:
        # Runs on success and on failure: stop holding the credentials.
        request_body.update(username="", password="")
        del username, password

    if resp.status_code >= 400:
        # 503 is the worker's "no capacity" answer; everything else in the
        # error range is reported as a launch failure.
        if resp.status_code == 503:
            raise WorkerCapacityError(resp.text)
        raise WorkerLaunchError(f"Worker returned {resp.status_code}: {resp.text}")
    return resp.json()
async def terminate_session(session_id: str) -> bool:
    """Ask the worker to tear down a session via DELETE /sessions/{session_id}.

    A 404 is treated as "nothing to terminate" and is not logged, so the
    call can be repeated safely.

    Returns:
        True when the worker answered with a status below 400; False for a
        404 or for any other error status (the latter is logged).
    """
    client_options = {
        "base_url": WORKER_BASE_URL,
        "headers": _HEADERS,
        "timeout": _TIMEOUT,
    }
    async with httpx.AsyncClient(**client_options) as http:
        reply = await http.delete(f"/sessions/{session_id}")

    status = reply.status_code
    if status < 400:
        return True
    if status != 404:
        # Unknown session is expected; anything else is worth a log line.
        logger.error("Worker terminate error %s: %s", status, reply.text)
    return False
async def get_session(session_id: str) -> Optional[dict[str, Any]]:
    """Fetch one session from the worker via GET /sessions/{session_id}.

    Returns:
        The decoded JSON body on success, or None when the worker answers
        404 (silently) or any other status >= 400 (logged as an error).
    """
    client_options = {
        "base_url": WORKER_BASE_URL,
        "headers": _HEADERS,
        "timeout": _TIMEOUT,
    }
    async with httpx.AsyncClient(**client_options) as http:
        reply = await http.get(f"/sessions/{session_id}")

    status = reply.status_code
    if status >= 400:
        if status != 404:
            logger.error("Worker get_session error %s: %s", status, reply.text)
        return None
    return reply.json()
async def list_sessions() -> list[dict[str, Any]]:
    """Fetch every session the worker knows about via GET /sessions.

    Returns:
        The decoded JSON body when it is a list. An empty list is returned
        when the worker answers with a status >= 400 (logged as an error)
        or when the body decodes to something other than a list.
    """
    client_options = {
        "base_url": WORKER_BASE_URL,
        "headers": _HEADERS,
        "timeout": _TIMEOUT,
    }
    async with httpx.AsyncClient(**client_options) as http:
        reply = await http.get("/sessions")

    if reply.status_code >= 400:
        logger.error(
            "Worker list_sessions error %s: %s", reply.status_code, reply.text
        )
        return []

    sessions = reply.json()
    if not isinstance(sessions, list):
        return []
    return sessions
async def health_check() -> bool:
    """Probe the worker via GET /healthz.

    Uses a flat 5-second timeout instead of the module-wide one.

    Returns:
        True only when the worker answers with status 200. Any exception
        raised while probing is swallowed and reported as False.
    """
    try:
        async with httpx.AsyncClient(
            base_url=WORKER_BASE_URL,
            headers=_HEADERS,
            timeout=httpx.Timeout(5.0),
        ) as http:
            reply = await http.get("/healthz")
    except Exception:
        # Best-effort probe: an unreachable worker simply counts as unhealthy.
        return False
    else:
        return reply.status_code == 200

View File

@@ -3,7 +3,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "mikrotik-portal-backend"
name = "the-other-dude-backend"
version = "9.0.1"
description = "MikroTik Fleet Management Portal - Backend API"
requires-python = ">=3.12"

View File

@@ -16,7 +16,7 @@ async def test_recovery_commits_reachable_device_with_scheduler():
push_op.device_id = uuid4()
push_op.tenant_id = uuid4()
push_op.status = "pending_verification"
push_op.scheduler_name = "mikrotik-portal-panic-revert"
push_op.scheduler_name = "the-other-dude-panic-revert"
push_op.started_at = datetime.now(timezone.utc) - timedelta(minutes=10)
device = MagicMock()
@@ -71,7 +71,7 @@ async def test_recovery_marks_unreachable_device_failed():
push_op.device_id = uuid4()
push_op.tenant_id = uuid4()
push_op.status = "pending_verification"
push_op.scheduler_name = "mikrotik-portal-panic-revert"
push_op.scheduler_name = "the-other-dude-panic-revert"
push_op.started_at = datetime.now(timezone.utc) - timedelta(minutes=10)
device = MagicMock()