From 970501e453659958d44da943181781a192c25c7c Mon Sep 17 00:00:00 2001 From: Jason Staack Date: Sat, 14 Mar 2026 09:05:14 -0500 Subject: [PATCH] feat: implement Remote WinBox worker, API, frontend integration, OpenBao persistence, and supporting docs --- .env.example | 9 +- .env.staging.example | 2 +- .gitignore | 4 + backend/app/config.py | 9 +- backend/app/main.py | 74 +- backend/app/middleware/tenant_context.py | 61 + backend/app/routers/auth.py | 2 +- backend/app/routers/remote_access.py | 25 +- backend/app/routers/settings.py | 18 + backend/app/routers/winbox_remote.py | 781 +++++ backend/app/schemas/remote_access.py | 3 + backend/app/schemas/winbox_remote.py | 63 + backend/app/services/notification_service.py | 2 +- backend/app/services/restore_service.py | 2 +- backend/app/services/template_service.py | 2 +- backend/app/services/winbox_remote.py | 126 + backend/pyproject.toml | 2 +- backend/tests/test_push_recovery.py | 4 +- docker-compose.override.yml | 14 +- docker-compose.prod.yml | 54 +- docker-compose.yml | 45 +- docs/CONFIGURATION.md | 2 +- .../plans/2026-03-12-remote-access.md | 2704 ----------------- .../specs/2026-03-12-remote-access-design.md | 841 ----- docs/website/docs.html | 2 +- .../components/fleet/RemoteWinBoxButton.tsx | 295 ++ .../src/components/settings/SettingsPage.tsx | 32 +- frontend/src/lib/api.ts | 53 + frontend/src/lib/crypto/keyStore.ts | 2 +- frontend/src/lib/settingsApi.ts | 5 + .../tenants/$tenantId/devices/$deviceId.tsx | 6 +- frontend/tests/e2e/auth.setup.ts | 2 +- frontend/tests/e2e/login.spec.ts | 2 +- infrastructure/docker/nginx-spa.conf | 25 +- infrastructure/helm/templates/_helpers.tpl | 68 +- .../helm/templates/api-deployment.yaml | 18 +- .../helm/templates/api-service.yaml | 6 +- infrastructure/helm/templates/configmap.yaml | 14 +- .../helm/templates/frontend-deployment.yaml | 14 +- infrastructure/helm/templates/ingress.yaml | 18 +- .../helm/templates/nats-statefulset.yaml | 22 +- 
.../helm/templates/poller-deployment.yaml | 20 +- .../helm/templates/postgres-statefulset.yaml | 26 +- .../helm/templates/redis-deployment.yaml | 14 +- infrastructure/helm/templates/secrets.yaml | 6 +- infrastructure/helm/values.yaml | 12 +- infrastructure/openbao/config.hcl | 11 + infrastructure/openbao/init.sh | 92 +- .../apache/tod.conf.example | 71 + .../caddy/Caddyfile.example | 71 + .../haproxy/haproxy.cfg.example | 77 + .../nginx/tod.conf.example | 90 + .../traefik/traefik-dynamic.yaml.example | 93 + poller/cmd/poller/main.go | 16 +- poller/go.mod | 2 +- poller/internal/bus/backup_responder.go | 2 +- poller/internal/bus/backup_responder_test.go | 2 +- poller/internal/bus/cmd_cert_deploy.go | 6 +- poller/internal/bus/cmd_responder.go | 6 +- poller/internal/bus/credential_subscriber.go | 2 +- poller/internal/bus/publisher.go | 2 +- .../bus/publisher_integration_test.go | 4 +- poller/internal/bus/tunnel_responder.go | 6 +- poller/internal/poller/backup_scheduler.go | 10 +- .../internal/poller/backup_scheduler_test.go | 4 +- poller/internal/poller/integration_test.go | 6 +- poller/internal/poller/interfaces.go | 2 +- poller/internal/poller/scheduler.go | 8 +- poller/internal/poller/scheduler_test.go | 4 +- poller/internal/poller/worker.go | 10 +- poller/internal/sshrelay/server.go | 6 +- .../store/devices_integration_test.go | 4 +- poller/internal/testutil/containers.go | 2 +- poller/internal/tunnel/manager.go | 4 +- poller/internal/vault/cache.go | 2 +- winbox-worker/.gitignore | 2 + winbox-worker/Dockerfile | 77 + winbox-worker/cmd/worker/main.go | 174 ++ winbox-worker/go.mod | 5 + winbox-worker/go.sum | 2 + winbox-worker/internal/session/manager.go | 375 +++ .../internal/session/manager_test.go | 107 + winbox-worker/internal/session/pool.go | 60 + winbox-worker/internal/session/pool_test.go | 48 + winbox-worker/internal/session/types.go | 67 + winbox-worker/internal/session/xpra.go | 161 + 86 files changed, 3440 insertions(+), 3764 deletions(-) create mode 
100644 backend/app/routers/winbox_remote.py create mode 100644 backend/app/schemas/winbox_remote.py create mode 100644 backend/app/services/winbox_remote.py delete mode 100644 docs/superpowers/plans/2026-03-12-remote-access.md delete mode 100644 docs/superpowers/specs/2026-03-12-remote-access-design.md create mode 100644 frontend/src/components/fleet/RemoteWinBoxButton.tsx create mode 100644 infrastructure/openbao/config.hcl create mode 100644 infrastructure/reverse-proxy-examples/apache/tod.conf.example create mode 100644 infrastructure/reverse-proxy-examples/caddy/Caddyfile.example create mode 100644 infrastructure/reverse-proxy-examples/haproxy/haproxy.cfg.example create mode 100644 infrastructure/reverse-proxy-examples/nginx/tod.conf.example create mode 100644 infrastructure/reverse-proxy-examples/traefik/traefik-dynamic.yaml.example create mode 100644 winbox-worker/.gitignore create mode 100644 winbox-worker/Dockerfile create mode 100644 winbox-worker/cmd/worker/main.go create mode 100644 winbox-worker/go.mod create mode 100644 winbox-worker/go.sum create mode 100644 winbox-worker/internal/session/manager.go create mode 100644 winbox-worker/internal/session/manager_test.go create mode 100644 winbox-worker/internal/session/pool.go create mode 100644 winbox-worker/internal/session/pool_test.go create mode 100644 winbox-worker/internal/session/types.go create mode 100644 winbox-worker/internal/session/xpra.go diff --git a/.env.example b/.env.example index 5d9a9cd..c6a9b4d 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ # docker compose --profile full build api && docker compose --profile full build poller && docker compose --profile full build frontend # docker compose --profile full up -d # open http://localhost:3000 -# Login: admin@mikrotik-portal.dev / changeme-in-production +# Login: admin@the-other-dude.dev / changeme-in-production # Environment (dev | staging | production) ENVIRONMENT=dev @@ -37,8 +37,13 @@ NATS_URL=nats://nats:4222 
JWT_SECRET_KEY=dev-jwt-secret-do-not-use-in-production-replace-me CREDENTIAL_ENCRYPTION_KEY=LLLjnfBZTSycvL2U07HDSxUeTtLxb9cZzryQl0R9E4w= +# OpenBao unseal key (generated on first run - see init.sh output) +BAO_UNSEAL_KEY= +# OpenBao root token (generated on first run - replaces dev-openbao-token) +# OPENBAO_TOKEN= + # First admin bootstrap (dev only) -FIRST_ADMIN_EMAIL=admin@mikrotik-portal.dev +FIRST_ADMIN_EMAIL=admin@the-other-dude.dev FIRST_ADMIN_PASSWORD=changeme-in-production # CORS (comma-separated origins) diff --git a/.env.staging.example b/.env.staging.example index c79573c..c90349d 100644 --- a/.env.staging.example +++ b/.env.staging.example @@ -30,7 +30,7 @@ JWT_SECRET_KEY=CHANGE_ME_STAGING CREDENTIAL_ENCRYPTION_KEY=CHANGE_ME_STAGING # First admin bootstrap -FIRST_ADMIN_EMAIL=admin@mikrotik-portal.staging +FIRST_ADMIN_EMAIL=admin@the-other-dude.staging FIRST_ADMIN_PASSWORD=CHANGE_ME_STAGING # CORS (staging URL) diff --git a/.gitignore b/.gitignore index 3f4dc20..6dc2ba3 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,7 @@ Thumbs.db # Playwright MCP logs .playwright-mcp/ + +# Local-only planning and design docs +.planning/ +docs/superpowers/ diff --git a/backend/app/config.py b/backend/app/config.py index c346d4a..181ce85 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -22,7 +22,7 @@ KNOWN_INSECURE_DEFAULTS: dict[str, list[str]] = { ], "OPENBAO_TOKEN": [ "dev-openbao-token", - "CHANGE_ME_IN_PRODUCTION", + "", ], } @@ -43,7 +43,8 @@ def validate_production_settings(settings: "Settings") -> None: f"FATAL: {field} uses a known insecure default in '{settings.ENVIRONMENT}' environment.\n" f"Generate a secure value and set it in your .env.prod file.\n" f"For JWT_SECRET_KEY: python -c \"import secrets; print(secrets.token_urlsafe(64))\"\n" - f"For CREDENTIAL_ENCRYPTION_KEY: python -c \"import secrets, base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\"", + f"For CREDENTIAL_ENCRYPTION_KEY: python -c \"import secrets, 
base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\"\n" + f"For OPENBAO_TOKEN: use the token from your OpenBao server (not the dev token)", file=sys.stderr, ) sys.exit(1) @@ -92,7 +93,7 @@ class Settings(BaseSettings): # OpenBao Transit (KMS for per-tenant credential encryption) OPENBAO_ADDR: str = "http://localhost:8200" - OPENBAO_TOKEN: str = "dev-openbao-token" + OPENBAO_TOKEN: str = "" # First admin bootstrap FIRST_ADMIN_EMAIL: Optional[str] = None @@ -119,7 +120,7 @@ class Settings(BaseSettings): SMTP_USER: Optional[str] = None SMTP_PASSWORD: Optional[str] = None SMTP_USE_TLS: bool = False - SMTP_FROM_ADDRESS: str = "noreply@mikrotik-portal.local" + SMTP_FROM_ADDRESS: str = "noreply@the-other-dude.local" # Password reset PASSWORD_RESET_TOKEN_EXPIRE_MINUTES: int = 30 diff --git a/backend/app/main.py b/backend/app/main.py index dfc2520..2ab5ebc 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,7 +1,8 @@ """FastAPI application entry point.""" +import asyncio from contextlib import asynccontextmanager -from typing import AsyncGenerator +from typing import AsyncGenerator, Optional import structlog from fastapi import FastAPI @@ -232,11 +233,80 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: except Exception as exc: logger.warning("retention scheduler could not start (API will run without it)", error=str(exc)) + # Start Remote WinBox session reconciliation loop (60s interval). + # Detects orphaned sessions (worker lost them) and cleans up Redis + tunnels. 
+ winbox_reconcile_task: Optional[asyncio.Task] = None # type: ignore[type-arg] + try: + from app.routers.winbox_remote import _get_redis as _wb_get_redis, _close_tunnel + from app.services.winbox_remote import get_session as _wb_worker_get, health_check as _wb_health + + async def _winbox_reconcile_loop() -> None: + """Scan Redis for winbox-remote:* keys and reconcile with worker.""" + import json as _json + + while True: + try: + await asyncio.sleep(60) + rd = await _wb_get_redis() + cursor = "0" + while True: + cursor, keys = await rd.scan( + cursor=cursor, match="winbox-remote:*", count=100 + ) + for key in keys: + raw = await rd.get(key) + if raw is None: + continue + try: + sess = _json.loads(raw) + except Exception: + await rd.delete(key) + continue + + sess_status = sess.get("status") + if sess_status not in ("creating", "active", "grace"): + continue + + session_id = sess.get("session_id") + if not session_id: + await rd.delete(key) + continue + + # Health-check against worker + worker_info = await _wb_worker_get(session_id) + if worker_info is None: + # Worker lost the session — clean up + logger.warning( + "reconcile: worker lost session %s, cleaning up", + session_id, + ) + tunnel_id = sess.get("tunnel_id") + if tunnel_id: + await _close_tunnel(tunnel_id) + await rd.delete(key) + + if cursor == "0" or cursor == 0: + break + except asyncio.CancelledError: + break + except Exception as exc: + logger.warning("winbox reconcile loop error: %s", exc) + + winbox_reconcile_task = asyncio.create_task(_winbox_reconcile_loop()) + except Exception as exc: + logger.warning("winbox reconcile loop could not start (non-fatal)", error=str(exc)) + logger.info("startup complete, ready to serve requests") yield # Shutdown logger.info("shutting down TOD API") + if winbox_reconcile_task and not winbox_reconcile_task.done(): + winbox_reconcile_task.cancel() + try: + await winbox_reconcile_task + except asyncio.CancelledError: + pass await stop_backup_scheduler() await 
stop_nats_subscriber(nats_connection) await stop_metrics_subscriber(metrics_nc) @@ -311,6 +381,7 @@ def create_app() -> FastAPI: from app.routers.transparency import router as transparency_router from app.routers.settings import router as settings_router from app.routers.remote_access import router as remote_access_router + from app.routers.winbox_remote import router as winbox_remote_router app.include_router(auth_router, prefix="/api") app.include_router(tenants_router, prefix="/api") @@ -339,6 +410,7 @@ def create_app() -> FastAPI: app.include_router(transparency_router, prefix="/api") app.include_router(settings_router, prefix="/api") app.include_router(remote_access_router, prefix="/api") + app.include_router(winbox_remote_router, prefix="/api") # Health check endpoints @app.get("/health", tags=["health"]) diff --git a/backend/app/middleware/tenant_context.py b/backend/app/middleware/tenant_context.py index 438ccae..73cd15c 100644 --- a/backend/app/middleware/tenant_context.py +++ b/backend/app/middleware/tenant_context.py @@ -164,6 +164,67 @@ async def get_current_user( ) +async def get_current_user_ws( + websocket: "WebSocket", +) -> CurrentUser: + """ + WebSocket authentication helper. + + Extracts JWT from the ``access_token`` cookie or ``token`` query parameter, + decodes it, and returns a :class:`CurrentUser`. Unlike :func:`get_current_user` + this does **not** touch the database (no RLS tenant context) because WebSocket + handlers typically manage their own DB sessions. + + Raises: + WebSocketException 1008: If no token is provided or the token is invalid. + """ + from starlette.websockets import WebSocket, WebSocketState + from fastapi import WebSocketException + + # 1. Try cookie + token: Optional[str] = websocket.cookies.get("access_token") + + # 2. 
Fall back to query param + if not token: + token = websocket.query_params.get("token") + + if not token: + raise WebSocketException(code=1008, reason="Not authenticated") + + try: + payload = verify_token(token, expected_type="access") + except HTTPException: + raise WebSocketException(code=1008, reason="Invalid or expired token") + + user_id_str = payload.get("sub") + tenant_id_str = payload.get("tenant_id") + role = payload.get("role") + + if not user_id_str or not role: + raise WebSocketException(code=1008, reason="Invalid token payload") + + try: + user_id = uuid.UUID(user_id_str) + except ValueError: + raise WebSocketException(code=1008, reason="Invalid token payload") + + tenant_id: Optional[uuid.UUID] = None + if tenant_id_str: + try: + tenant_id = uuid.UUID(tenant_id_str) + except ValueError: + pass + + if role != "super_admin" and tenant_id is None: + raise WebSocketException(code=1008, reason="Invalid token: no tenant context") + + return CurrentUser( + user_id=user_id, + tenant_id=tenant_id, + role=role, + ) + + async def get_optional_current_user( request: Request, credentials: Annotated[Optional[HTTPAuthorizationCredentials], Depends(bearer_scheme)] = None, diff --git a/backend/app/routers/auth.py b/backend/app/routers/auth.py index 89ce44b..edb25c8 100644 --- a/backend/app/routers/auth.py +++ b/backend/app/routers/auth.py @@ -817,7 +817,7 @@ async def get_emergency_kit_template( io.BytesIO(pdf_bytes), media_type="application/pdf", headers={ - "Content-Disposition": 'attachment; filename="MikroTik-Portal-Emergency-Kit.pdf"', + "Content-Disposition": 'attachment; filename="The-Other-Dude-Emergency-Kit.pdf"', }, ) diff --git a/backend/app/routers/remote_access.py b/backend/app/routers/remote_access.py index 180ee1f..1ad0d27 100644 --- a/backend/app/routers/remote_access.py +++ b/backend/app/routers/remote_access.py @@ -29,6 +29,7 @@ from app.schemas.remote_access import ( TunnelStatusItem, WinboxSessionResponse, ) +from app.schemas.winbox_remote import 
RemoteWinboxSessionItem from app.middleware.rate_limit import limiter from app.services.audit_service import log_action from sqlalchemy import select @@ -329,4 +330,26 @@ async def list_sessions( logger.warning("tunnel.status.list NATS request failed: %s", exc) # Return empty list rather than error — poller may be unavailable - return ActiveSessionsResponse(winbox_tunnels=tunnels, ssh_sessions=[]) + # Query Redis for remote winbox (browser) sessions for this device + remote_winbox: list[RemoteWinboxSessionItem] = [] + try: + rd = await _get_redis() + pattern = f"winbox-remote:{device_id}:*" + cursor, keys = await rd.scan(0, match=pattern, count=100) + while keys or cursor: + for key in keys: + raw = await rd.get(key) + if raw: + data = json.loads(raw) + remote_winbox.append(RemoteWinboxSessionItem(**data)) + if not cursor: + break + cursor, keys = await rd.scan(cursor, match=pattern, count=100) + except Exception as exc: + logger.warning("Redis winbox-remote scan failed: %s", exc) + + return ActiveSessionsResponse( + winbox_tunnels=tunnels, + ssh_sessions=[], + remote_winbox_sessions=remote_winbox, + ) diff --git a/backend/app/routers/settings.py b/backend/app/routers/settings.py index 850a2ce..99716e2 100644 --- a/backend/app/routers/settings.py +++ b/backend/app/routers/settings.py @@ -7,6 +7,7 @@ Transit encryption for passwords. Falls back to .env values. 
import logging from typing import Optional +import redis.asyncio as aioredis from fastapi import APIRouter, Depends from pydantic import BaseModel from sqlalchemy import text @@ -153,3 +154,20 @@ async def test_smtp( return await send_test_email(data.to, config) return conn_result + + +@router.delete("/winbox-sessions") +async def clear_winbox_sessions(user=Depends(require_role("super_admin"))): + """Clear all WinBox remote session and rate-limit keys from Redis.""" + rd = aioredis.from_url(settings.REDIS_URL, decode_responses=True) + try: + deleted = 0 + for pattern in ["winbox-remote:*", "winbox-remote-rate:*"]: + keys = [] + async for key in rd.scan_iter(match=pattern): + keys.append(key) + if keys: + deleted += await rd.delete(*keys) + return {"status": "ok", "deleted": deleted} + finally: + await rd.aclose() diff --git a/backend/app/routers/winbox_remote.py b/backend/app/routers/winbox_remote.py new file mode 100644 index 0000000..3518514 --- /dev/null +++ b/backend/app/routers/winbox_remote.py @@ -0,0 +1,781 @@ +""" +Remote WinBox (Browser) endpoints — Xpra-based in-browser WinBox sessions. + +All routes are tenant-scoped under /api/tenants/{tenant_id}/devices/{device_id}. +RBAC: operator+ required for all endpoints. 
+""" + +import asyncio +import json +import logging +import uuid +from datetime import datetime, timezone +from typing import Optional + +import httpx +import nats +import redis.asyncio as aioredis +from fastapi import ( + APIRouter, + Depends, + HTTPException, + Request, + WebSocket, + WebSocketDisconnect, + status, +) +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import settings +from app.database import get_db +from app.middleware.rbac import require_operator_or_above +from app.middleware.rate_limit import limiter +from app.middleware.tenant_context import CurrentUser, get_current_user +from app.models.device import Device +from app.schemas.winbox_remote import ( + RemoteWinboxCreateRequest, + RemoteWinboxSessionResponse, + RemoteWinboxState, + RemoteWinboxStatusResponse, + RemoteWinboxTerminateResponse, +) +from app.services.audit_service import log_action +from app.services.winbox_remote import ( + WorkerCapacityError, + WorkerLaunchError, + create_session as worker_create_session, + get_session as worker_get_session, + terminate_session as worker_terminate_session, +) + +logger = logging.getLogger(__name__) + +router = APIRouter(tags=["winbox-remote"]) + +REDIS_PREFIX = "winbox-remote:" +RATE_PREFIX = "winbox-remote-rate:" + +# --------------------------------------------------------------------------- +# Lazy NATS and Redis clients (same pattern as remote_access.py) +# --------------------------------------------------------------------------- + +_nc: Optional[nats.aio.client.Client] = None +_redis: Optional[aioredis.Redis] = None + + +async def _get_nats() -> nats.aio.client.Client: + """Get or create a shared NATS client.""" + global _nc + if _nc is None or _nc.is_closed: + _nc = await nats.connect(settings.NATS_URL) + return _nc + + +async def _get_redis() -> aioredis.Redis: + """Get or create a shared Redis client.""" + global _redis + if _redis is None: + _redis = aioredis.from_url(settings.REDIS_URL, 
decode_responses=True) + return _redis + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _source_ip(request: Request) -> Optional[str]: + return request.headers.get("x-real-ip") or (request.client.host if request.client else None) + + +async def _get_device(db: AsyncSession, tenant_id: uuid.UUID, device_id: uuid.UUID) -> Device: + result = await db.execute( + select(Device).where(Device.id == device_id, Device.tenant_id == tenant_id) + ) + device = result.scalar_one_or_none() + if not device: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Device not found") + return device + + +async def _check_tenant_access( + current_user: CurrentUser, tenant_id: uuid.UUID, db: AsyncSession +) -> None: + if current_user.is_super_admin: + from app.database import set_tenant_context + + await set_tenant_context(db, str(tenant_id)) + return + if current_user.tenant_id != tenant_id: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Access denied: you do not belong to this tenant.", + ) + + +async def _check_rate_limit(user_id: uuid.UUID) -> None: + """Allow max 3 session creates per 5 minutes per user.""" + rd = await _get_redis() + key = f"{RATE_PREFIX}{user_id}" + count = await rd.incr(key) + if count == 1: + await rd.expire(key, 300) + if count > 3: + raise HTTPException( + status_code=status.HTTP_429_TOO_MANY_REQUESTS, + detail="Too many session requests. 
Try again later.", + ) + + +async def _get_session_from_redis(session_id: str) -> Optional[dict]: + rd = await _get_redis() + raw = await rd.get(f"{REDIS_PREFIX}{session_id}") + if raw is None: + return None + return json.loads(raw) + + +async def _save_session_to_redis(session_id: str, data: dict, ttl: int = 14400) -> None: + rd = await _get_redis() + await rd.setex(f"{REDIS_PREFIX}{session_id}", ttl, json.dumps(data, default=str)) + + +async def _delete_session_from_redis(session_id: str) -> None: + rd = await _get_redis() + await rd.delete(f"{REDIS_PREFIX}{session_id}") + + +async def _open_tunnel( + device_id: uuid.UUID, tenant_id: uuid.UUID, user_id: uuid.UUID +) -> dict: + """Open a TCP tunnel to device port 8291 via NATS request-reply.""" + payload = json.dumps({ + "device_id": str(device_id), + "tenant_id": str(tenant_id), + "user_id": str(user_id), + "target_port": 8291, + }).encode() + + try: + nc = await _get_nats() + msg = await nc.request("tunnel.open", payload, timeout=10) + except Exception as exc: + logger.error("NATS tunnel.open failed: %s", exc) + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Tunnel service unavailable", + ) + + try: + data = json.loads(msg.data) + except Exception: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Invalid response from tunnel service", + ) + + if "error" in data: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=data["error"] + ) + + return data + + +async def _close_tunnel(tunnel_id: str) -> None: + """Close a tunnel via NATS — idempotent.""" + try: + nc = await _get_nats() + payload = json.dumps({"tunnel_id": tunnel_id}).encode() + await nc.request("tunnel.close", payload, timeout=10) + except Exception: + pass # Idempotent — tunnel may already be closed + + +# --------------------------------------------------------------------------- +# POST — Create a Remote WinBox (Browser) session +# 
--------------------------------------------------------------------------- + + +@router.post( + "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions", + response_model=RemoteWinboxSessionResponse, + summary="Create a Remote WinBox browser session", + dependencies=[Depends(require_operator_or_above)], +) +@limiter.limit("10/minute") +async def create_winbox_remote_session( + tenant_id: uuid.UUID, + device_id: uuid.UUID, + request: Request, + body: RemoteWinboxCreateRequest = RemoteWinboxCreateRequest(), + current_user: CurrentUser = Depends(get_current_user), + db: AsyncSession = Depends(get_db), +) -> RemoteWinboxSessionResponse: + """ + Create an Xpra-based WinBox session accessible via WebSocket in the browser. + + Flow: auth -> tenant check -> device exists -> duplicate check -> rate limit -> + credential decrypt -> tunnel open -> worker create -> Redis save -> audit log. + Full rollback on failure. + """ + await _check_tenant_access(current_user, tenant_id, db) + device = await _get_device(db, tenant_id, device_id) + source_ip = _source_ip(request) + + # Check for duplicate active session for this user+device + rd = await _get_redis() + cursor = "0" + while True: + cursor, keys = await rd.scan(cursor=cursor, match=f"{REDIS_PREFIX}*", count=100) + for key in keys: + raw = await rd.get(key) + if raw is None: + continue + try: + sess = json.loads(raw) + except Exception: + continue + if ( + sess.get("device_id") == str(device_id) + and sess.get("user_id") == str(current_user.user_id) + and sess.get("status") in ("creating", "active", "grace") + ): + # Verify the worker actually has this session — if not, clean up + # the stale Redis entry instead of blocking the user. 
+ stale_sid = sess.get("session_id", "") + try: + worker_info = await worker_get_session(stale_sid) + except Exception: + worker_info = None + if worker_info is None: + logger.warning( + "Cleaning stale Redis session %s (worker 404)", stale_sid + ) + tunnel_id = sess.get("tunnel_id") + if tunnel_id: + await _close_tunnel(tunnel_id) + await _delete_session_from_redis(stale_sid) + continue + raise HTTPException( + status_code=status.HTTP_409_CONFLICT, + detail="Active session already exists for this device", + ) + if cursor == "0" or cursor == 0: + break + + # Rate limit + await _check_rate_limit(current_user.user_id) + + # Decrypt device credentials + try: + from app.services.crypto import decrypt_credentials_hybrid + + creds_json = await decrypt_credentials_hybrid( + transit_ciphertext=device.encrypted_credentials_transit, + legacy_ciphertext=device.encrypted_credentials, + tenant_id=str(tenant_id), + legacy_key=settings.get_encryption_key_bytes(), + ) + creds = json.loads(creds_json) + username = creds.get("username", "") + password = creds.get("password", "") + except Exception as exc: + logger.error("Failed to decrypt credentials for device %s: %s", device_id, exc) + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Unable to retrieve device credentials", + ) + + # Open tunnel to device + tunnel_data = None + session_id = str(uuid.uuid4()) + now = datetime.now(timezone.utc) + + try: + tunnel_data = await _open_tunnel(device_id, tenant_id, current_user.user_id) + tunnel_id = tunnel_data.get("tunnel_id", "") + tunnel_port = tunnel_data.get("local_port") + + if not isinstance(tunnel_port, int) or not (49000 <= tunnel_port <= 49100): + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Invalid port allocation from tunnel service", + ) + + # Create session on worker + # Tunnel listener runs on the poller container, reachable via Docker DNS + try: + worker_resp = await worker_create_session( + 
session_id=session_id, + device_ip="tod_poller", + device_port=tunnel_port, + username=username, + password=password, + idle_timeout_seconds=body.idle_timeout_seconds, + max_lifetime_seconds=body.max_lifetime_seconds, + ) + except WorkerCapacityError: + await _close_tunnel(tunnel_id) + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="No capacity for new sessions", + ) + except WorkerLaunchError as exc: + await _close_tunnel(tunnel_id) + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail=f"Session launch failed: {exc}", + ) + finally: + # Zero credentials + username = "" # noqa: F841 + password = "" # noqa: F841 + + expires_at = datetime.fromisoformat( + worker_resp.get("expires_at", now.isoformat()) + ) + max_expires_at = datetime.fromisoformat( + worker_resp.get("max_expires_at", now.isoformat()) + ) + + # Save session to Redis + session_data = { + "session_id": session_id, + "tenant_id": str(tenant_id), + "device_id": str(device_id), + "user_id": str(current_user.user_id), + "tunnel_id": tunnel_id, + "tunnel_port": tunnel_port, + "status": RemoteWinboxState.active.value, + "created_at": now.isoformat(), + "expires_at": expires_at.isoformat(), + "max_expires_at": max_expires_at.isoformat(), + "idle_timeout_seconds": body.idle_timeout_seconds, + "max_lifetime_seconds": body.max_lifetime_seconds, + "xpra_ws_port": worker_resp.get("xpra_ws_port"), + } + await _save_session_to_redis(session_id, session_data, ttl=body.max_lifetime_seconds + 60) + + # Audit log (fire-and-forget) + try: + await log_action( + db, + tenant_id, + current_user.user_id, + "winbox_remote_session_create", + resource_type="device", + resource_id=str(device_id), + device_id=device_id, + details={"session_id": session_id, "source_ip": source_ip}, + ip_address=source_ip, + ) + except Exception: + pass + + ws_path = ( + f"/api/tenants/{tenant_id}/devices/{device_id}" + f"/winbox-remote-sessions/{session_id}/ws" + ) + + return 
RemoteWinboxSessionResponse( + session_id=uuid.UUID(session_id), + websocket_path=ws_path, + expires_at=expires_at, + max_expires_at=max_expires_at, + idle_timeout_seconds=body.idle_timeout_seconds, + max_lifetime_seconds=body.max_lifetime_seconds, + xpra_ws_port=worker_resp.get("xpra_ws_port"), + ) + + except HTTPException: + raise + except Exception as exc: + # Full rollback + logger.error("Unexpected error creating winbox remote session: %s", exc) + if tunnel_data and tunnel_data.get("tunnel_id"): + await _close_tunnel(tunnel_data["tunnel_id"]) + await _delete_session_from_redis(session_id) + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Session creation failed", + ) + + +# --------------------------------------------------------------------------- +# GET — Session status +# --------------------------------------------------------------------------- + + +@router.get( + "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}", + response_model=RemoteWinboxStatusResponse, + summary="Get Remote WinBox session status", + dependencies=[Depends(require_operator_or_above)], +) +async def get_winbox_remote_session( + tenant_id: uuid.UUID, + device_id: uuid.UUID, + session_id: uuid.UUID, + current_user: CurrentUser = Depends(get_current_user), + db: AsyncSession = Depends(get_db), +) -> RemoteWinboxStatusResponse: + await _check_tenant_access(current_user, tenant_id, db) + + sess = await _get_session_from_redis(str(session_id)) + if sess is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail="Session not found" + ) + + if sess.get("tenant_id") != str(tenant_id) or sess.get("device_id") != str(device_id): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail="Session not found" + ) + + return RemoteWinboxStatusResponse( + session_id=uuid.UUID(sess["session_id"]), + status=RemoteWinboxState(sess.get("status", "active")), + 
created_at=datetime.fromisoformat(sess["created_at"]), + expires_at=datetime.fromisoformat(sess["expires_at"]), + max_expires_at=datetime.fromisoformat(sess["max_expires_at"]), + idle_timeout_seconds=sess.get("idle_timeout_seconds", 600), + max_lifetime_seconds=sess.get("max_lifetime_seconds", 7200), + xpra_ws_port=sess.get("xpra_ws_port"), + ) + + +# --------------------------------------------------------------------------- +# GET — List sessions for a device +# --------------------------------------------------------------------------- + + +@router.get( + "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions", + response_model=list[RemoteWinboxStatusResponse], + summary="List Remote WinBox sessions for a device", + dependencies=[Depends(require_operator_or_above)], +) +async def list_winbox_remote_sessions( + tenant_id: uuid.UUID, + device_id: uuid.UUID, + current_user: CurrentUser = Depends(get_current_user), + db: AsyncSession = Depends(get_db), +) -> list[RemoteWinboxStatusResponse]: + await _check_tenant_access(current_user, tenant_id, db) + + sessions = [] + rd = await _get_redis() + cursor = "0" + while True: + cursor, keys = await rd.scan(cursor=cursor, match=f"{REDIS_PREFIX}*", count=100) + for key in keys: + raw = await rd.get(key) + if raw is None: + continue + try: + sess = json.loads(raw) + except Exception: + continue + if ( + sess.get("tenant_id") == str(tenant_id) + and sess.get("device_id") == str(device_id) + ): + sessions.append( + RemoteWinboxStatusResponse( + session_id=uuid.UUID(sess["session_id"]), + status=RemoteWinboxState(sess.get("status", "active")), + created_at=datetime.fromisoformat(sess["created_at"]), + expires_at=datetime.fromisoformat(sess["expires_at"]), + max_expires_at=datetime.fromisoformat(sess["max_expires_at"]), + idle_timeout_seconds=sess.get("idle_timeout_seconds", 600), + max_lifetime_seconds=sess.get("max_lifetime_seconds", 7200), + xpra_ws_port=sess.get("xpra_ws_port"), + ) + ) + if cursor == "0" or cursor 
== 0: + break + + return sessions + + +# --------------------------------------------------------------------------- +# DELETE — Terminate session (idempotent) +# --------------------------------------------------------------------------- + + +@router.delete( + "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}", + response_model=RemoteWinboxTerminateResponse, + summary="Terminate a Remote WinBox session", + dependencies=[Depends(require_operator_or_above)], +) +async def terminate_winbox_remote_session( + tenant_id: uuid.UUID, + device_id: uuid.UUID, + session_id: uuid.UUID, + request: Request, + current_user: CurrentUser = Depends(get_current_user), + db: AsyncSession = Depends(get_db), +) -> RemoteWinboxTerminateResponse: + await _check_tenant_access(current_user, tenant_id, db) + source_ip = _source_ip(request) + + sess = await _get_session_from_redis(str(session_id)) + + # Idempotent — if already gone, return terminated + if sess is None: + return RemoteWinboxTerminateResponse( + session_id=session_id, + status=RemoteWinboxState.terminated, + reason="Session already terminated or not found", + ) + + if sess.get("tenant_id") != str(tenant_id): + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail="Session not found" + ) + + # Rollback order: worker -> tunnel -> redis -> audit + await worker_terminate_session(str(session_id)) + + tunnel_id = sess.get("tunnel_id") + if tunnel_id: + await _close_tunnel(tunnel_id) + + await _delete_session_from_redis(str(session_id)) + + try: + await log_action( + db, + tenant_id, + current_user.user_id, + "winbox_remote_session_terminate", + resource_type="device", + resource_id=str(device_id), + device_id=device_id, + details={"session_id": str(session_id), "source_ip": source_ip}, + ip_address=source_ip, + ) + except Exception: + pass + + return RemoteWinboxTerminateResponse( + session_id=session_id, + status=RemoteWinboxState.terminated, + reason="Terminated by user", + ) + + +# 
--------------------------------------------------------------------------- +# HTTP Proxy — Serve Xpra HTML5 client files from worker +# --------------------------------------------------------------------------- + + +@router.get( + "/tenants/{tenant_id}/devices/{device_id}" + "/winbox-remote-sessions/{session_id}/xpra/{path:path}", + summary="Proxy Xpra HTML5 client files", + dependencies=[Depends(require_operator_or_above)], +) +@router.get( + "/tenants/{tenant_id}/devices/{device_id}" + "/winbox-remote-sessions/{session_id}/xpra", + summary="Proxy Xpra HTML5 client (root)", + dependencies=[Depends(require_operator_or_above)], +) +async def proxy_xpra_html( + tenant_id: uuid.UUID, + device_id: uuid.UUID, + session_id: uuid.UUID, + request: Request, + path: str = "", + current_user: CurrentUser = Depends(get_current_user), + db: AsyncSession = Depends(get_db), +) -> None: + """Reverse-proxy HTTP requests to the Xpra HTML5 server inside the worker.""" + from starlette.responses import Response + + await _check_tenant_access(current_user, tenant_id, db) + + sess = await _get_session_from_redis(str(session_id)) + if sess is None: + raise HTTPException(status_code=404, detail="Session not found") + if sess.get("tenant_id") != str(tenant_id) or sess.get("device_id") != str(device_id): + raise HTTPException(status_code=404, detail="Session not found") + + xpra_ws_port = sess.get("xpra_ws_port") + if not xpra_ws_port: + raise HTTPException(status_code=503, detail="Xpra port unavailable") + + # Proxy the request to Xpra's built-in HTTP server + target_url = f"http://tod_winbox_worker:{xpra_ws_port}/{path}" + try: + async with httpx.AsyncClient(timeout=httpx.Timeout(10.0)) as client: + proxy_resp = await client.get( + target_url, + params=dict(request.query_params), + ) + except Exception as exc: + logger.error("Xpra HTTP proxy error: %s", exc) + raise HTTPException(status_code=502, detail="Xpra server unreachable") + + # Forward the response with correct content type + 
return Response( + content=proxy_resp.content, + status_code=proxy_resp.status_code, + headers={ + k: v for k, v in proxy_resp.headers.items() + if k.lower() in ("content-type", "cache-control", "content-encoding") + }, + ) + + +# --------------------------------------------------------------------------- +# WebSocket — Proxy browser <-> Xpra worker +# --------------------------------------------------------------------------- + + +@router.websocket( + "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}/ws" +) +async def winbox_remote_ws_proxy( + websocket: WebSocket, + tenant_id: uuid.UUID, + device_id: uuid.UUID, + session_id: uuid.UUID, +) -> None: + """ + Bidirectional WebSocket proxy between the browser and the worker's Xpra + WebSocket. Authentication via access_token cookie or query param. + + 1. Authenticate via cookie/query param token + 2. Validate session in Redis (ownership, status, expiry) + 3. Resolve Xpra WebSocket port from worker + 4. Accept browser WebSocket upgrade + 5. 
Proxy bidirectionally until close + """ + # --- Auth: extract token from cookie or query param --- + token = websocket.cookies.get("access_token") or websocket.query_params.get("token") + if not token: + await websocket.close(code=4001, reason="Authentication required") + return + + from app.services.auth import verify_token + + try: + payload = verify_token(token, expected_type="access") + except Exception: + await websocket.close(code=4001, reason="Invalid token") + return + + user_id_str = payload.get("sub") + user_tenant_str = payload.get("tenant_id") + role = payload.get("role") + + if not user_id_str or not role: + await websocket.close(code=4001, reason="Invalid token payload") + return + + # Tenant access check + if role != "super_admin": + if user_tenant_str != str(tenant_id): + await websocket.close(code=4003, reason="Tenant access denied") + return + + # --- Session validation --- + sess = await _get_session_from_redis(str(session_id)) + if sess is None: + await websocket.close(code=4004, reason="Session not found") + return + + if sess.get("tenant_id") != str(tenant_id) or sess.get("device_id") != str(device_id): + await websocket.close(code=4004, reason="Session not found") + return + + # Ownership check: user must own the session (or be super_admin) + if role != "super_admin" and sess.get("user_id") != user_id_str: + await websocket.close(code=4003, reason="Not your session") + return + + sess_status = sess.get("status") + if sess_status not in ("active", "grace"): + await websocket.close(code=4004, reason=f"Session not active (status={sess_status})") + return + + # Check max expiry + max_expires = datetime.fromisoformat(sess["max_expires_at"]) + if datetime.now(timezone.utc) > max_expires: + await websocket.close(code=4004, reason="Session expired") + return + + # Resolve Xpra WebSocket port from worker + xpra_ws_port = sess.get("xpra_ws_port") + if not xpra_ws_port: + worker_info = await worker_get_session(str(session_id)) + if not worker_info: + 
await websocket.close(code=4004, reason="Worker session not found") + return + xpra_ws_port = worker_info.get("xpra_ws_port") or worker_info.get("ws_port") + + if not xpra_ws_port: + await websocket.close(code=4004, reason="Xpra port unavailable") + return + + # Update last_client_connect_at in Redis + sess["last_client_connect_at"] = datetime.now(timezone.utc).isoformat() + try: + await _save_session_to_redis(str(session_id), sess) + except Exception: + pass + + # Accept browser WebSocket + await websocket.accept() + + # Connect to worker Xpra WebSocket + import websockets + + worker_ws_url = f"ws://tod_winbox_worker:{xpra_ws_port}" + + try: + async with websockets.connect(worker_ws_url) as worker_ws: + + async def browser_to_worker() -> None: + try: + while True: + data = await websocket.receive_bytes() + await worker_ws.send(data) + except WebSocketDisconnect: + pass + except Exception: + pass + + async def worker_to_browser() -> None: + try: + async for message in worker_ws: + if isinstance(message, bytes): + await websocket.send_bytes(message) + else: + await websocket.send_text(message) + except Exception: + pass + + # Run both directions concurrently + done, pending = await asyncio.wait( + [ + asyncio.create_task(browser_to_worker()), + asyncio.create_task(worker_to_browser()), + ], + return_when=asyncio.FIRST_COMPLETED, + ) + for task in pending: + task.cancel() + + except Exception as exc: + logger.warning("WebSocket proxy error for session %s: %s", session_id, exc) + finally: + try: + await websocket.close() + except Exception: + pass diff --git a/backend/app/schemas/remote_access.py b/backend/app/schemas/remote_access.py index dda16aa..c8ae53a 100644 --- a/backend/app/schemas/remote_access.py +++ b/backend/app/schemas/remote_access.py @@ -1,5 +1,7 @@ from pydantic import BaseModel, Field +from app.schemas.winbox_remote import RemoteWinboxSessionItem + class WinboxSessionResponse(BaseModel): tunnel_id: str @@ -37,3 +39,4 @@ class 
SSHSessionStatusItem(BaseModel): class ActiveSessionsResponse(BaseModel): winbox_tunnels: list[TunnelStatusItem] = [] ssh_sessions: list[SSHSessionStatusItem] = [] + remote_winbox_sessions: list[RemoteWinboxSessionItem] = [] diff --git a/backend/app/schemas/winbox_remote.py b/backend/app/schemas/winbox_remote.py new file mode 100644 index 0000000..2e10331 --- /dev/null +++ b/backend/app/schemas/winbox_remote.py @@ -0,0 +1,63 @@ +"""Request/response schemas for Remote WinBox (Browser) sessions.""" + +import uuid +from datetime import datetime +from enum import Enum +from typing import Optional + +from pydantic import BaseModel, Field + + +class RemoteWinboxState(str, Enum): + creating = "creating" + active = "active" + grace = "grace" + terminating = "terminating" + terminated = "terminated" + failed = "failed" + + +class RemoteWinboxCreateRequest(BaseModel): + idle_timeout_seconds: int = Field(default=600, ge=60, le=3600) + max_lifetime_seconds: int = Field(default=7200, ge=300, le=14400) + + +class RemoteWinboxSessionResponse(BaseModel): + session_id: uuid.UUID + status: RemoteWinboxState = RemoteWinboxState.active + websocket_path: str + expires_at: datetime + max_expires_at: datetime + idle_timeout_seconds: int + max_lifetime_seconds: int + xpra_ws_port: Optional[int] = None + + +class RemoteWinboxStatusResponse(BaseModel): + session_id: uuid.UUID + status: RemoteWinboxState + created_at: datetime + expires_at: datetime + max_expires_at: datetime + idle_timeout_seconds: int + max_lifetime_seconds: int + xpra_ws_port: Optional[int] = None + + +class RemoteWinboxTerminateResponse(BaseModel): + session_id: uuid.UUID + status: RemoteWinboxState + reason: str + + +class RemoteWinboxDuplicateDetail(BaseModel): + detail: str = "Active session exists" + session: RemoteWinboxStatusResponse + + +class RemoteWinboxSessionItem(BaseModel): + """Used in the combined active sessions list.""" + session_id: uuid.UUID + status: RemoteWinboxState + created_at: datetime + 
expires_at: datetime diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 0f1f31b..e4e0b76 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -120,7 +120,7 @@ async def _send_email(channel: dict, alert_event: dict, device_hostname: str) -> user=channel.get("smtp_user"), password=smtp_password, use_tls=channel.get("smtp_use_tls", False), - from_address=channel.get("from_address") or "alerts@mikrotik-portal.local", + from_address=channel.get("from_address") or "alerts@the-other-dude.local", ) to = channel.get("to_address") diff --git a/backend/app/services/restore_service.py b/backend/app/services/restore_service.py index f21b934..51287b8 100644 --- a/backend/app/services/restore_service.py +++ b/backend/app/services/restore_service.py @@ -43,7 +43,7 @@ from app.services.push_tracker import record_push, clear_push logger = logging.getLogger(__name__) # Name of the panic-revert scheduler installed on the RouterOS device -_PANIC_REVERT_SCHEDULER = "mikrotik-portal-panic-revert" +_PANIC_REVERT_SCHEDULER = "the-other-dude-panic-revert" # Name of the pre-push binary backup saved on device flash _PRE_PUSH_BACKUP = "portal-pre-push" # Name of the RSC file used for /import on device diff --git a/backend/app/services/template_service.py b/backend/app/services/template_service.py index 8032f69..bbb02f4 100644 --- a/backend/app/services/template_service.py +++ b/backend/app/services/template_service.py @@ -35,7 +35,7 @@ logger = logging.getLogger(__name__) _env = SandboxedEnvironment() # Names used on the RouterOS device during template push -_PANIC_REVERT_SCHEDULER = "mikrotik-portal-template-revert" +_PANIC_REVERT_SCHEDULER = "the-other-dude-template-revert" _PRE_PUSH_BACKUP = "portal-template-pre-push" _TEMPLATE_RSC = "portal-template.rsc" diff --git a/backend/app/services/winbox_remote.py b/backend/app/services/winbox_remote.py new file mode 100644 index 
0000000..269abc7 --- /dev/null +++ b/backend/app/services/winbox_remote.py @@ -0,0 +1,126 @@ +"""HTTP client for the winbox-worker container. + +Provides async helpers to create, terminate, query, and health-check +Remote WinBox (Xpra) sessions running inside the worker container. +All communication uses the internal Docker network. +""" + +import logging +from typing import Any, Optional + +import httpx + +logger = logging.getLogger(__name__) + +WORKER_BASE_URL = "http://tod_winbox_worker:9090" +_HEADERS = {"X-Internal-Service": "api"} +_TIMEOUT = httpx.Timeout(15.0, connect=5.0) + + +class WorkerCapacityError(Exception): + """Worker has no capacity for new sessions.""" + + +class WorkerLaunchError(Exception): + """Worker failed to launch a session.""" + + +async def create_session( + session_id: str, + device_ip: str, + device_port: int, + username: str, + password: str, + idle_timeout_seconds: int, + max_lifetime_seconds: int, +) -> dict[str, Any]: + """POST /sessions — ask the worker to launch an Xpra+WinBox session. + + Credentials are zeroed from locals after the request is sent. + Raises WorkerCapacityError (503) or WorkerLaunchError on failure. 
+ """ + payload = { + "session_id": session_id, + "tunnel_host": device_ip, + "tunnel_port": device_port, + "username": username, + "password": password, + "idle_timeout_seconds": idle_timeout_seconds, + "max_lifetime_seconds": max_lifetime_seconds, + } + try: + async with httpx.AsyncClient( + base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=_TIMEOUT + ) as client: + resp = await client.post("/sessions", json=payload) + finally: + # Zero credentials in the payload dict + payload["username"] = "" + payload["password"] = "" + del username, password # noqa: F821 — local unbind + + if resp.status_code == 503: + raise WorkerCapacityError(resp.text) + if resp.status_code >= 400: + raise WorkerLaunchError(f"Worker returned {resp.status_code}: {resp.text}") + + return resp.json() + + +async def terminate_session(session_id: str) -> bool: + """DELETE /sessions/{session_id} — idempotent (ignores 404). + + Returns True if the worker acknowledged termination, False if 404. + """ + async with httpx.AsyncClient( + base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=_TIMEOUT + ) as client: + resp = await client.delete(f"/sessions/{session_id}") + + if resp.status_code == 404: + return False + if resp.status_code >= 400: + logger.error("Worker terminate error %s: %s", resp.status_code, resp.text) + return False + return True + + +async def get_session(session_id: str) -> Optional[dict[str, Any]]: + """GET /sessions/{session_id} — returns None if 404.""" + async with httpx.AsyncClient( + base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=_TIMEOUT + ) as client: + resp = await client.get(f"/sessions/{session_id}") + + if resp.status_code == 404: + return None + if resp.status_code >= 400: + logger.error("Worker get_session error %s: %s", resp.status_code, resp.text) + return None + return resp.json() + + +async def list_sessions() -> list[dict[str, Any]]: + """GET /sessions — return all sessions known to the worker.""" + async with httpx.AsyncClient( + base_url=WORKER_BASE_URL, 
headers=_HEADERS, timeout=_TIMEOUT + ) as client: + resp = await client.get("/sessions") + + if resp.status_code >= 400: + logger.error("Worker list_sessions error %s: %s", resp.status_code, resp.text) + return [] + data = resp.json() + return data if isinstance(data, list) else [] + + +async def health_check() -> bool: + """GET /healthz — returns True if the worker is healthy.""" + try: + async with httpx.AsyncClient( + base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=httpx.Timeout(5.0) + ) as client: + resp = await client.get("/healthz") + return resp.status_code == 200 + except Exception: + return False diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 5742475..f052c89 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -3,7 +3,7 @@ requires = ["hatchling"] build-backend = "hatchling.build" [project] -name = "mikrotik-portal-backend" +name = "the-other-dude-backend" version = "9.0.1" description = "MikroTik Fleet Management Portal - Backend API" requires-python = ">=3.12" diff --git a/backend/tests/test_push_recovery.py b/backend/tests/test_push_recovery.py index 62aad3e..dfa148f 100644 --- a/backend/tests/test_push_recovery.py +++ b/backend/tests/test_push_recovery.py @@ -16,7 +16,7 @@ async def test_recovery_commits_reachable_device_with_scheduler(): push_op.device_id = uuid4() push_op.tenant_id = uuid4() push_op.status = "pending_verification" - push_op.scheduler_name = "mikrotik-portal-panic-revert" + push_op.scheduler_name = "the-other-dude-panic-revert" push_op.started_at = datetime.now(timezone.utc) - timedelta(minutes=10) device = MagicMock() @@ -71,7 +71,7 @@ async def test_recovery_marks_unreachable_device_failed(): push_op.device_id = uuid4() push_op.tenant_id = uuid4() push_op.status = "pending_verification" - push_op.scheduler_name = "mikrotik-portal-panic-revert" + push_op.scheduler_name = "the-other-dude-panic-revert" push_op.started_at = datetime.now(timezone.utc) - timedelta(minutes=10) device = MagicMock() 
diff --git a/docker-compose.override.yml b/docker-compose.override.yml index 0e7c421..59e7a9b 100644 --- a/docker-compose.override.yml +++ b/docker-compose.override.yml @@ -21,7 +21,7 @@ services: APP_USER_DATABASE_URL: postgresql+asyncpg://app_user:app_password@postgres:5432/mikrotik REDIS_URL: redis://redis:6379/0 NATS_URL: nats://nats:4222 - FIRST_ADMIN_EMAIL: ${FIRST_ADMIN_EMAIL:-admin@mikrotik-portal.dev} + FIRST_ADMIN_EMAIL: ${FIRST_ADMIN_EMAIL:-admin@the-other-dude.dev} FIRST_ADMIN_PASSWORD: ${FIRST_ADMIN_PASSWORD:-changeme-in-production} CREDENTIAL_ENCRYPTION_KEY: ${CREDENTIAL_ENCRYPTION_KEY:?Set CREDENTIAL_ENCRYPTION_KEY in .env} JWT_SECRET_KEY: ${JWT_SECRET_KEY:?Set JWT_SECRET_KEY in .env} @@ -62,6 +62,7 @@ services: memory: 512M networks: - tod + - tod_remote_worker poller: build: @@ -115,6 +116,17 @@ services: memory: 256M networks: - tod + - tod_remote_worker + + winbox-worker: + environment: + LOG_LEVEL: debug + MAX_CONCURRENT_SESSIONS: 5 + deploy: + resources: + limits: + memory: 512M + restart: on-failure frontend: build: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 6f0193d..2ee189d 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -12,9 +12,27 @@ services: ENVIRONMENT: production LOG_LEVEL: info GUNICORN_WORKERS: "2" - command: ["gunicorn", "app.main:app", "--config", "gunicorn.conf.py"] + WIREGUARD_CONFIG_PATH: /data/wireguard + WIREGUARD_GATEWAY: wireguard + cap_add: + - NET_ADMIN + user: root + command: > + sh -c " + if [ -n \"$$WIREGUARD_GATEWAY\" ]; then + apt-get update -qq && apt-get install -y -qq iproute2 >/dev/null 2>&1 || true; + GW_IP=$$(getent hosts $$WIREGUARD_GATEWAY 2>/dev/null | awk '{print $$1}'); + [ -z \"$$GW_IP\" ] && GW_IP=$$WIREGUARD_GATEWAY; + ip route add 10.10.0.0/16 via $$GW_IP 2>/dev/null || true; + echo VPN route: 10.10.0.0/16 via $$GW_IP; + fi; + exec su -s /bin/sh appuser -c 'gunicorn app.main:app --config gunicorn.conf.py' + " + ports: + - "8001:8000" volumes: - 
./docker-data/git-store:/data/git-store + - ./docker-data/wireguard:/data/wireguard depends_on: postgres: condition: service_healthy @@ -22,6 +40,8 @@ services: condition: service_healthy nats: condition: service_healthy + openbao: + condition: service_healthy deploy: resources: limits: @@ -34,6 +54,7 @@ services: max-file: "3" networks: - tod + - tod_remote_worker poller: build: @@ -44,6 +65,7 @@ services: environment: ENVIRONMENT: production LOG_LEVEL: info + DATABASE_URL: postgres://poller_user:poller_password@postgres:5432/mikrotik TUNNEL_PORT_MIN: 49000 TUNNEL_PORT_MAX: 49100 TUNNEL_IDLE_TIMEOUT: 300 @@ -65,6 +87,8 @@ services: condition: service_healthy nats: condition: service_healthy + openbao: + condition: service_healthy healthcheck: test: ["CMD-SHELL", "wget --spider -q http://localhost:8080/healthz || exit 1"] interval: 30s @@ -82,6 +106,32 @@ services: max-file: "3" networks: - tod + - tod_remote_worker + + openbao: + env_file: .env.prod + environment: + BAO_ADDR: "http://127.0.0.1:8200" + BAO_UNSEAL_KEY: "${BAO_UNSEAL_KEY}" + BAO_TOKEN: "${OPENBAO_TOKEN}" + ports: [] + restart: unless-stopped + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + + winbox-worker: + environment: + LOG_LEVEL: info + MAX_CONCURRENT_SESSIONS: 10 + logging: + driver: json-file + options: + max-size: "10m" + max-file: "3" + restart: unless-stopped frontend: build: @@ -89,7 +139,7 @@ services: dockerfile: infrastructure/docker/Dockerfile.frontend container_name: tod_frontend ports: - - "80:80" + - "3000:80" depends_on: - api deploy: diff --git a/docker-compose.yml b/docker-compose.yml index de3b8da..2927d44 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -87,28 +87,28 @@ services: command: - -c - | - # Start OpenBao in background - bao server -dev -dev-listen-address=0.0.0.0:8200 & + bao server -config=/etc/openbao/config.hcl & BAO_PID=$$! 
- # Wait for ready and run init sleep 2 /init/init.sh - # Wait for OpenBao process wait $$BAO_PID environment: - BAO_DEV_ROOT_TOKEN_ID: dev-openbao-token - BAO_DEV_LISTEN_ADDRESS: "0.0.0.0:8200" + BAO_ADDR: "http://127.0.0.1:8200" + BAO_UNSEAL_KEY: "${BAO_UNSEAL_KEY:-}" + BAO_TOKEN: "${OPENBAO_TOKEN:-}" ports: - "8200:8200" volumes: + - ./infrastructure/openbao/config.hcl:/etc/openbao/config.hcl:ro - ./infrastructure/openbao/init.sh:/init/init.sh:ro - cap_add: - - IPC_LOCK + - openbao_data:/openbao/data healthcheck: test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:8200/v1/sys/health | grep -q '\"sealed\":false' || exit 1"] interval: 5s timeout: 3s - retries: 5 + retries: 12 + start_period: 30s + restart: unless-stopped deploy: resources: limits: @@ -159,6 +159,33 @@ services: limits: memory: 64M + winbox-worker: + build: + context: ./winbox-worker + platform: linux/amd64 + container_name: tod_winbox_worker + environment: + IDLE_TIMEOUT: 600 + MAX_LIFETIME: 7200 + MAX_CONCURRENT_SESSIONS: 10 + LOG_LEVEL: info + XDG_RUNTIME_DIR: /run/user/1001 + ports: + - "10100-10119:10100-10119" + deploy: + resources: + limits: + memory: 1G + networks: + - tod + - tod_remote_worker + +volumes: + openbao_data: + networks: tod: driver: bridge + tod_remote_worker: + driver: bridge + internal: true diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index a5a37cb..8ec5e97 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -71,7 +71,7 @@ TOD uses Pydantic Settings for configuration. 
All values can be set via environm | `SMTP_USER` | *(none)* | SMTP authentication username | | `SMTP_PASSWORD` | *(none)* | SMTP authentication password | | `SMTP_USE_TLS` | `false` | Enable STARTTLS for SMTP connections | -| `SMTP_FROM_ADDRESS` | `noreply@mikrotik-portal.local` | Sender address for outbound emails | +| `SMTP_FROM_ADDRESS` | `noreply@the-other-dude.local` | Sender address for outbound emails | ### Firmware diff --git a/docs/superpowers/plans/2026-03-12-remote-access.md b/docs/superpowers/plans/2026-03-12-remote-access.md deleted file mode 100644 index d94b68b..0000000 --- a/docs/superpowers/plans/2026-03-12-remote-access.md +++ /dev/null @@ -1,2704 +0,0 @@ -# Remote Access Implementation Plan — WinBox Tunnels + SSH Terminal (v9.5) - -> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add remote WinBox TCP tunnels and browser-based SSH terminal access to RouterOS devices through the TOD controller. - -**Architecture:** Poller gains two new packages: `tunnel/` (TCP proxy for WinBox on ports 49000-49100) and `sshrelay/` (WebSocket-to-SSH bridge via internal HTTP server on :8080). API issues session tokens and enforces RBAC. Frontend adds WinBox button and xterm.js terminal component. - -**Tech Stack:** Go 1.24, `golang.org/x/crypto/ssh`, `nhooyr.io/websocket`, Python/FastAPI, React, `@xterm/xterm` v5 - -**Spec:** `docs/superpowers/specs/2026-03-12-remote-access-design.md` - -**Parallelization:** Chunks 1-3 (Go poller packages) can run in parallel with Chunk 4 (Python API) and Chunk 6 (infrastructure). Chunk 5 (frontend) depends on Chunks 3 and 4 completing. 
- ---- - -## Chunk 1: Poller — Port Pool & Tunnel Manager Core - -### Task 1.1: Add WebSocket dependency to Go module - -**Files:** -- Modify: `poller/go.mod` - -- [ ] **Step 1: Add dependencies** - -```bash -cd poller && go get nhooyr.io/websocket@latest && go get github.com/google/uuid@latest -``` - -Note: `github.com/google/uuid` is already in go.mod. `nhooyr.io/websocket` is new — needed for SSH relay in Chunk 3. - -- [ ] **Step 2: Tidy** - -```bash -cd poller && go mod tidy -``` - -- [ ] **Step 3: Commit** - -```bash -git add poller/go.mod poller/go.sum -git commit -m "chore(poller): add websocket dependency for remote access" -``` - -### Task 1.2: Port Pool - -**Files:** -- Create: `poller/internal/tunnel/portpool.go` -- Create: `poller/internal/tunnel/portpool_test.go` - -- [ ] **Step 1: Write failing tests** - -```go -// poller/internal/tunnel/portpool_test.go -package tunnel - -import ( - "net" - "sync" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestPortPool_Allocate(t *testing.T) { - pp := NewPortPool(49000, 49002) // 3 ports: 49000, 49001, 49002 - p1, err := pp.Allocate() - require.NoError(t, err) - assert.GreaterOrEqual(t, p1, 49000) - assert.LessOrEqual(t, p1, 49002) -} - -func TestPortPool_AllocateAll(t *testing.T) { - pp := NewPortPool(49000, 49002) - ports := make(map[int]bool) - for i := 0; i < 3; i++ { - p, err := pp.Allocate() - require.NoError(t, err) - ports[p] = true - } - assert.Len(t, ports, 3) -} - -func TestPortPool_Exhausted(t *testing.T) { - pp := NewPortPool(49000, 49001) - _, _ = pp.Allocate() - _, _ = pp.Allocate() - _, err := pp.Allocate() - assert.Error(t, err) - assert.Contains(t, err.Error(), "no ports available") -} - -func TestPortPool_Release(t *testing.T) { - pp := NewPortPool(49000, 49000) // single port - p, _ := pp.Allocate() - pp.Release(p) - p2, err := pp.Allocate() - require.NoError(t, err) - assert.Equal(t, p, p2) -} - -func TestPortPool_ConcurrentAccess(t 
*testing.T) { - pp := NewPortPool(49000, 49099) // 100 ports - var wg sync.WaitGroup - allocated := make(chan int, 100) - for i := 0; i < 100; i++ { - wg.Add(1) - go func() { - defer wg.Done() - p, err := pp.Allocate() - if err == nil { - allocated <- p - } - }() - } - wg.Wait() - close(allocated) - ports := make(map[int]bool) - for p := range allocated { - assert.False(t, ports[p], "duplicate port allocated: %d", p) - ports[p] = true - } -} - -func TestPortPool_BindVerification(t *testing.T) { - // Occupy a port, then verify Allocate skips it - ln, err := net.Listen("tcp", "127.0.0.1:49050") - require.NoError(t, err) - defer ln.Close() - - pp := NewPortPool(49050, 49051) - p, err := pp.Allocate() - require.NoError(t, err) - assert.Equal(t, 49051, p) // should skip 49050 since it's occupied -} -``` - -- [ ] **Step 2: Run tests — verify they fail** - -```bash -cd poller && go test ./internal/tunnel/ -run TestPortPool -v -``` - -- [ ] **Step 3: Implement port pool** - -```go -// poller/internal/tunnel/portpool.go -package tunnel - -import ( - "fmt" - "net" - "sync" -) - -// PortPool tracks available ports in a fixed range for WinBox tunnel allocation. -type PortPool struct { - mu sync.Mutex - used []bool - base int - count int -} - -func NewPortPool(min, max int) *PortPool { - count := max - min + 1 - return &PortPool{ - used: make([]bool, count), - base: min, - count: count, - } -} - -// Allocate returns the next free port, verifying it can actually be bound. -// Returns error if all ports are exhausted. -func (pp *PortPool) Allocate() (int, error) { - pp.mu.Lock() - defer pp.mu.Unlock() - - for i := 0; i < pp.count; i++ { - if pp.used[i] { - continue - } - port := pp.base + i - if !canBind(port) { - continue - } - pp.used[i] = true - return port, nil - } - return 0, fmt.Errorf("no ports available in range %d-%d", pp.base, pp.base+pp.count-1) -} - -// Release returns a port to the pool. 
-func (pp *PortPool) Release(port int) { - pp.mu.Lock() - defer pp.mu.Unlock() - idx := port - pp.base - if idx >= 0 && idx < pp.count { - pp.used[idx] = false - } -} - -func canBind(port int) bool { - ln, err := net.Listen("tcp", fmt.Sprintf("127.0.0.1:%d", port)) - if err != nil { - return false - } - ln.Close() - return true -} -``` - -- [ ] **Step 4: Run tests — verify they pass** - -```bash -cd poller && go test ./internal/tunnel/ -run TestPortPool -v -``` - -- [ ] **Step 5: Commit** - -```bash -git add poller/internal/tunnel/ -git commit -m "feat(poller): add port pool for WinBox tunnel allocation" -``` - -### Task 1.3: Tunnel and TCP Proxy - -**Files:** -- Create: `poller/internal/tunnel/tunnel.go` -- Create: `poller/internal/tunnel/tunnel_test.go` - -- [ ] **Step 1: Write failing tests** - -```go -// poller/internal/tunnel/tunnel_test.go -package tunnel - -import ( - "context" - "io" - "net" - "sync/atomic" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// mockRouter simulates a RouterOS device accepting TCP connections -func mockRouter(t *testing.T) (string, func()) { - t.Helper() - ln, err := net.Listen("tcp", "127.0.0.1:0") - require.NoError(t, err) - go func() { - for { - conn, err := ln.Accept() - if err != nil { - return - } - go func(c net.Conn) { - defer c.Close() - io.Copy(c, c) // echo server - }(conn) - } - }() - return ln.Addr().String(), func() { ln.Close() } -} - -func TestTunnel_ProxyBidirectional(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - tun := &Tunnel{ - ID: "test-1", - RemoteAddr: routerAddr, - LastActive: time.Now().UnixNano(), - cancel: cancel, - ctx: ctx, - } - - ln, err := net.Listen("tcp", "127.0.0.1:0") - require.NoError(t, err) - tun.listener = ln - - go tun.accept() - - // Connect as a WinBox client - conn, err := net.Dial("tcp", ln.Addr().String()) - 
require.NoError(t, err) - defer conn.Close() - - // Write and read back (echo) - msg := []byte("hello winbox") - _, err = conn.Write(msg) - require.NoError(t, err) - - buf := make([]byte, len(msg)) - _, err = io.ReadFull(conn, buf) - require.NoError(t, err) - assert.Equal(t, msg, buf) -} - -func TestTunnel_ActivityTracking(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - before := time.Now().UnixNano() - tun := &Tunnel{ - ID: "test-2", - RemoteAddr: routerAddr, - LastActive: before, - cancel: cancel, - ctx: ctx, - } - - ln, err := net.Listen("tcp", "127.0.0.1:0") - require.NoError(t, err) - tun.listener = ln - go tun.accept() - - conn, err := net.Dial("tcp", ln.Addr().String()) - require.NoError(t, err) - conn.Write([]byte("data")) - buf := make([]byte, 4) - io.ReadFull(conn, buf) - conn.Close() - - time.Sleep(50 * time.Millisecond) - after := atomic.LoadInt64(&tun.LastActive) - assert.Greater(t, after, before) -} - -func TestTunnel_Close(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - ctx, cancel := context.WithCancel(context.Background()) - - tun := &Tunnel{ - ID: "test-3", - RemoteAddr: routerAddr, - LastActive: time.Now().UnixNano(), - cancel: cancel, - ctx: ctx, - } - - ln, err := net.Listen("tcp", "127.0.0.1:0") - require.NoError(t, err) - tun.listener = ln - go tun.accept() - - // Open a connection - conn, err := net.Dial("tcp", ln.Addr().String()) - require.NoError(t, err) - - // Close tunnel — should terminate everything - tun.Close() - - // Connection should be dead - conn.SetReadDeadline(time.Now().Add(500 * time.Millisecond)) - _, err = conn.Read(make([]byte, 1)) - assert.Error(t, err) -} - -func TestTunnel_DialFailure(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - tun := &Tunnel{ - ID: "test-4", - RemoteAddr: "127.0.0.1:1", // nothing listening - LastActive: 
time.Now().UnixNano(), - cancel: cancel, - ctx: ctx, - } - - ln, err := net.Listen("tcp", "127.0.0.1:0") - require.NoError(t, err) - tun.listener = ln - go tun.accept() - - conn, err := net.Dial("tcp", ln.Addr().String()) - require.NoError(t, err) - - // Should be closed quickly since dial to router fails - conn.SetReadDeadline(time.Now().Add(2 * time.Second)) - _, err = conn.Read(make([]byte, 1)) - assert.Error(t, err) -} -``` - -- [ ] **Step 2: Run tests — verify they fail** - -```bash -cd poller && go test ./internal/tunnel/ -run TestTunnel -v -``` - -- [ ] **Step 3: Implement tunnel** - -```go -// poller/internal/tunnel/tunnel.go -package tunnel - -import ( - "context" - "io" - "log/slog" - "net" - "sync" - "sync/atomic" - "time" -) - -// Tunnel represents an active WinBox TCP tunnel to a single router. -type Tunnel struct { - ID string - DeviceID string - TenantID string - UserID string - LocalPort int - RemoteAddr string // router IP:port - CreatedAt time.Time - LastActive int64 // atomic, unix nanoseconds - - listener net.Listener - ctx context.Context - cancel context.CancelFunc - conns sync.WaitGroup - activeConns int64 // atomic -} - -// Close shuts down the tunnel in the correct order. -func (t *Tunnel) Close() { - t.listener.Close() - t.cancel() - t.conns.Wait() - slog.Info("tunnel closed", "tunnel_id", t.ID, "device_id", t.DeviceID, "port", t.LocalPort) -} - -// IdleDuration returns how long the tunnel has been idle. -func (t *Tunnel) IdleDuration() time.Duration { - return time.Since(time.Unix(0, atomic.LoadInt64(&t.LastActive))) -} - -// ActiveConns returns the number of active TCP connections. 
-func (t *Tunnel) ActiveConns() int64 { - return atomic.LoadInt64(&t.activeConns) -} - -func (t *Tunnel) accept() { - for { - conn, err := t.listener.Accept() - if err != nil { - return // listener closed - } - t.conns.Add(1) - atomic.AddInt64(&t.activeConns, 1) - go t.handleConn(conn) - } -} - -func (t *Tunnel) handleConn(clientConn net.Conn) { - defer t.conns.Done() - defer atomic.AddInt64(&t.activeConns, -1) - - slog.Info("tunnel client connected", "tunnel_id", t.ID, "device_id", t.DeviceID) - - routerConn, err := net.DialTimeout("tcp", t.RemoteAddr, 10*time.Second) - if err != nil { - slog.Warn("tunnel dial failed", "tunnel_id", t.ID, "remote", t.RemoteAddr, "err", err) - clientConn.Close() - return - } - - ctx, cancel := context.WithCancel(t.ctx) - defer cancel() - - go func() { - io.Copy(routerConn, newActivityReader(clientConn, &t.LastActive)) - cancel() - }() - go func() { - io.Copy(clientConn, newActivityReader(routerConn, &t.LastActive)) - cancel() - }() - - <-ctx.Done() - clientConn.Close() - routerConn.Close() - - slog.Info("tunnel client disconnected", "tunnel_id", t.ID, "device_id", t.DeviceID) -} - -// activityReader wraps an io.Reader and updates a shared timestamp on every Read. 
-type activityReader struct { - r io.Reader - lastActive *int64 -} - -func newActivityReader(r io.Reader, lastActive *int64) *activityReader { - return &activityReader{r: r, lastActive: lastActive} -} - -func (a *activityReader) Read(p []byte) (int, error) { - n, err := a.r.Read(p) - if n > 0 { - atomic.StoreInt64(a.lastActive, time.Now().UnixNano()) - } - return n, err -} -``` - -- [ ] **Step 4: Run tests — verify they pass** - -```bash -cd poller && go test ./internal/tunnel/ -run TestTunnel -v -timeout 30s -``` - -- [ ] **Step 5: Commit** - -```bash -git add poller/internal/tunnel/ -git commit -m "feat(poller): add TCP tunnel with bidirectional proxy and activity tracking" -``` - -### Task 1.4: Tunnel Manager with NATS Integration - -**Files:** -- Create: `poller/internal/tunnel/manager.go` -- Create: `poller/internal/tunnel/manager_test.go` - -- [ ] **Step 1: Write failing tests** - -```go -// poller/internal/tunnel/manager_test.go -package tunnel - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestManager_OpenTunnel(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - mgr := NewManager(49000, 49010, 5*time.Minute, nil, nil) - defer mgr.Shutdown() - - resp, err := mgr.OpenTunnel("dev-1", "ten-1", "usr-1", routerAddr) - require.NoError(t, err) - assert.NotEmpty(t, resp.TunnelID) - assert.GreaterOrEqual(t, resp.LocalPort, 49000) - assert.LessOrEqual(t, resp.LocalPort, 49010) -} - -func TestManager_CloseTunnel(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - mgr := NewManager(49000, 49010, 5*time.Minute, nil, nil) - defer mgr.Shutdown() - - resp, _ := mgr.OpenTunnel("dev-1", "ten-1", "usr-1", routerAddr) - err := mgr.CloseTunnel(resp.TunnelID) - assert.NoError(t, err) - - // Port should be released - resp2, err := mgr.OpenTunnel("dev-2", "ten-1", "usr-1", routerAddr) - require.NoError(t, err) - assert.Equal(t, resp.LocalPort, 
resp2.LocalPort) // reused -} - -func TestManager_PortExhaustion(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - mgr := NewManager(49000, 49001, 5*time.Minute, nil, nil) // 2 ports - defer mgr.Shutdown() - - _, err := mgr.OpenTunnel("dev-1", "ten-1", "usr-1", routerAddr) - require.NoError(t, err) - _, err = mgr.OpenTunnel("dev-2", "ten-1", "usr-1", routerAddr) - require.NoError(t, err) - _, err = mgr.OpenTunnel("dev-3", "ten-1", "usr-1", routerAddr) - assert.Error(t, err) -} - -func TestManager_IdleCleanup(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - mgr := NewManager(49000, 49010, 100*time.Millisecond, nil, nil) // very short idle - defer mgr.Shutdown() - - resp, _ := mgr.OpenTunnel("dev-1", "ten-1", "usr-1", routerAddr) - time.Sleep(500 * time.Millisecond) - mgr.cleanupIdle() // manually trigger - - _, err := mgr.GetTunnel(resp.TunnelID) - assert.Error(t, err) // should be gone -} - -func TestManager_StatusList(t *testing.T) { - routerAddr, cleanup := mockRouter(t) - defer cleanup() - - mgr := NewManager(49000, 49010, 5*time.Minute, nil, nil) - defer mgr.Shutdown() - - mgr.OpenTunnel("dev-1", "ten-1", "usr-1", routerAddr) - mgr.OpenTunnel("dev-1", "ten-1", "usr-2", routerAddr) - mgr.OpenTunnel("dev-2", "ten-1", "usr-1", routerAddr) - - list := mgr.ListTunnels("dev-1") - assert.Len(t, list, 2) -} -``` - -- [ ] **Step 2: Run tests — verify they fail** - -```bash -cd poller && go test ./internal/tunnel/ -run TestManager -v -``` - -- [ ] **Step 3: Implement manager** - -```go -// poller/internal/tunnel/manager.go -package tunnel - -import ( - "context" - "fmt" - "log/slog" - "net" - "sync" - "time" - - "github.com/google/uuid" - "github.com/mikrotik-portal/poller/internal/store" - "github.com/mikrotik-portal/poller/internal/vault" -) - -type OpenTunnelResponse struct { - TunnelID string `json:"tunnel_id"` - LocalPort int `json:"local_port"` -} - -type TunnelStatus struct { - TunnelID string 
`json:"tunnel_id"` - DeviceID string `json:"device_id"` - LocalPort int `json:"local_port"` - ActiveConns int64 `json:"active_conns"` - IdleSeconds int `json:"idle_seconds"` - CreatedAt string `json:"created_at"` -} - -type Manager struct { - mu sync.Mutex - tunnels map[string]*Tunnel - portPool *PortPool - idleTime time.Duration - deviceStore *store.DeviceStore - credCache *vault.CredentialCache - cancel context.CancelFunc -} - -func NewManager(portMin, portMax int, idleTime time.Duration, ds *store.DeviceStore, cc *vault.CredentialCache) *Manager { - ctx, cancel := context.WithCancel(context.Background()) - m := &Manager{ - tunnels: make(map[string]*Tunnel), - portPool: NewPortPool(portMin, portMax), - idleTime: idleTime, - deviceStore: ds, - credCache: cc, - cancel: cancel, - } - go m.idleLoop(ctx) - return m -} - -func (m *Manager) OpenTunnel(deviceID, tenantID, userID, remoteAddr string) (*OpenTunnelResponse, error) { - port, err := m.portPool.Allocate() - if err != nil { - return nil, err - } - - ln, err := net.Listen("tcp", fmt.Sprintf("127.0.0.1:%d", port)) - if err != nil { - m.portPool.Release(port) - return nil, fmt.Errorf("failed to listen on port %d: %w", port, err) - } - - ctx, cancel := context.WithCancel(context.Background()) - tun := &Tunnel{ - ID: uuid.New().String(), - DeviceID: deviceID, - TenantID: tenantID, - UserID: userID, - LocalPort: port, - RemoteAddr: remoteAddr, - CreatedAt: time.Now(), - LastActive: time.Now().UnixNano(), - listener: ln, - ctx: ctx, - cancel: cancel, - } - - m.mu.Lock() - m.tunnels[tun.ID] = tun - m.mu.Unlock() - - go tun.accept() - - slog.Info("tunnel opened", - "tunnel_id", tun.ID, - "device_id", deviceID, - "tenant_id", tenantID, - "port", port, - "remote", remoteAddr, - ) - - return &OpenTunnelResponse{TunnelID: tun.ID, LocalPort: port}, nil -} - -func (m *Manager) CloseTunnel(tunnelID string) error { - m.mu.Lock() - tun, ok := m.tunnels[tunnelID] - if !ok { - m.mu.Unlock() - return fmt.Errorf("tunnel not found: 
%s", tunnelID) - } - delete(m.tunnels, tunnelID) - m.mu.Unlock() - - tun.Close() - m.portPool.Release(tun.LocalPort) - return nil -} - -func (m *Manager) GetTunnel(tunnelID string) (*TunnelStatus, error) { - m.mu.Lock() - tun, ok := m.tunnels[tunnelID] - m.mu.Unlock() - if !ok { - return nil, fmt.Errorf("tunnel not found: %s", tunnelID) - } - return tunnelStatusFrom(tun), nil -} - -func (m *Manager) ListTunnels(deviceID string) []TunnelStatus { - m.mu.Lock() - defer m.mu.Unlock() - var out []TunnelStatus - for _, tun := range m.tunnels { - if tun.DeviceID == deviceID { - out = append(out, *tunnelStatusFrom(tun)) - } - } - return out -} - -func (m *Manager) Shutdown() { - m.cancel() - m.mu.Lock() - ids := make([]string, 0, len(m.tunnels)) - for id := range m.tunnels { - ids = append(ids, id) - } - m.mu.Unlock() - for _, id := range ids { - m.CloseTunnel(id) - } -} - -func (m *Manager) idleLoop(ctx context.Context) { - ticker := time.NewTicker(30 * time.Second) - defer ticker.Stop() - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - m.cleanupIdle() - } - } -} - -func (m *Manager) cleanupIdle() { - m.mu.Lock() - var toClose []string - for id, tun := range m.tunnels { - if tun.IdleDuration() > m.idleTime && tun.ActiveConns() == 0 { - toClose = append(toClose, id) - } - } - m.mu.Unlock() - - for _, id := range toClose { - slog.Info("tunnel idle timeout", "tunnel_id", id) - m.CloseTunnel(id) - } -} - -func tunnelStatusFrom(tun *Tunnel) *TunnelStatus { - return &TunnelStatus{ - TunnelID: tun.ID, - DeviceID: tun.DeviceID, - LocalPort: tun.LocalPort, - ActiveConns: tun.ActiveConns(), - IdleSeconds: int(tun.IdleDuration().Seconds()), - CreatedAt: tun.CreatedAt.Format(time.RFC3339), - } -} -``` - -- [ ] **Step 4: Run all tunnel tests** - -```bash -cd poller && go test ./internal/tunnel/ -v -timeout 30s -``` - -- [ ] **Step 5: Commit** - -```bash -git add poller/internal/tunnel/ -git commit -m "feat(poller): add tunnel manager with idle cleanup and status 
tracking" -``` - -### Task 1.5: NATS Tunnel Responder - -**Files:** -- Create: `poller/internal/bus/tunnel_responder.go` - -This wires the tunnel manager to NATS subjects `tunnel.open`, `tunnel.close`, `tunnel.status`, `tunnel.status.list`. Follow the existing pattern in `cmd_responder.go`. - -- [ ] **Step 1: Implement NATS responder** - -```go -// poller/internal/bus/tunnel_responder.go -package bus - -import ( - "encoding/json" - "log/slog" - - "github.com/mikrotik-portal/poller/internal/store" - "github.com/mikrotik-portal/poller/internal/tunnel" - "github.com/mikrotik-portal/poller/internal/vault" - "github.com/nats-io/nats.go" -) - -type TunnelOpenRequest struct { - DeviceID string `json:"device_id"` - TenantID string `json:"tenant_id"` - UserID string `json:"user_id"` - TargetPort int `json:"target_port"` -} - -type TunnelCloseRequest struct { - TunnelID string `json:"tunnel_id"` -} - -type TunnelStatusRequest struct { - TunnelID string `json:"tunnel_id,omitempty"` - DeviceID string `json:"device_id,omitempty"` -} - -type TunnelResponder struct { - nc *nats.Conn - manager *tunnel.Manager - deviceStore *store.DeviceStore - credCache *vault.CredentialCache -} - -func NewTunnelResponder(nc *nats.Conn, mgr *tunnel.Manager, ds *store.DeviceStore, cc *vault.CredentialCache) *TunnelResponder { - return &TunnelResponder{nc: nc, manager: mgr, deviceStore: ds, credCache: cc} -} - -func (tr *TunnelResponder) Subscribe() error { - if _, err := tr.nc.Subscribe("tunnel.open", tr.handleOpen); err != nil { - return err - } - if _, err := tr.nc.Subscribe("tunnel.close", tr.handleClose); err != nil { - return err - } - if _, err := tr.nc.Subscribe("tunnel.status", tr.handleStatus); err != nil { - return err - } - if _, err := tr.nc.Subscribe("tunnel.status.list", tr.handleStatusList); err != nil { - return err - } - slog.Info("tunnel NATS responder subscribed") - return nil -} - -func (tr *TunnelResponder) handleOpen(msg *nats.Msg) { - var req TunnelOpenRequest - if err := 
json.Unmarshal(msg.Data, &req); err != nil { - replyError(msg, "invalid request") - return - } - - // Look up device to get IP and decrypt credentials - dev, err := tr.deviceStore.GetDevice(req.DeviceID) - if err != nil { - slog.Error("tunnel: device lookup failed", "device_id", req.DeviceID, "err", err) - replyError(msg, "device not found") - return - } - - targetPort := req.TargetPort - if targetPort == 0 { - targetPort = 8291 - } - remoteAddr := dev.IPAddress + ":" + itoa(targetPort) - - resp, err := tr.manager.OpenTunnel(req.DeviceID, req.TenantID, req.UserID, remoteAddr) - if err != nil { - slog.Error("tunnel: open failed", "device_id", req.DeviceID, "err", err) - replyError(msg, err.Error()) - return - } - - data, _ := json.Marshal(resp) - msg.Respond(data) -} - -func (tr *TunnelResponder) handleClose(msg *nats.Msg) { - var req TunnelCloseRequest - if err := json.Unmarshal(msg.Data, &req); err != nil { - replyError(msg, "invalid request") - return - } - - err := tr.manager.CloseTunnel(req.TunnelID) - if err != nil { - replyError(msg, err.Error()) - return - } - msg.Respond([]byte(`{"ok":true}`)) -} - -func (tr *TunnelResponder) handleStatus(msg *nats.Msg) { - var req TunnelStatusRequest - if err := json.Unmarshal(msg.Data, &req); err != nil { - replyError(msg, "invalid request") - return - } - - status, err := tr.manager.GetTunnel(req.TunnelID) - if err != nil { - replyError(msg, err.Error()) - return - } - data, _ := json.Marshal(status) - msg.Respond(data) -} - -func (tr *TunnelResponder) handleStatusList(msg *nats.Msg) { - var req TunnelStatusRequest - if err := json.Unmarshal(msg.Data, &req); err != nil { - replyError(msg, "invalid request") - return - } - - list := tr.manager.ListTunnels(req.DeviceID) - data, _ := json.Marshal(list) - msg.Respond(data) -} - -func replyError(msg *nats.Msg, errMsg string) { - resp, _ := json.Marshal(map[string]string{"error": errMsg}) - msg.Respond(resp) -} - -func itoa(i int) string { - return fmt.Sprintf("%d", i) -} -``` 
- -Note: Add `import "fmt"` to the imports. - -- [ ] **Step 2: Verify compilation** - -```bash -cd poller && go build ./internal/bus/ -``` - -- [ ] **Step 3: Commit** - -```bash -git add poller/internal/bus/tunnel_responder.go -git commit -m "feat(poller): add NATS tunnel responder for WinBox tunnel management" -``` - ---- - -## Chunk 2: Poller — SSH Relay - -### Task 2.1: SSH Relay Server Core - -**Files:** -- Create: `poller/internal/sshrelay/server.go` -- Create: `poller/internal/sshrelay/session.go` -- Create: `poller/internal/sshrelay/bridge.go` -- Create: `poller/internal/sshrelay/server_test.go` - -This is a large task. The SSH relay server handles: WebSocket upgrade, Redis token validation, SSH dial + PTY, bidirectional bridge, idle timeout, session limits. - -- [ ] **Step 1: Write session and bridge types** - -```go -// poller/internal/sshrelay/session.go -package sshrelay - -import ( - "context" - "sync/atomic" - "time" - - "golang.org/x/crypto/ssh" -) - -type Session struct { - ID string - DeviceID string - TenantID string - UserID string - SourceIP string - StartTime time.Time - LastActive int64 // atomic, unix nanoseconds - sshClient *ssh.Client - sshSession *ssh.Session - ptyCols int - ptyRows int - cancel context.CancelFunc -} - -func (s *Session) IdleDuration() time.Duration { - return time.Since(time.Unix(0, atomic.LoadInt64(&s.LastActive))) -} -``` - -```go -// poller/internal/sshrelay/bridge.go -package sshrelay - -import ( - "context" - "encoding/json" - "io" - "log/slog" - "sync/atomic" - "time" - - "golang.org/x/crypto/ssh" - "nhooyr.io/websocket" -) - -type ControlMsg struct { - Type string `json:"type"` - Cols int `json:"cols"` - Rows int `json:"rows"` -} - -func bridge(ctx context.Context, cancel context.CancelFunc, ws *websocket.Conn, - sshSess *ssh.Session, stdin io.WriteCloser, stdout, stderr io.Reader, lastActive *int64) { - - // WebSocket → SSH stdin - go func() { - defer cancel() - for { - typ, data, err := ws.Read(ctx) - if err != 
nil { - return - } - atomic.StoreInt64(lastActive, time.Now().UnixNano()) - - if typ == websocket.MessageText { - var ctrl ControlMsg - if json.Unmarshal(data, &ctrl) != nil { - continue - } - if ctrl.Type == "resize" && ctrl.Cols > 0 && ctrl.Cols <= 500 && ctrl.Rows > 0 && ctrl.Rows <= 200 { - sshSess.WindowChange(ctrl.Rows, ctrl.Cols) - } - continue - } - stdin.Write(data) - } - }() - - // SSH stdout → WebSocket - go func() { - defer cancel() - buf := make([]byte, 4096) - for { - n, err := stdout.Read(buf) - if err != nil { - return - } - atomic.StoreInt64(lastActive, time.Now().UnixNano()) - ws.Write(ctx, websocket.MessageBinary, buf[:n]) - } - }() - - // SSH stderr → WebSocket - go func() { - defer cancel() - buf := make([]byte, 4096) - for { - n, err := stderr.Read(buf) - if err != nil { - return - } - ws.Write(ctx, websocket.MessageBinary, buf[:n]) - } - }() - - <-ctx.Done() -} -``` - -- [ ] **Step 2: Write server** - -```go -// poller/internal/sshrelay/server.go -package sshrelay - -import ( - "context" - "encoding/json" - "fmt" - "log/slog" - "net/http" - "strings" - "sync" - "time" - - "github.com/google/uuid" - "github.com/mikrotik-portal/poller/internal/store" - "github.com/mikrotik-portal/poller/internal/vault" - "github.com/redis/go-redis/v9" - "golang.org/x/crypto/ssh" - "nhooyr.io/websocket" -) - -type TokenPayload struct { - DeviceID string `json:"device_id"` - TenantID string `json:"tenant_id"` - UserID string `json:"user_id"` - SourceIP string `json:"source_ip"` - Cols int `json:"cols"` - Rows int `json:"rows"` - CreatedAt int64 `json:"created_at"` -} - -type Server struct { - redis *redis.Client - credCache *vault.CredentialCache - deviceStore *store.DeviceStore - sessions map[string]*Session - mu sync.Mutex - idleTime time.Duration - maxSessions int - maxPerUser int - maxPerDevice int - cancel context.CancelFunc -} - -type Config struct { - IdleTimeout time.Duration - MaxSessions int - MaxPerUser int - MaxPerDevice int -} - -func NewServer(rc 
*redis.Client, cc *vault.CredentialCache, ds *store.DeviceStore, cfg Config) *Server { - ctx, cancel := context.WithCancel(context.Background()) - s := &Server{ - redis: rc, - credCache: cc, - deviceStore: ds, - sessions: make(map[string]*Session), - idleTime: cfg.IdleTimeout, - maxSessions: cfg.MaxSessions, - maxPerUser: cfg.MaxPerUser, - maxPerDevice: cfg.MaxPerDevice, - cancel: cancel, - } - go s.idleLoop(ctx) - return s -} - -func (s *Server) Handler() http.Handler { - mux := http.NewServeMux() - mux.HandleFunc("/ws/ssh", s.handleSSH) - mux.HandleFunc("/healthz", s.handleHealth) - return mux -} - -func (s *Server) Shutdown() { - s.cancel() - s.mu.Lock() - for _, sess := range s.sessions { - sess.cancel() - } - s.mu.Unlock() -} - -func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - w.Write([]byte(`{"status":"ok"}`)) -} - -func (s *Server) handleSSH(w http.ResponseWriter, r *http.Request) { - token := r.URL.Query().Get("token") - if token == "" { - http.Error(w, "missing token", http.StatusUnauthorized) - return - } - - // Validate single-use token via Redis GETDEL - payload, err := s.validateToken(r.Context(), token) - if err != nil { - slog.Warn("ssh: token validation failed", "err", err) - http.Error(w, "unauthorized", http.StatusUnauthorized) - return - } - - // Check session limits - if err := s.checkLimits(payload.UserID, payload.DeviceID); err != nil { - http.Error(w, err.Error(), http.StatusTooManyRequests) - return - } - - // Upgrade to WebSocket - ws, err := websocket.Accept(w, r, &websocket.AcceptOptions{ - OriginPatterns: []string{"*"}, // nginx handles origin - }) - if err != nil { - slog.Error("ssh: websocket upgrade failed", "err", err) - return - } - ws.SetReadLimit(1 << 20) - - // Extract source IP - sourceIP := r.Header.Get("X-Real-IP") - if sourceIP == "" { - sourceIP = r.RemoteAddr - } - - // Look up device - dev, err := s.deviceStore.GetDevice(payload.DeviceID) - if 
err != nil { - slog.Error("ssh: device lookup failed", "device_id", payload.DeviceID, "err", err) - ws.Close(websocket.StatusInternalError, "device not found") - return - } - - // Decrypt credentials - creds, err := s.credCache.GetCredentials(dev.ID, payload.TenantID, dev.EncryptedCredentialsTransit, dev.EncryptedCredentials) - if err != nil { - slog.Error("ssh: credential decryption failed", "device_id", payload.DeviceID, "err", err) - ws.Close(websocket.StatusInternalError, "credential error") - return - } - - // SSH dial - sshPort := "22" - sshAddr := dev.IPAddress + ":" + sshPort - sshClient, err := ssh.Dial("tcp", sshAddr, &ssh.ClientConfig{ - User: creds.Username, - Auth: []ssh.AuthMethod{ssh.Password(creds.Password)}, - HostKeyCallback: ssh.InsecureIgnoreHostKey(), - Timeout: 10 * time.Second, - }) - if err != nil { - slog.Error("ssh: dial failed", "device_id", payload.DeviceID, "addr", sshAddr, "err", err) - ws.Close(websocket.StatusInternalError, "ssh connection failed") - return - } - - sshSess, err := sshClient.NewSession() - if err != nil { - sshClient.Close() - ws.Close(websocket.StatusInternalError, "ssh session failed") - return - } - - cols, rows := payload.Cols, payload.Rows - if cols <= 0 { - cols = 80 - } - if rows <= 0 { - rows = 24 - } - - if err := sshSess.RequestPty("xterm-256color", rows, cols, ssh.TerminalModes{ - ssh.ECHO: 1, - }); err != nil { - sshSess.Close() - sshClient.Close() - ws.Close(websocket.StatusInternalError, "pty request failed") - return - } - - stdin, _ := sshSess.StdinPipe() - stdout, _ := sshSess.StdoutPipe() - stderr, _ := sshSess.StderrPipe() - - if err := sshSess.Shell(); err != nil { - sshSess.Close() - sshClient.Close() - ws.Close(websocket.StatusInternalError, "shell start failed") - return - } - - ctx, cancel := context.WithCancel(context.Background()) - - sess := &Session{ - ID: uuid.New().String(), - DeviceID: payload.DeviceID, - TenantID: payload.TenantID, - UserID: payload.UserID, - SourceIP: sourceIP, - 
StartTime: time.Now(), - LastActive: time.Now().UnixNano(), - sshClient: sshClient, - sshSession: sshSess, - ptyCols: cols, - ptyRows: rows, - cancel: cancel, - } - - s.mu.Lock() - s.sessions[sess.ID] = sess - s.mu.Unlock() - - slog.Info("ssh session started", - "session_id", sess.ID, - "device_id", payload.DeviceID, - "tenant_id", payload.TenantID, - "user_id", payload.UserID, - "source_ip", sourceIP, - ) - - // Bridge WebSocket ↔ SSH - bridge(ctx, cancel, ws, sshSess, stdin, stdout, stderr, &sess.LastActive) - - // Cleanup - ws.Close(websocket.StatusNormalClosure, "session ended") - sshSess.Close() - sshClient.Close() - - s.mu.Lock() - delete(s.sessions, sess.ID) - s.mu.Unlock() - - endTime := time.Now() - duration := endTime.Sub(sess.StartTime) - slog.Info("ssh session ended", - "session_id", sess.ID, - "device_id", payload.DeviceID, - "duration", duration.String(), - ) - - // Publish audit event for session end via NATS (TODO: wire NATS publisher) -} - -func (s *Server) validateToken(ctx context.Context, token string) (*TokenPayload, error) { - key := "ssh:token:" + token - val, err := s.redis.GetDel(ctx, key).Result() - if err != nil { - return nil, fmt.Errorf("token not found or expired") - } - var payload TokenPayload - if err := json.Unmarshal([]byte(val), &payload); err != nil { - return nil, fmt.Errorf("invalid token payload") - } - return &payload, nil -} - -func (s *Server) checkLimits(userID, deviceID string) error { - s.mu.Lock() - defer s.mu.Unlock() - - if len(s.sessions) >= s.maxSessions { - return fmt.Errorf("max sessions exceeded") - } - - userCount := 0 - deviceCount := 0 - for _, sess := range s.sessions { - if sess.UserID == userID { - userCount++ - } - if sess.DeviceID == deviceID { - deviceCount++ - } - } - if userCount >= s.maxPerUser { - return fmt.Errorf("max sessions per user exceeded") - } - if deviceCount >= s.maxPerDevice { - return fmt.Errorf("max sessions per device exceeded") - } - return nil -} - -func (s *Server) idleLoop(ctx 
context.Context) { - ticker := time.NewTicker(30 * time.Second) - defer ticker.Stop() - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - s.cleanupIdle() - } - } -} - -func (s *Server) cleanupIdle() { - s.mu.Lock() - var toCancel []*Session - for _, sess := range s.sessions { - if sess.IdleDuration() > s.idleTime { - toCancel = append(toCancel, sess) - } - } - s.mu.Unlock() - - for _, sess := range toCancel { - slog.Info("ssh session idle timeout", "session_id", sess.ID) - sess.cancel() - } -} - -// SessionList returns active SSH sessions for a device. -func (s *Server) SessionList(deviceID string) []map[string]interface{} { - s.mu.Lock() - defer s.mu.Unlock() - var out []map[string]interface{} - for _, sess := range s.sessions { - if sess.DeviceID == deviceID { - out = append(out, map[string]interface{}{ - "session_id": sess.ID, - "idle_seconds": int(sess.IdleDuration().Seconds()), - "created_at": sess.StartTime.Format(time.RFC3339), - }) - } - } - return out -} -``` - -- [ ] **Step 3: Write tests** - -```go -// poller/internal/sshrelay/server_test.go -package sshrelay - -import ( - "context" - "encoding/json" - "testing" - "time" - - "github.com/alicebob/miniredis/v2" - "github.com/redis/go-redis/v9" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func setupRedis(t *testing.T) (*redis.Client, *miniredis.Miniredis) { - t.Helper() - mr := miniredis.RunT(t) - rc := redis.NewClient(&redis.Options{Addr: mr.Addr()}) - return rc, mr -} - -func TestValidateToken_Valid(t *testing.T) { - rc, _ := setupRedis(t) - s := &Server{redis: rc, sessions: make(map[string]*Session)} - - payload := TokenPayload{DeviceID: "d1", TenantID: "t1", UserID: "u1", Cols: 80, Rows: 24, CreatedAt: time.Now().Unix()} - data, _ := json.Marshal(payload) - rc.Set(context.Background(), "ssh:token:abc123", string(data), 120*time.Second) - - result, err := s.validateToken(context.Background(), "abc123") - require.NoError(t, err) - assert.Equal(t, 
"d1", result.DeviceID) - - // Token consumed — second use should fail - _, err = s.validateToken(context.Background(), "abc123") - assert.Error(t, err) -} - -func TestValidateToken_Expired(t *testing.T) { - rc, mr := setupRedis(t) - s := &Server{redis: rc, sessions: make(map[string]*Session)} - - payload := TokenPayload{DeviceID: "d1", TenantID: "t1", UserID: "u1"} - data, _ := json.Marshal(payload) - rc.Set(context.Background(), "ssh:token:expired", string(data), 1*time.Millisecond) - mr.FastForward(2 * time.Second) - - _, err := s.validateToken(context.Background(), "expired") - assert.Error(t, err) -} - -func TestCheckLimits_MaxSessions(t *testing.T) { - s := &Server{ - sessions: make(map[string]*Session), - maxSessions: 2, - maxPerUser: 10, - maxPerDevice: 10, - } - s.sessions["s1"] = &Session{UserID: "u1", DeviceID: "d1"} - s.sessions["s2"] = &Session{UserID: "u2", DeviceID: "d2"} - - err := s.checkLimits("u3", "d3") - assert.Error(t, err) - assert.Contains(t, err.Error(), "max sessions exceeded") -} - -func TestCheckLimits_MaxPerUser(t *testing.T) { - s := &Server{ - sessions: make(map[string]*Session), - maxSessions: 100, - maxPerUser: 2, - maxPerDevice: 100, - } - s.sessions["s1"] = &Session{UserID: "u1", DeviceID: "d1"} - s.sessions["s2"] = &Session{UserID: "u1", DeviceID: "d2"} - - err := s.checkLimits("u1", "d3") - assert.Error(t, err) - assert.Contains(t, err.Error(), "per user") -} - -func TestCheckLimits_MaxPerDevice(t *testing.T) { - s := &Server{ - sessions: make(map[string]*Session), - maxSessions: 100, - maxPerUser: 100, - maxPerDevice: 1, - } - s.sessions["s1"] = &Session{UserID: "u1", DeviceID: "d1"} - - err := s.checkLimits("u2", "d1") - assert.Error(t, err) - assert.Contains(t, err.Error(), "per device") -} - -func TestSessionList(t *testing.T) { - s := &Server{sessions: make(map[string]*Session)} - s.sessions["s1"] = &Session{ID: "s1", DeviceID: "d1", StartTime: time.Now(), LastActive: time.Now().UnixNano()} - s.sessions["s2"] = &Session{ID: 
"s2", DeviceID: "d1", StartTime: time.Now(), LastActive: time.Now().UnixNano()} - s.sessions["s3"] = &Session{ID: "s3", DeviceID: "d2", StartTime: time.Now(), LastActive: time.Now().UnixNano()} - - list := s.SessionList("d1") - assert.Len(t, list, 2) -} -``` - -- [ ] **Step 4: Add miniredis test dependency** - -```bash -cd poller && go get github.com/alicebob/miniredis/v2@latest && go mod tidy -``` - -- [ ] **Step 5: Run tests** - -```bash -cd poller && go test ./internal/sshrelay/ -v -timeout 30s -``` - -- [ ] **Step 6: Commit** - -```bash -git add poller/internal/sshrelay/ poller/go.mod poller/go.sum -git commit -m "feat(poller): add SSH relay server with WebSocket-to-PTY bridge" -``` - -### Task 2.2: Wire HTTP Server and Tunnel Manager into Poller Main - -**Files:** -- Modify: `poller/cmd/poller/main.go` -- Modify: `poller/internal/poller/scheduler.go` (add tunnel manager to scheduler dependencies if needed) - -- [ ] **Step 1: Read existing main.go to understand startup pattern** - -Read `poller/cmd/poller/main.go` to understand how services are initialized and how graceful shutdown works. The changes need to: - -1. Create tunnel manager -2. Create SSH relay server -3. Start HTTP server for SSH relay + healthz -4. Subscribe tunnel NATS responder -5. 
Add both to graceful shutdown - -- [ ] **Step 2: Add initialization code** - -Add to the main startup (after existing NATS/Redis/DB initialization): - -```go -// Tunnel manager -tunnelMgr := tunnel.NewManager( - cfg.TunnelPortMin, // env: TUNNEL_PORT_MIN, default 49000 - cfg.TunnelPortMax, // env: TUNNEL_PORT_MAX, default 49100 - time.Duration(cfg.TunnelIdleTimeout) * time.Second, - deviceStore, - credCache, -) - -// NATS tunnel responder -tunnelResp := bus.NewTunnelResponder(nc, tunnelMgr, deviceStore, credCache) -if err := tunnelResp.Subscribe(); err != nil { - slog.Error("failed to subscribe tunnel responder", "err", err) -} - -// SSH relay server -sshServer := sshrelay.NewServer(redisClient, credCache, deviceStore, sshrelay.Config{ - IdleTimeout: time.Duration(cfg.SSHIdleTimeout) * time.Second, - MaxSessions: cfg.SSHMaxSessions, - MaxPerUser: cfg.SSHMaxPerUser, - MaxPerDevice: cfg.SSHMaxPerDevice, -}) - -// HTTP server (SSH relay + healthz) -httpServer := &http.Server{ - Addr: ":" + cfg.SSHRelayPort, - Handler: sshServer.Handler(), -} -go func() { - slog.Info("SSH relay HTTP server starting", "port", cfg.SSHRelayPort) - if err := httpServer.ListenAndServe(); err != http.ErrServerClosed { - slog.Error("HTTP server error", "err", err) - } -}() -``` - -Add to graceful shutdown: - -```go -// In shutdown handler: -shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) -defer shutdownCancel() -httpServer.Shutdown(shutdownCtx) -sshServer.Shutdown() -tunnelMgr.Shutdown() -``` - -- [ ] **Step 3: Add config fields** - -Add to the poller config struct (wherever `cfg` is defined): - -```go -TunnelPortMin int `env:"TUNNEL_PORT_MIN" default:"49000"` -TunnelPortMax int `env:"TUNNEL_PORT_MAX" default:"49100"` -TunnelIdleTimeout int `env:"TUNNEL_IDLE_TIMEOUT" default:"300"` -SSHRelayPort string `env:"SSH_RELAY_PORT" default:"8080"` -SSHIdleTimeout int `env:"SSH_IDLE_TIMEOUT" default:"900"` -SSHMaxSessions int `env:"SSH_MAX_SESSIONS" 
default:"200"` -SSHMaxPerUser int `env:"SSH_MAX_PER_USER" default:"10"` -SSHMaxPerDevice int `env:"SSH_MAX_PER_DEVICE" default:"20"` -``` - -- [ ] **Step 4: Verify compilation** - -```bash -cd poller && go build ./cmd/poller/ -``` - -- [ ] **Step 5: Commit** - -```bash -git add poller/cmd/poller/ poller/internal/ -git commit -m "feat(poller): wire tunnel manager and SSH relay into poller startup" -``` - ---- - -## Chunk 3: Backend API — Remote Access Endpoints - -### Task 3.1: Pydantic Schemas - -**Files:** -- Create: `backend/app/schemas/remote_access.py` - -- [ ] **Step 1: Create schemas** - -```python -# backend/app/schemas/remote_access.py -from pydantic import BaseModel, Field - - -class WinboxSessionResponse(BaseModel): - tunnel_id: str - host: str = "127.0.0.1" - port: int - winbox_uri: str - idle_timeout_seconds: int = 300 - - -class SSHSessionRequest(BaseModel): - cols: int = Field(default=80, gt=0, le=500) - rows: int = Field(default=24, gt=0, le=200) - - -class SSHSessionResponse(BaseModel): - token: str - websocket_url: str - idle_timeout_seconds: int = 900 - - -class TunnelStatusItem(BaseModel): - tunnel_id: str - local_port: int - active_conns: int - idle_seconds: int - created_at: str - - -class SSHSessionStatusItem(BaseModel): - session_id: str - idle_seconds: int - created_at: str - - -class ActiveSessionsResponse(BaseModel): - winbox_tunnels: list[TunnelStatusItem] = [] - ssh_sessions: list[SSHSessionStatusItem] = [] -``` - -- [ ] **Step 2: Commit** - -```bash -git add backend/app/schemas/remote_access.py -git commit -m "feat(api): add remote access pydantic schemas" -``` - -### Task 3.2: Remote Access Router - -**Files:** -- Create: `backend/app/routers/remote_access.py` -- Create: `backend/tests/test_remote_access.py` - -- [ ] **Step 1: Write tests** - -```python -# backend/tests/test_remote_access.py -import pytest -from unittest.mock import AsyncMock, patch, MagicMock -from httpx import AsyncClient - - -@pytest.fixture -def mock_nats(): - 
"""Mock NATS request-reply for tunnel operations.""" - with patch("app.routers.remote_access.nats_request") as mock: - mock.return_value = {"tunnel_id": "test-uuid", "local_port": 49001} - yield mock - - -@pytest.fixture -def mock_redis(): - """Mock Redis for SSH token storage.""" - with patch("app.routers.remote_access.redis_client") as mock: - mock.setex = AsyncMock() - mock.get = AsyncMock(return_value=None) - yield mock - - -class TestWinboxSession: - async def test_viewer_forbidden(self, client: AsyncClient, viewer_token): - resp = await client.post( - "/api/tenants/t1/devices/d1/winbox-session", - headers={"Authorization": f"Bearer {viewer_token}"}, - ) - assert resp.status_code == 403 - - async def test_operator_allowed(self, client: AsyncClient, operator_token, mock_nats): - resp = await client.post( - "/api/tenants/t1/devices/d1/winbox-session", - headers={"Authorization": f"Bearer {operator_token}"}, - ) - assert resp.status_code == 200 - data = resp.json() - assert data["host"] == "127.0.0.1" - assert 49000 <= data["port"] <= 49100 - - async def test_device_not_found(self, client: AsyncClient, operator_token): - resp = await client.post( - "/api/tenants/t1/devices/nonexistent/winbox-session", - headers={"Authorization": f"Bearer {operator_token}"}, - ) - assert resp.status_code == 404 - - -class TestSSHSession: - async def test_viewer_forbidden(self, client: AsyncClient, viewer_token): - resp = await client.post( - "/api/tenants/t1/devices/d1/ssh-session", - headers={"Authorization": f"Bearer {viewer_token}"}, - json={"cols": 80, "rows": 24}, - ) - assert resp.status_code == 403 - - async def test_operator_gets_token(self, client: AsyncClient, operator_token, mock_redis): - resp = await client.post( - "/api/tenants/t1/devices/d1/ssh-session", - headers={"Authorization": f"Bearer {operator_token}"}, - json={"cols": 80, "rows": 24}, - ) - assert resp.status_code == 200 - data = resp.json() - assert "token" in data - assert "websocket_url" in data - - async 
def test_invalid_cols(self, client: AsyncClient, operator_token): - resp = await client.post( - "/api/tenants/t1/devices/d1/ssh-session", - headers={"Authorization": f"Bearer {operator_token}"}, - json={"cols": 9999, "rows": 24}, - ) - assert resp.status_code == 422 -``` - -- [ ] **Step 2: Implement router** - -```python -# backend/app/routers/remote_access.py -""" -Remote access endpoints for WinBox tunnels and SSH terminal sessions. - -All routes are tenant-scoped under: - /api/tenants/{tenant_id}/devices/{device_id}/ - -RBAC: operator and above (viewer gets 403). -""" - -import json -import logging -import secrets -import time - -from fastapi import APIRouter, Depends, HTTPException, Request -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from app.config import settings -from app.database import get_db -from app.middleware.rbac import require_role -from app.middleware.tenant_context import CurrentUser -from app.models.device import Device -from app.schemas.remote_access import ( - ActiveSessionsResponse, - SSHSessionRequest, - SSHSessionResponse, - WinboxSessionResponse, -) -from app.services.audit_service import log_action -from app.services.nats_service import nats_request -from app.services.redis_service import redis_client - -logger = logging.getLogger(__name__) - -router = APIRouter( - prefix="/tenants/{tenant_id}/devices/{device_id}", - tags=["remote-access"], -) - - -def _source_ip(request: Request) -> str: - return request.headers.get("x-real-ip", "") or request.client.host - - -async def _get_device(db: AsyncSession, device_id: str) -> Device: - result = await db.execute(select(Device).where(Device.id == device_id)) - device = result.scalar_one_or_none() - if not device: - raise HTTPException(status_code=404, detail="Device not found") - return device - - -@router.post("/winbox-session", response_model=WinboxSessionResponse) -async def open_winbox( - tenant_id: str, - device_id: str, - request: Request, - current_user: 
CurrentUser = Depends(require_role("operator")), - db: AsyncSession = Depends(get_db), -): - device = await _get_device(db, device_id) - source_ip = _source_ip(request) - - await log_action( - "winbox_tunnel_open", current_user.id, tenant_id, - device_id=device_id, ip_address=source_ip, - ) - - payload = json.dumps({ - "device_id": str(device_id), - "tenant_id": str(tenant_id), - "user_id": str(current_user.id), - "target_port": 8291, - }) - - try: - resp = await nats_request("tunnel.open", payload.encode(), timeout=10) - except Exception as e: - logger.error("NATS tunnel.open failed: %s", e) - raise HTTPException(status_code=503, detail="Tunnel service unavailable") - - data = json.loads(resp.data) - if "error" in data: - raise HTTPException(status_code=503, detail=data["error"]) - - port = data["local_port"] - if not (49000 <= port <= 49100): - raise HTTPException(status_code=503, detail="Invalid port allocation") - - return WinboxSessionResponse( - tunnel_id=data["tunnel_id"], - host="127.0.0.1", - port=port, - winbox_uri=f"winbox://127.0.0.1:{port}", - ) - - -@router.post("/ssh-session", response_model=SSHSessionResponse) -async def open_ssh( - tenant_id: str, - device_id: str, - request: Request, - body: SSHSessionRequest, - current_user: CurrentUser = Depends(require_role("operator")), - db: AsyncSession = Depends(get_db), -): - await _get_device(db, device_id) - source_ip = _source_ip(request) - - await log_action( - "ssh_session_open", current_user.id, tenant_id, - device_id=device_id, ip_address=source_ip, - ) - - token = secrets.token_urlsafe(32) - token_payload = json.dumps({ - "device_id": str(device_id), - "tenant_id": str(tenant_id), - "user_id": str(current_user.id), - "source_ip": source_ip, - "cols": body.cols, - "rows": body.rows, - "created_at": int(time.time()), - }) - - await redis_client.setex(f"ssh:token:{token}", 120, token_payload) - - return SSHSessionResponse( - token=token, - websocket_url=f"/ws/ssh?token={token}", - ) - - 
-@router.delete("/winbox-session/{tunnel_id}") -async def close_winbox( - tenant_id: str, - device_id: str, - tunnel_id: str, - request: Request, - current_user: CurrentUser = Depends(require_role("operator")), -): - source_ip = _source_ip(request) - - await log_action( - "winbox_tunnel_close", current_user.id, tenant_id, - device_id=device_id, ip_address=source_ip, - ) - - try: - payload = json.dumps({"tunnel_id": tunnel_id}) - await nats_request("tunnel.close", payload.encode(), timeout=10) - except Exception: - pass # Idempotent — tunnel may already be closed - - return {"status": "closed"} - - -@router.get("/sessions", response_model=ActiveSessionsResponse) -async def list_sessions( - tenant_id: str, - device_id: str, - current_user: CurrentUser = Depends(require_role("operator")), -): - try: - payload = json.dumps({"device_id": str(device_id)}) - resp = await nats_request("tunnel.status.list", payload.encode(), timeout=10) - tunnels = json.loads(resp.data) - except Exception: - tunnels = [] - - # SSH sessions would come from a similar NATS query - # For now, return empty until SSH relay exposes a NATS status endpoint - return ActiveSessionsResponse( - winbox_tunnels=tunnels if isinstance(tunnels, list) else [], - ssh_sessions=[], - ) -``` - -- [ ] **Step 3: Register router in main.py** - -Add to `backend/app/main.py` where other routers are registered: - -```python -from app.routers import remote_access -app.include_router(remote_access.router, prefix="/api") -``` - -- [ ] **Step 4: Run tests** - -```bash -cd backend && python -m pytest tests/test_remote_access.py -v -``` - -Note: Tests may need adjustment based on existing test fixtures. Follow the patterns in existing test files like `tests/test_config_editor.py`. 
- -- [ ] **Step 5: Commit** - -```bash -git add backend/app/routers/remote_access.py backend/app/schemas/remote_access.py backend/app/main.py backend/tests/test_remote_access.py -git commit -m "feat(api): add remote access endpoints for WinBox tunnels and SSH sessions" -``` - ---- - -## Chunk 4: Infrastructure Changes - -### Task 4.1: nginx WebSocket Configuration - -**Files:** -- Modify: `infrastructure/docker/nginx-spa.conf` - -- [ ] **Step 1: Add WebSocket upgrade map (before server block)** - -Add at the top of the file, before the `server {` block: - -```nginx -map $http_upgrade $connection_upgrade { - default upgrade; - '' close; -} -``` - -- [ ] **Step 2: Add WebSocket location (inside server block)** - -Add after the existing `/api/` location block: - -```nginx - # WebSocket proxy for SSH terminal - location /ws/ssh { - resolver 127.0.0.11 valid=10s ipv6=off; - set $poller_upstream http://poller:8080; - - proxy_pass $poller_upstream; - proxy_http_version 1.1; - - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header Host $host; - - proxy_read_timeout 1800s; - proxy_send_timeout 1800s; - - proxy_buffering off; - proxy_request_buffering off; - proxy_busy_buffers_size 512k; - proxy_buffers 8 512k; - } -``` - -- [ ] **Step 3: Update CSP header to allow WebSocket** - -In the existing CSP `add_header` directive, ensure `connect-src` includes `ws: wss:`. 
- -- [ ] **Step 4: Commit** - -```bash -git add infrastructure/docker/nginx-spa.conf -git commit -m "feat(infra): add nginx WebSocket proxy for SSH relay" -``` - -### Task 4.2: Docker Compose Changes - -**Files:** -- Modify: `docker-compose.override.yml` -- Modify: `docker-compose.prod.yml` -- Modify: `docker-compose.staging.yml` - -- [ ] **Step 1: Update docker-compose.override.yml** - -Add to the poller service: - -```yaml - ports: - - "127.0.0.1:49000-49100:49000-49100" - ulimits: - nofile: - soft: 8192 - hard: 8192 - environment: - # ... existing env vars ... - TUNNEL_PORT_MIN: 49000 - TUNNEL_PORT_MAX: 49100 - TUNNEL_IDLE_TIMEOUT: 300 - SSH_RELAY_PORT: 8080 - SSH_IDLE_TIMEOUT: 900 - SSH_MAX_SESSIONS: 200 - SSH_MAX_PER_USER: 10 - SSH_MAX_PER_DEVICE: 20 - healthcheck: - test: ["CMD-SHELL", "wget --spider -q http://localhost:8080/healthz || exit 1"] - interval: 30s - timeout: 3s - retries: 3 -``` - -- [ ] **Step 2: Update docker-compose.prod.yml** - -Same additions plus increased memory limit: - -```yaml - deploy: - resources: - limits: - memory: 512M # increased from 256M for tunnel/SSH overhead -``` - -- [ ] **Step 3: Update docker-compose.staging.yml** - -Same as prod. 
- -- [ ] **Step 4: Commit** - -```bash -git add docker-compose.override.yml docker-compose.prod.yml docker-compose.staging.yml -git commit -m "feat(infra): add tunnel port range and SSH relay config to compose files" -``` - ---- - -## Chunk 5: Frontend — Remote Access UI - -### Task 5.1: Install xterm.js - -**Files:** -- Modify: `frontend/package.json` - -- [ ] **Step 1: Install dependencies** - -```bash -cd frontend && npm install @xterm/xterm @xterm/addon-fit @xterm/addon-web-links -``` - -- [ ] **Step 2: Commit** - -```bash -git add frontend/package.json frontend/package-lock.json -git commit -m "chore(frontend): add xterm.js dependencies for SSH terminal" -``` - -### Task 5.2: API Client Extension - -**Files:** -- Modify: `frontend/src/lib/api.ts` - -- [ ] **Step 1: Add remote access API methods** - -Add to the existing API client file: - -```typescript -// Remote Access API -export const remoteAccessApi = { - openWinbox: (tenantId: string, deviceId: string) => - client.post<{ - tunnel_id: string - host: string - port: number - winbox_uri: string - idle_timeout_seconds: number - }>(`/tenants/${tenantId}/devices/${deviceId}/winbox-session`), - - closeWinbox: (tenantId: string, deviceId: string, tunnelId: string) => - client.delete(`/tenants/${tenantId}/devices/${deviceId}/winbox-session/${tunnelId}`), - - openSSH: (tenantId: string, deviceId: string, cols: number, rows: number) => - client.post<{ - token: string - websocket_url: string - idle_timeout_seconds: number - }>(`/tenants/${tenantId}/devices/${deviceId}/ssh-session`, { cols, rows }), - - getSessions: (tenantId: string, deviceId: string) => - client.get<{ - winbox_tunnels: Array<{ tunnel_id: string; local_port: number; idle_seconds: number; created_at: string }> - ssh_sessions: Array<{ session_id: string; idle_seconds: number; created_at: string }> - }>(`/tenants/${tenantId}/devices/${deviceId}/sessions`), -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add frontend/src/lib/api.ts -git commit -m 
"feat(frontend): add remote access API client methods" -``` - -### Task 5.3: WinBox Button Component - -**Files:** -- Create: `frontend/src/components/fleet/WinBoxButton.tsx` - -- [ ] **Step 1: Implement component** - -```tsx -// frontend/src/components/fleet/WinBoxButton.tsx -import { useState } from 'react' -import { useMutation } from '@tanstack/react-query' -import { Monitor, Copy, X, Loader2 } from 'lucide-react' -import { remoteAccessApi } from '@/lib/api' - -interface WinBoxButtonProps { - tenantId: string - deviceId: string -} - -type State = 'idle' | 'requesting' | 'ready' | 'closing' | 'error' - -export function WinBoxButton({ tenantId, deviceId }: WinBoxButtonProps) { - const [state, setState] = useState('idle') - const [tunnelInfo, setTunnelInfo] = useState<{ - tunnel_id: string - host: string - port: number - winbox_uri: string - } | null>(null) - const [error, setError] = useState(null) - const [copied, setCopied] = useState(false) - - const openMutation = useMutation({ - mutationFn: () => remoteAccessApi.openWinbox(tenantId, deviceId), - onSuccess: (resp) => { - const data = resp.data - setTunnelInfo(data) - setState('ready') - - // Attempt deep link on Windows only - if (navigator.userAgent.includes('Windows')) { - window.open(data.winbox_uri, '_blank') - } - }, - onError: (err: any) => { - setState('error') - setError(err.response?.data?.detail || 'Failed to open tunnel') - }, - }) - - const closeMutation = useMutation({ - mutationFn: () => { - if (!tunnelInfo) throw new Error('No tunnel') - return remoteAccessApi.closeWinbox(tenantId, deviceId, tunnelInfo.tunnel_id) - }, - onSuccess: () => { - setState('idle') - setTunnelInfo(null) - }, - }) - - const copyAddress = async () => { - if (!tunnelInfo) return - const addr = `${tunnelInfo.host}:${tunnelInfo.port}` - try { - await navigator.clipboard.writeText(addr) - } catch { - // Fallback for HTTP - const ta = document.createElement('textarea') - ta.value = addr - document.body.appendChild(ta) - 
ta.select() - document.execCommand('copy') - document.body.removeChild(ta) - } - setCopied(true) - setTimeout(() => setCopied(false), 2000) - } - - if (state === 'idle' || state === 'error') { - return ( -
-      <button
-        onClick={() => { setState('requesting'); openMutation.mutate() }}
-        disabled={openMutation.isPending}
-        className="inline-flex items-center gap-2"
-      >
-        {openMutation.isPending
-          ? <Loader2 className="h-4 w-4 animate-spin" />
-          : <Monitor className="h-4 w-4" />}
-        Open WinBox
-      </button>
-      {error && (
-        <div className="text-sm text-red-500">
-          {error}
-        </div>
-      )}
-    </div>
-  )
-  }
-
-  if (state === 'ready' && tunnelInfo) {
-    return (
-      <div className="space-y-2 rounded-md border p-3">
-        <div className="flex items-center gap-2 font-medium">
-          <Monitor className="h-4 w-4" />
-          WinBox tunnel ready
-        </div>
-        <div className="font-mono text-sm">
-          Connect to: {tunnelInfo.host}:{tunnelInfo.port}
-        </div>
-        <div className="flex gap-2">
-          <button onClick={copyAddress} className="inline-flex items-center gap-1">
-            <Copy className="h-4 w-4" />
-            {copied ? 'Copied!' : 'Copy address'}
-          </button>
-          <button
-            onClick={() => { setState('closing'); closeMutation.mutate() }}
-            className="inline-flex items-center gap-1"
-          >
-            <X className="h-4 w-4" />
-            Close tunnel
-          </button>
-        </div>
-        <div className="text-xs text-muted-foreground">
-          Tunnel closes after 5 min of inactivity
-        </div>
-      </div>
- ) - } - - return null -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add frontend/src/components/fleet/WinBoxButton.tsx -git commit -m "feat(frontend): add WinBox tunnel button component" -``` - -### Task 5.4: SSH Terminal Component - -**Files:** -- Create: `frontend/src/components/fleet/SSHTerminal.tsx` - -- [ ] **Step 1: Implement component** - -```tsx -// frontend/src/components/fleet/SSHTerminal.tsx -import { useCallback, useEffect, useRef, useState } from 'react' -import { useMutation } from '@tanstack/react-query' -import { Terminal as TerminalIcon, Maximize2, Minimize2, X } from 'lucide-react' -import { Terminal } from '@xterm/xterm' -import { FitAddon } from '@xterm/addon-fit' -import '@xterm/xterm/css/xterm.css' -import { remoteAccessApi } from '@/lib/api' - -interface SSHTerminalProps { - tenantId: string - deviceId: string - deviceName: string -} - -type State = 'closed' | 'connecting' | 'connected' | 'disconnected' - -export function SSHTerminal({ tenantId, deviceId, deviceName }: SSHTerminalProps) { - const [state, setState] = useState('closed') - const [expanded, setExpanded] = useState(false) - const termRef = useRef(null) - const terminalRef = useRef(null) - const fitAddonRef = useRef(null) - const wsRef = useRef(null) - const resizeTimerRef = useRef | null>(null) - - const openMutation = useMutation({ - mutationFn: () => { - const cols = terminalRef.current?.cols || 80 - const rows = terminalRef.current?.rows || 24 - return remoteAccessApi.openSSH(tenantId, deviceId, cols, rows) - }, - onSuccess: (resp) => { - const { token, websocket_url } = resp.data - const scheme = location.protocol === 'https:' ? 
'wss' : 'ws' - const url = `${scheme}://${location.host}${websocket_url}` - connectWebSocket(url) - }, - onError: () => { - terminalRef.current?.write('\r\n\x1b[31mFailed to create SSH session.\x1b[0m\r\n') - setState('disconnected') - }, - }) - - const connectWebSocket = useCallback((url: string) => { - const ws = new WebSocket(url) - ws.binaryType = 'arraybuffer' - wsRef.current = ws - - ws.onopen = () => { - setState('connected') - terminalRef.current?.write('Connecting to router...\r\n') - } - - ws.onmessage = (event) => { - if (event.data instanceof ArrayBuffer) { - terminalRef.current?.write(new Uint8Array(event.data)) - } - } - - ws.onclose = (event) => { - setState('disconnected') - const reason = event.code === 1006 ? 'Connection dropped' - : event.code === 1008 ? 'Authentication failed' - : event.code === 1011 ? 'Server error' - : 'Session closed' - terminalRef.current?.write(`\r\n\x1b[31m${reason}.\x1b[0m\r\n`) - } - - ws.onerror = () => { - terminalRef.current?.write('\r\n\x1b[31mConnection error.\x1b[0m\r\n') - } - }, []) - - const initTerminal = useCallback(() => { - if (!termRef.current || terminalRef.current) return - - const isDark = document.documentElement.classList.contains('dark') - const term = new Terminal({ - cursorBlink: true, - fontFamily: 'Geist Mono, monospace', - fontSize: 14, - scrollback: 2000, - convertEol: true, - theme: isDark - ? 
{ background: '#09090b', foreground: '#fafafa' } - : { background: '#ffffff', foreground: '#09090b' }, - }) - - const fitAddon = new FitAddon() - term.loadAddon(fitAddon) - term.open(termRef.current) - fitAddon.fit() - - terminalRef.current = term - fitAddonRef.current = fitAddon - - // User input → WebSocket - term.onData((data) => { - if (wsRef.current?.readyState === WebSocket.OPEN) { - const encoder = new TextEncoder() - wsRef.current.send(encoder.encode(data)) - } - }) - - // Resize → throttled WebSocket message - term.onResize(({ cols, rows }) => { - if (resizeTimerRef.current) clearTimeout(resizeTimerRef.current) - resizeTimerRef.current = setTimeout(() => { - if (wsRef.current?.readyState === WebSocket.OPEN) { - wsRef.current.send(JSON.stringify({ type: 'resize', cols, rows })) - } - }, 75) - }) - - // Refit on window resize - const observer = new ResizeObserver(() => fitAddon.fit()) - observer.observe(termRef.current) - - return () => { - observer.disconnect() - term.dispose() - terminalRef.current = null - } - }, []) - - // Cleanup on unmount - useEffect(() => { - return () => { - wsRef.current?.close() - terminalRef.current?.dispose() - } - }, []) - - const handleOpen = () => { - setState('connecting') - // Defer terminal init to next tick so ref is available - requestAnimationFrame(() => { - initTerminal() - openMutation.mutate() - }) - } - - const handleReconnect = () => { - terminalRef.current?.dispose() - terminalRef.current = null - wsRef.current?.close() - wsRef.current = null - setState('connecting') - requestAnimationFrame(() => { - initTerminal() - openMutation.mutate() - }) - } - - const handleDisconnect = () => { - wsRef.current?.close() - terminalRef.current?.dispose() - terminalRef.current = null - setState('closed') - } - - if (state === 'closed') { - return ( - - ) - } - - return ( -
-    <div className={expanded ? 'fixed inset-4 z-50 bg-background' : ''}>
-      <div className="flex items-center justify-between border-b p-2">
-        <span className="flex items-center gap-2">
-          <TerminalIcon className="h-4 w-4" />
-          SSH: {deviceName}
-        </span>
-        <div className="flex gap-2">
-          <button onClick={() => setExpanded(!expanded)}>
-            {expanded ? <Minimize2 className="h-4 w-4" /> : <Maximize2 className="h-4 w-4" />}
-          </button>
-          {state === 'disconnected' ? (
-            <button onClick={handleReconnect}>Reconnect</button>
-          ) : (
-            <button onClick={handleDisconnect}><X className="h-4 w-4" /></button>
-          )}
-        </div>
-      </div>
-      <div ref={termRef} className="h-96" />
-      {state === 'connected' && (
-        <div className="p-1 text-xs text-muted-foreground">
-          SSH session active — idle timeout: 15 min
-        </div>
-      )}
-    </div>
- ) -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add frontend/src/components/fleet/SSHTerminal.tsx -git commit -m "feat(frontend): add SSH terminal component with xterm.js" -``` - -### Task 5.5: Integrate into Device Page - -**Files:** -- Modify: The device detail page/route component (find via `frontend/src/routes/` — look for the device detail route) - -- [ ] **Step 1: Read the device detail page to find where to add buttons** - -Look for the route that renders individual device details. Add the WinBoxButton and SSHTerminal components in the device header area, conditionally rendered for `operator+` roles. - -```tsx -import { WinBoxButton } from '@/components/fleet/WinBoxButton' -import { SSHTerminal } from '@/components/fleet/SSHTerminal' - -// Inside the device header section, after existing device info: -{user.role !== 'viewer' && ( -
-    {device.device_type === 'routeros' && (
-      <WinBoxButton tenantId={tenantId} deviceId={deviceId} />
-    )}
-    <SSHTerminal tenantId={tenantId} deviceId={deviceId} deviceName={device.name} />
-  </div>
-)} -``` - -- [ ] **Step 2: Commit** - -```bash -git add frontend/src/ -git commit -m "feat(frontend): integrate WinBox and SSH buttons into device page" -``` - ---- - -## Chunk 6: Documentation Updates - -### Task 6.1: Update Documentation - -**Files:** -- Modify: `docs/ARCHITECTURE.md` -- Modify: `docs/DEPLOYMENT.md` -- Modify: `docs/SECURITY.md` -- Modify: `docs/CONFIGURATION.md` -- Modify: `README.md` - -- [ ] **Step 1: Update ARCHITECTURE.md** - -Add tunnel manager and SSH relay to the Go Poller section. Update the network topology diagram to show ports 49000-49100 and the SSH WebSocket path. Add SSH relay to the file structure section. - -- [ ] **Step 2: Update DEPLOYMENT.md** - -Add new environment variables table. Document tunnel port range requirement. Add Docker `userland-proxy: false` recommendation for production. - -- [ ] **Step 3: Update SECURITY.md** - -Add section on remote access session tokens, audit trail for WinBox/SSH sessions. - -- [ ] **Step 4: Update CONFIGURATION.md** - -Add all new environment variables with descriptions and defaults. - -- [ ] **Step 5: Update README.md** - -Add "Remote Access" to the Key Features list: -``` -- **Remote Access** -- WinBox TCP tunnels and browser-based SSH terminal for managing devices behind NAT. One-click connection through the WireGuard VPN overlay. -``` - -- [ ] **Step 6: Commit** - -```bash -git add docs/ README.md -git commit -m "docs: update documentation for v9.5 remote access feature" -``` - -### Task 6.2: Version Tag - -- [ ] **Step 1: Tag release** - -```bash -git tag -a v9.5.0 -m "feat: remote access - WinBox tunnels + SSH terminal" -``` - -Note: Do not push the tag until all testing is complete. - ---- - -## Execution Notes - -**Build order (critical):** -1. Chunks 1-2 (Go poller) — can be built together -2. Chunk 3 (Python API) — can be built in parallel with Chunks 1-2 -3. Chunk 4 (infrastructure) — can be built in parallel with Chunks 1-3 -4. 
Chunk 5 (frontend) — depends on Chunks 3 and 4 -5. Chunk 6 (docs) — last - -**Testing after all chunks complete:** -- Build all Docker images: `docker compose build api poller frontend` -- Start stack: `docker compose up -d` -- Verify poller healthcheck passes -- Test WinBox tunnel: open tunnel via API, connect with WinBox -- Test SSH terminal: open in browser, verify interactive shell -- Run full test suites: `cd poller && go test ./...` and `cd backend && pytest` diff --git a/docs/superpowers/specs/2026-03-12-remote-access-design.md b/docs/superpowers/specs/2026-03-12-remote-access-design.md deleted file mode 100644 index 3644fa3..0000000 --- a/docs/superpowers/specs/2026-03-12-remote-access-design.md +++ /dev/null @@ -1,841 +0,0 @@ -# Remote Access Design — WinBox Tunnels + SSH Terminal (v9.5) - -## Overview - -Add remote WinBox and SSH terminal access to TOD. Users connect to RouterOS devices behind NAT through the TOD controller without direct network access to the router. - -- **WinBox**: TCP tunnel through the poller container. User's native WinBox app connects to `127.0.0.1:`. -- **SSH Terminal**: Browser-based xterm.js terminal. WebSocket to poller, which bridges to SSH PTY on the router. - -### Device Type Scope - -- **WinBox tunnels**: RouterOS devices only (WinBox is MikroTik-specific, port 8291) -- **SSH terminal**: All device types that support SSH (RouterOS and future `linuxrtr` devices) -- The frontend should show/hide the "Open WinBox" button based on device type. The "SSH Terminal" button renders for all SSH-capable device types. 
- -## System Architecture - -``` - ┌─────────────────────────────────┐ - │ User's Machine │ - │ │ - │ Browser (TOD UI) │ - │ ├─ xterm.js SSH terminal │ - │ └─ "Open WinBox" button │ - │ │ - │ WinBox app │ - │ └─ connects 127.0.0.1:491xx │ - └──────────┬──────────┬───────────┘ - │ │ - WebSocket TCP (WinBox) - /ws/ssh/ 127.0.0.1:49000-49100 - │ │ -┌────────────────────────────────────┼──────────┼────────────────┐ -│ Docker Network: tod │ │ │ -│ │ │ │ -│ ┌──────────────┐ │ │ │ -│ │ nginx │──────────────────┘ │ │ -│ │ port 3000 │ (proxy /ws/ssh → poller) │ │ -│ │ │ (proxy /api → api) │ │ -│ └──────┬───────┘ │ │ -│ │ │ │ -│ ┌──────▼───────┐ NATS ┌───────────────▼──────────┐ │ -│ │ API │◄───────────►│ Poller │ │ -│ │ FastAPI │ │ Go │ │ -│ │ │ │ ├─ tunnel manager │ │ -│ │ - RBAC │ session │ │ (TCP proxy :49000+) │ │ -│ │ - audit log │ tokens │ ├─ SSH relay │ │ -│ │ - session │ (Redis) │ │ (WebSocket ↔ PTY) │ │ -│ │ tokens │ │ ├─ device poller │ │ -│ └──────────────┘ │ └─ cmd responder │ │ -│ └───────────────┬───────────┘ │ -│ │ │ -│ ┌───────────────▼───────────┐ │ -│ │ WireGuard │ │ -│ │ 10.10.0.1/24 │ │ -│ │ port 51820/udp │ │ -│ └───────────────┬───────────┘ │ -└───────────────────────────────────────────────┼────────────────┘ - │ - ┌─────────────────────┼──────────────┐ - │ │ │ - RouterOS RouterOS RouterOS - (direct IP) (VPN peer) (VPN peer) - :8291 :22 10.10.0.x 10.10.0.y - :8291 :22 :8291 :22 -``` - -**Key data paths:** - -- **WinBox**: Browser click → API (auth+audit) → NATS → Poller allocates port → Docker maps `127.0.0.1:491xx` → Poller TCP proxy → WireGuard → Router:8291 -- **SSH**: Browser click → API (auth+audit+token) → Browser opens WebSocket → nginx → Poller validates token → SSH+PTY → Router:22 -- **Auth boundary**: API handles all RBAC and audit logging. Poller validates single-use session tokens but never does primary auth. - -## RBAC - -Roles allowed for remote access: `operator`, `admin`, `super_admin`. - -`viewer` role receives 403 Forbidden. 
The API is the enforcement point; frontend hides buttons for viewers but does not rely on that for security. - -Every remote access operation produces an audit log entry: - -- `user_id`, `tenant_id`, `device_id`, `session_type`, `source_ip`, `timestamp` -- SSH sessions additionally log `start_time` and `end_time` - -## Poller: Tunnel Manager - -New package: `poller/internal/tunnel/` - -### Data Structures - -```go -type TunnelManager struct { - mu sync.Mutex - tunnels map[string]*Tunnel // keyed by tunnel ID (uuid) - portPool *PortPool // tracks available ports 49000-49100 - idleTime time.Duration // 5 minutes - deviceStore *store.DeviceStore // DB lookup for device connection details - credCache *vault.CredentialCache -} - -type Tunnel struct { - ID string - DeviceID string - TenantID string - UserID string - LocalPort int - RemoteAddr string // router IP:8291 - CreatedAt time.Time - LastActive int64 // atomic, unix nanoseconds - listener net.Listener - cancel context.CancelFunc - conns sync.WaitGroup - activeConns int64 // atomic counter -} -``` - -### LastActive Concurrency - -`LastActive` stored as `int64` (unix nanoseconds) using atomic operations: - -- Write: `atomic.StoreInt64(&t.LastActive, time.Now().UnixNano())` -- Read: `time.Since(time.Unix(0, atomic.LoadInt64(&t.LastActive)))` - -### Port Pool - -```go -type PortPool struct { - mu sync.Mutex - ports []bool // true = in use - base int // 49000 -} -``` - -- `Allocate()` returns next free port or error if exhausted -- `Release()` marks port as free -- Before allocation, attempt bind to verify port is actually free (handles stale Docker mappings after restart) -- All operations protected by mutex - -### Tunnel Lifecycle - -1. NATS message arrives on `tunnel.open` -2. Manager looks up device from database via `DeviceStore.GetDevice(deviceID)` to obtain encrypted credentials and connection details (same pattern as `CmdResponder`) -3. Decrypts device credentials via credential cache -4. 
Allocates port from pool (verify bind succeeds) -5. Starts TCP listener on `127.0.0.1:` (never `0.0.0.0`) -6. Returns allocated port via NATS reply -7. For each incoming TCP connection: - - `t.conns.Add(1)`, increment `activeConns` - - Dial `router_ip:8291` through WireGuard (10s timeout) - - If dial fails: close client connection, decrement counter, do not update LastActive - - Bidirectional proxy with context cancellation (see below) - - On exit: decrement `activeConns`, `t.conns.Done()` -8. Background goroutine checks every 30s: - - If idle > 5 minutes AND `activeConns == 0`: close tunnel -9. Never close a tunnel while WinBox has an active socket - -### TCP Proxy (per connection) - -```go -func (t *Tunnel) handleConn(tunnelCtx context.Context, clientConn net.Conn) { - defer t.conns.Done() - defer atomic.AddInt64(&t.activeConns, -1) - - routerConn, err := net.DialTimeout("tcp", t.RemoteAddr, 10*time.Second) - if err != nil { - clientConn.Close() - return - } - - ctx, cancel := context.WithCancel(tunnelCtx) // derived from tunnel context for shutdown propagation - defer cancel() // ensure context cleanup on all exit paths - - go func() { - io.Copy(routerConn, newActivityReader(clientConn, &t.LastActive)) - cancel() - }() - go func() { - io.Copy(clientConn, newActivityReader(routerConn, &t.LastActive)) - cancel() - }() - - <-ctx.Done() - clientConn.Close() - routerConn.Close() -} -``` - -`activityReader` wraps `io.Reader` and calls `atomic.StoreInt64` on every `Read()`. - -### Tunnel Shutdown Order - -```go -func (t *Tunnel) Close() { - t.listener.Close() // 1. stop accepting new connections - t.cancel() // 2. cancel context - t.conns.Wait() // 3. wait for active connections - // 4. release port (done by manager) - // 5. 
delete from manager map (done by manager) -} -``` - -### NATS Subjects - -- `tunnel.open` — Request: `{device_id, tenant_id, user_id, target_port}` → Reply: `{tunnel_id, local_port}` -- `tunnel.close` — Request: `{tunnel_id}` → Reply: `{ok}` -- `tunnel.status` — Request: `{tunnel_id}` → Reply: `{active, local_port, connected_clients, idle_seconds}` -- `tunnel.status.list` — Request: `{device_id}` → Reply: list of active tunnels - -### Logging - -Structured JSON logs for: tunnel creation, port allocation, client connection, client disconnect, idle timeout, tunnel close. Fields: `tunnel_id`, `device_id`, `tenant_id`, `local_port`, `remote_addr`. - -## Poller: SSH Relay - -New package: `poller/internal/sshrelay/` - -### Data Structures - -```go -type Server struct { - redis *redis.Client - credCache *vault.CredentialCache - deviceStore *store.DeviceStore - sessions map[string]*Session - mu sync.Mutex - idleTime time.Duration // 15 minutes - maxSessions int // 200 - maxPerUser int // 10 - maxPerDevice int // 20 -} - -type Session struct { - ID string // uuid - DeviceID string - TenantID string - UserID string - SourceIP string - StartTime time.Time - LastActive int64 // atomic, unix nanoseconds - sshClient *ssh.Client - sshSession *ssh.Session - ptyCols int - ptyRows int - cancel context.CancelFunc -} -``` - -### HTTP Server - -Runs on port 8080 inside the container (configurable via `SSH_RELAY_PORT`). Not exposed to host — only accessible through nginx on Docker network. - -Endpoints: - -- `/ws/ssh?token=` — WebSocket upgrade for SSH terminal -- `/healthz` — Health check (returns `{"status":"ok"}`) - -### Connection Flow - -1. Browser opens `ws://host/ws/ssh?token=` -2. nginx proxies to poller `:8080/ws/ssh` -3. Poller validates single-use token via Redis `GETDEL` -4. Token must contain: `device_id`, `tenant_id`, `user_id`, `source_ip`, `cols`, `rows`, `created_at` -5. Verify `tenant_id` matches device's tenant -6. 
Check session limits (200 total, 10 per user, 20 per device) — reject with close frame if exceeded -7. Upgrade to WebSocket with hardening: - - `SetReadLimit(1 << 20)` (1MB) - - Read deadline management - - Ping/pong keepalive - - Origin validation -8. Decrypt device credentials via credential cache -9. SSH dial to router (port 22, password auth, `InsecureIgnoreHostKey`) - - Log host key fingerprint on first connect - - If dial fails: close WebSocket with error message, clean up -10. Open SSH session, request PTY (`xterm-256color`, initial cols/rows from token) -11. Obtain stdin, stdout, stderr pipes -12. Start shell -13. Bridge WebSocket ↔ SSH PTY - -### WebSocket Message Protocol - -- **Binary frames**: Terminal data — forwarded directly to/from SSH PTY -- **Text frames**: JSON control messages - -``` -{"type": "resize", "cols": 120, "rows": 40} -{"type": "ping"} -``` - -Resize validation: `cols > 0 && cols <= 500 && rows > 0 && rows <= 200`. Reject invalid values. - -### Bridge Function - -```go -func bridge(ctx context.Context, cancel context.CancelFunc, - wsConn, sshSession, stdin, stdout, stderr, lastActive *int64) { - - // WebSocket → SSH stdin - go func() { - defer cancel() - for { - msgType, data, err := wsConn.Read(ctx) - if err != nil { return } - atomic.StoreInt64(lastActive, time.Now().UnixNano()) - - if msgType == websocket.TextMessage { - var ctrl ControlMsg - if json.Unmarshal(data, &ctrl) != nil { continue } - if ctrl.Type == "resize" { - // validate bounds - if ctrl.Cols > 0 && ctrl.Cols <= 500 && ctrl.Rows > 0 && ctrl.Rows <= 200 { - sshSession.WindowChange(ctrl.Rows, ctrl.Cols) - } - } - continue - } - stdin.Write(data) - } - }() - - // SSH stdout → WebSocket - go func() { - defer cancel() - buf := make([]byte, 4096) - for { - n, err := stdout.Read(buf) - if err != nil { return } - atomic.StoreInt64(lastActive, time.Now().UnixNano()) - wsConn.Write(ctx, websocket.BinaryMessage, buf[:n]) - } - }() - - // SSH stderr → WebSocket (merged into same 
stream) - go func() { - defer cancel() // stderr EOF also triggers cleanup - io.Copy(wsWriterAdapter(wsConn), stderr) - }() - - <-ctx.Done() -} -``` - -### Session Cleanup Order - -1. Cancel context (triggers bridge shutdown) -2. Close WebSocket -3. Close SSH session -4. Close SSH client -5. Remove session from server map (under mutex) -6. Publish audit event via NATS: `audit.session.end` with payload `{session_id, user_id, tenant_id, device_id, start_time, end_time, source_ip, reason}` - -### Audit End-Time Pipeline - -The API subscribes to the NATS subject `audit.session.end` (durable consumer, same pattern as existing NATS subscribers in `backend/app/services/nats_subscribers.py`). When a message arrives, the subscriber calls `log_action("ssh_session_end", ...)` with the session details including `end_time` and duration. This uses the existing self-committing audit service — no new persistence mechanism needed. - -### Idle Timeout - -Per-session goroutine, every 30s: - -``` -idle := time.Since(time.Unix(0, atomic.LoadInt64(&sess.LastActive))) -if idle > 15 minutes: - cancel() -``` - -### Source IP - -Extracted from `X-Real-IP` header (set by nginx from `$remote_addr`), fallback to `X-Forwarded-For` last entry before nginx, fallback to `r.RemoteAddr`. Using `X-Real-IP` as primary avoids client-spoofed `X-Forwarded-For` entries. - -### Logging - -Structured JSON logs for: session start, session end (with duration and reason: disconnect/idle/error). Fields: `session_id`, `device_id`, `tenant_id`, `user_id`, `source_ip`. - -## API: Remote Access Endpoints - -New router: `backend/app/routers/remote_access.py` - -### WinBox Tunnel - -``` -POST /api/tenants/{tenant_id}/devices/{device_id}/winbox-session - -RBAC: operator+ -``` - -Flow: - -1. Validate JWT, require `operator+` -2. Verify device exists, belongs to tenant, is active (not disabled/deleted) -3. Return 404 if not found, 403 if tenant mismatch (never leak cross-tenant existence) -4. 
Extract source IP from `X-Real-IP` header (preferred, set by nginx), fallback to `request.client.host` -5. Audit log: `log_action("winbox_tunnel_open", ...)` -6. NATS request to `tunnel.open` (10s timeout) -7. If timeout or error: return 503 -8. Validate returned port is in range 49000–49100 -9. Response: - -```json -{ - "tunnel_id": "uuid", - "host": "127.0.0.1", - "port": 49023, - "winbox_uri": "winbox://127.0.0.1:49023", - "idle_timeout_seconds": 300 -} -``` - -`host` is always hardcoded to `"127.0.0.1"` — never overridden by poller response. - -Rate limit: 10 requests/min per user. - -### SSH Session Token - -``` -POST /api/tenants/{tenant_id}/devices/{device_id}/ssh-session - -RBAC: operator+ - -Body: {"cols": 80, "rows": 24} -``` - -Flow: - -1. Validate JWT, require `operator+` -2. Verify device exists, belongs to tenant, is active -3. Check session limits (10 per user, 20 per device) — return 429 if exceeded -4. Audit log: `log_action("ssh_session_open", ...)` -5. Generate token: `secrets.token_urlsafe(32)` -6. Store in Redis with SETEX (atomic), 120s TTL. Key format: `ssh:token:` - -```json -{ - "device_id": "uuid", - "tenant_id": "uuid", - "user_id": "uuid", - "source_ip": "1.2.3.4", - "cols": 80, - "rows": 24, - "created_at": 1710288000 -} -``` - -7. Response: - -```json -{ - "token": "...", - "websocket_url": "/ws/ssh?token=", - "idle_timeout_seconds": 900 -} -``` - -Rate limit: 10 requests/min per user. - -Input validation: `cols` 1–500, `rows` 1–200. - -### Tunnel Close - -``` -DELETE /api/tenants/{tenant_id}/devices/{device_id}/winbox-session/{tunnel_id} - -RBAC: operator+ -``` - -Idempotent — returns 200 even if tunnel already closed. Audit log recorded. - -### Active Sessions - -``` -GET /api/tenants/{tenant_id}/devices/{device_id}/sessions - -RBAC: operator+ -``` - -NATS request to poller. If poller doesn't respond within 10s, return empty session lists (degrade gracefully). 
- -### Schemas - -```python -class WinboxSessionResponse(BaseModel): - tunnel_id: str - host: str = "127.0.0.1" - port: int - winbox_uri: str - idle_timeout_seconds: int = 300 - -class SSHSessionRequest(BaseModel): - cols: int = Field(default=80, gt=0, le=500) - rows: int = Field(default=24, gt=0, le=200) - -class SSHSessionResponse(BaseModel): - token: str - websocket_url: str - idle_timeout_seconds: int = 900 -``` - -### Error Responses - -- 403: insufficient role or tenant mismatch -- 404: device not found -- 429: session or rate limit exceeded -- 503: poller unavailable or port range exhausted - -## Frontend: Remote Access UI - -### Dependencies - -New: `@xterm/xterm` (v5+), `@xterm/addon-fit`, `@xterm/addon-web-links`. No other new dependencies. - -### Device Page - -Remote access buttons render in the device header for `operator+` roles: - -``` -┌──────────────────────────────────────────┐ -│ site-branch-01 Online ● │ -│ 10.10.0.5 RB4011 RouterOS 7.16 │ -│ │ -│ [ Open WinBox ] [ SSH Terminal ] │ -│ │ -└──────────────────────────────────────────┘ -``` - -### WinBox Button - -States: `idle`, `requesting`, `ready`, `closing`, `error`. - -On click: - -1. Mutation: `POST .../winbox-session` -2. On success, display: - -``` -WinBox tunnel ready - -Connect to: 127.0.0.1:49023 - -[ Copy Address ] [ Close Tunnel ] - -Tunnel closes after 5 min of inactivity -``` - -3. Attempt deep link on Windows only (detect via `navigator.userAgent`): `window.open("winbox://127.0.0.1:49023")` — must fire directly inside the click handler chain (no setTimeout) to avoid browser blocking. On macOS/Linux, skip the deep link attempt and rely on the copy-address fallback. -4. Copy button with clipboard fallback for HTTP environments (textarea + `execCommand("copy")`) -5. Navigating away does not close the tunnel — backend idle timeout handles cleanup -6. 
Close button disabled while DELETE request is in flight - -### SSH Terminal - -Two phases: - -**Phase 1 — Token acquisition:** - -``` -POST .../ssh-session { cols, rows } -→ { token, websocket_url } -``` - -**Phase 2 — Terminal session:** - -```typescript -const term = new Terminal({ - cursorBlink: true, - fontFamily: 'Geist Mono, monospace', - fontSize: 14, - scrollback: 2000, - convertEol: true, - theme: darkMode ? darkTheme : lightTheme -}) -const fitAddon = new FitAddon() -term.loadAddon(fitAddon) -term.open(containerRef) -// fit after font load -fitAddon.fit() -``` - -WebSocket scheme derived dynamically: `location.protocol === "https:" ? "wss" : "ws"` - -**Data flow:** - -- User keystroke → `term.onData` → `ws.send(binaryFrame)` → poller → SSH stdin -- Router output → SSH stdout → poller → `ws.onmessage` → `term.write(new Uint8Array(data))` -- Resize → `term.onResize` → throttled (75ms) → `ws.send(JSON.stringify({type:"resize", cols, rows}))` - -**WebSocket lifecycle:** - -- `onopen`: `term.write("Connecting to router...\r\n")` -- `onmessage`: binary → `term.write`, text → parse control -- `onclose`: display "Session closed." in red, disable input, show Reconnect button -- `onerror`: display "Connection error." in red -- Abnormal close codes (1006, 1008, 1011) display appropriate messages - -**Reconnect**: Always requests a new token. Never reuses WebSocket or token. 
- -**Cleanup on unmount:** - -```typescript -useEffect(() => { - return () => { - term?.dispose() - ws?.close() - } -}, []) -``` - -**Terminal UI:** - -``` -┌──────────────────────────────────────────────────┐ -│ SSH: site-branch-01 [ Disconnect ] │ -├──────────────────────────────────────────────────┤ -│ │ -│ [admin@site-branch-01] > │ -│ │ -└──────────────────────────────────────────────────┘ -SSH session active — idle timeout: 15 min -``` - -- Inline on device page by default, expandable to full viewport -- Auto-expand to full viewport on screens < 900px width -- Dark/light theme maps to existing Tailwind HSL tokens (no hardcoded hex) -- `tabindex=0` on terminal container for keyboard focus -- Active session indicator when sessions list returns data - -### API Client Extension - -```typescript -const remoteAccessApi = { - openWinbox: (tenantId: string, deviceId: string) => - client.post( - `/tenants/${tenantId}/devices/${deviceId}/winbox-session` - ), - closeWinbox: (tenantId: string, deviceId: string, tunnelId: string) => - client.delete( - `/tenants/${tenantId}/devices/${deviceId}/winbox-session/${tunnelId}` - ), - openSSH: (tenantId: string, deviceId: string, req: SSHSessionRequest) => - client.post( - `/tenants/${tenantId}/devices/${deviceId}/ssh-session`, req - ), - getSessions: (tenantId: string, deviceId: string) => - client.get( - `/tenants/${tenantId}/devices/${deviceId}/sessions` - ), -} -``` - -## Infrastructure - -### nginx — WebSocket Proxy - -Add to `infrastructure/docker/nginx-spa.conf`: - -```nginx -# WebSocket upgrade mapping (top-level, outside server block) -map $http_upgrade $connection_upgrade { - default upgrade; - '' close; -} - -# Inside server block: -location /ws/ssh { - resolver 127.0.0.11 valid=10s ipv6=off; - set $poller_upstream http://poller:8080; - - proxy_pass $poller_upstream; - proxy_http_version 1.1; - - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection $connection_upgrade; - proxy_set_header 
X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header Host $host; - - proxy_read_timeout 1800s; - proxy_send_timeout 1800s; - - proxy_buffering off; - proxy_request_buffering off; - proxy_busy_buffers_size 512k; - proxy_buffers 8 512k; - -} -``` - -**CSP**: The existing `connect-src 'self'` should be sufficient for same-origin WebSocket connections in modern browsers (CSP `self` matches same-origin `ws://` and `wss://`). For maximum compatibility across all environments, explicitly add `ws: wss:` to the `connect-src` directive. HTTPS-only deployments can restrict to just `wss:`. - -### Docker Compose - -**Poller service additions — apply to these specific files:** - -- `docker-compose.override.yml` (dev): ports, environment, ulimits, healthcheck -- `docker-compose.prod.yml` (production): ports, environment, ulimits, healthcheck, increased memory limit -- `docker-compose.staging.yml` (staging): same as prod - -```yaml -poller: - ports: - - "127.0.0.1:49000-49100:49000-49100" - ulimits: - nofile: - soft: 8192 - hard: 8192 - environment: - TUNNEL_PORT_MIN: 49000 - TUNNEL_PORT_MAX: 49100 - TUNNEL_IDLE_TIMEOUT: 300 - SSH_RELAY_PORT: 8080 - SSH_IDLE_TIMEOUT: 900 - SSH_MAX_SESSIONS: 200 - SSH_MAX_PER_USER: 10 - SSH_MAX_PER_DEVICE: 20 - healthcheck: - test: ["CMD-SHELL", "wget --spider -q http://localhost:8080/healthz || exit 1"] - interval: 30s - timeout: 3s - retries: 3 -``` - -**Production memory limit**: Increase poller from 256MB to 384–512MB. - -**Redis dependency**: Ensure `depends_on: redis: condition: service_started`. - -**Docker proxy note**: The 101-port range mapping creates individual `docker-proxy` processes. For production, set `"userland-proxy": false` in `/etc/docker/daemon.json` to use iptables-based forwarding instead, which avoids spawning 101 proxy processes and improves startup time. 
- -### Poller HTTP Server - -```go -httpServer := &http.Server{ - Addr: ":" + cfg.SSHRelayPort, - Handler: sshrelay.NewServer(redisClient, credCache).Handler(), -} -go httpServer.ListenAndServe() -// Graceful shutdown with 5s timeout -httpServer.Shutdown(ctx) -``` - -### New Environment Variables - -| Variable | Default | Description | -|----------|---------|-------------| -| `TUNNEL_PORT_MIN` | `49000` | Start of WinBox tunnel port range | -| `TUNNEL_PORT_MAX` | `49100` | End of WinBox tunnel port range | -| `TUNNEL_IDLE_TIMEOUT` | `300` | WinBox tunnel idle timeout (seconds) | -| `SSH_RELAY_PORT` | `8080` | Internal HTTP/WebSocket port for SSH relay | -| `SSH_IDLE_TIMEOUT` | `900` | SSH session idle timeout (seconds) | -| `SSH_MAX_SESSIONS` | `200` | Max concurrent SSH sessions per poller | -| `SSH_MAX_PER_USER` | `10` | Max concurrent SSH sessions per user | -| `SSH_MAX_PER_DEVICE` | `20` | Max concurrent SSH sessions per device | - -### Graceful Shutdown - -When poller container shuts down: - -1. Stop accepting new tunnels and SSH sessions -2. Close HTTP/WebSocket server (5s timeout) -3. Gracefully terminate SSH sessions -4. Close all tunnel listeners -5. Wait for active connections -6. 
Release tunnel ports - -## Testing Strategy - -### Unit Tests - -**Poller (Go):** - -- Port pool: allocation, release, reuse after close, concurrent access, exhaustion, bind failure retry -- Tunnel manager: lifecycle, idle detection with zero active connections, multiple concurrent connections on same tunnel, cleanup when listener creation fails -- TCP proxy: activity tracking (atomic), bidirectional shutdown, dial failure cleanup -- SSH relay: token validation (valid/expired/reused/wrong tenant), session limits, resize parsing and validation, malformed control messages, invalid JSON frames, binary frame size limits, resize flood protection, cleanup on SSH dial failure, cleanup on abrupt WebSocket close - -**Backend (Python):** - -- RBAC: viewer gets 403, operator gets 200 -- Device validation: wrong tenant gets 404, disabled device rejected -- Token generation: stored in Redis with correct TTL -- Rate limiting: 11th request gets 429 -- Session limits: exceed per-user/per-device limits gets 429 -- Source IP extraction from X-Forwarded-For -- NATS timeout returns 503 -- Redis unavailable during token storage -- Malformed request payloads rejected - -### Integration Tests - -- **Tunnel end-to-end**: API → NATS → poller allocates port → verify listening on 127.0.0.1 → TCP connect → data forwarded to mock router -- **SSH end-to-end**: API issues token → WebSocket → poller validates → SSH to mock SSHD → verify keystroke round-trip and resize -- **Token lifecycle**: consumed on first use, second use rejected, expired token rejected -- **Idle timeout**: open tunnel, no traffic, verify closes after 5min; open SSH, no activity, verify closes after 15min -- **Concurrent sessions**: 10 SSH from same user succeeds, 11th rejected -- **Tunnel stress**: 50 concurrent tunnels, verify unique ports, verify cleanup -- **SSH stress**: many simultaneous WebSocket sessions, verify limits and stability -- **Router unreachable**: SSH dial fails, WebSocket closes with error, no zombie 
session -- **Poller restart**: sessions terminate, frontend shows disconnect, reconnect works -- **Backward compatibility**: existing polling, config push, NATS subjects unchanged - -### Security Tests - -- Token replay: reuse consumed token → rejected -- Cross-tenant: user from tenant A accesses device from tenant B → rejected -- Malformed token: invalid base64, wrong length → rejected without panic - -### Resource Leak Detection - -During integration testing, monitor: open file descriptors, goroutine count, memory usage. Verify SSH sessions and tunnels release all resources after closure. - -### Manual Testing - -- WinBox tunnel to router behind WireGuard — full WinBox functionality -- SSH terminal — tab completion, arrow keys, command history, line wrapping after resize -- Deep link `winbox://` on Windows — auto-launch -- Copy address fallback on macOS/Linux -- Navigate away with open tunnel — stays open, closes on idle -- Poller restart — frontend handles disconnect, reconnect works -- Multiple SSH terminals to different devices simultaneously -- Dark/light mode terminal theme -- Chrome, Firefox, Safari — WebSocket stability, clipboard, deep link, resize - -### Observability Verification - -Verify structured JSON logs exist with correct fields for: tunnel created/closed, port allocated, SSH session started/ended (with duration and reason), idle timeout events. - -## Rollout Sequence - -1. Deploy poller changes to staging (tunnel manager, SSH relay, HTTP server, NATS subjects) -2. Deploy infrastructure changes (docker-compose ports, nginx WebSocket config, CSP, ulimits) -3. Validate tunnels and SSH relay in staging -4. Deploy API endpoints (remote access router, session tokens, audit logging, rate limiting) -5. Deploy frontend (WinBox button, SSH terminal, API client) -6. Update documentation (ARCHITECTURE, DEPLOYMENT, SECURITY, CONFIGURATION, README) -7. 
Tag as v9.5 with release notes covering: WinBox remote access, browser SSH terminal, new env vars, port range requirement - -Never deploy frontend before backend endpoints exist. - -## Out of Scope - -- WinBox protocol reimplementation in browser -- SSH key authentication (password only, matching existing credential model) -- Session recording/playback -- File transfer through SSH terminal -- Multi-user shared terminal sessions diff --git a/docs/website/docs.html b/docs/website/docs.html index fd1d31c..89ce867 100644 --- a/docs/website/docs.html +++ b/docs/website/docs.html @@ -1415,7 +1415,7 @@ open http://localhost SMTP_USER(none)SMTP authentication username SMTP_PASSWORD(none)SMTP authentication password SMTP_USE_TLSfalseEnable STARTTLS for SMTP connections - SMTP_FROM_ADDRESSnoreply@mikrotik-portal.localSender address for outbound emails + SMTP_FROM_ADDRESSnoreply@the-other-dude.localSender address for outbound emails diff --git a/frontend/src/components/fleet/RemoteWinBoxButton.tsx b/frontend/src/components/fleet/RemoteWinBoxButton.tsx new file mode 100644 index 0000000..dcc3c81 --- /dev/null +++ b/frontend/src/components/fleet/RemoteWinBoxButton.tsx @@ -0,0 +1,295 @@ +import { useState, useEffect, useCallback, useRef } from 'react' +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query' +import { Globe, X, Loader2, RefreshCw, Maximize2, Minimize2 } from 'lucide-react' +import { remoteWinboxApi, type RemoteWinBoxSession } from '@/lib/api' + +interface RemoteWinBoxButtonProps { + tenantId: string + deviceId: string +} + +type State = 'idle' | 'requesting' | 'connecting' | 'active' | 'closing' | 'terminated' | 'failed' + +export function RemoteWinBoxButton({ tenantId, deviceId }: RemoteWinBoxButtonProps) { + const [state, setState] = useState('idle') + const [session, setSession] = useState(null) + const [error, setError] = useState(null) + const [expanded, setExpanded] = useState(false) + const [countdown, setCountdown] = useState(null) + 
const pollRef = useRef | null>(null) + const queryClient = useQueryClient() + + // Check for existing active sessions on mount + const { data: existingSessions } = useQuery({ + queryKey: ['remote-winbox-sessions', tenantId, deviceId], + queryFn: () => remoteWinboxApi.list(tenantId, deviceId), + refetchOnWindowFocus: false, + }) + + useEffect(() => { + if (existingSessions && state === 'idle') { + const active = existingSessions.find( + (s) => s.status === 'active' || s.status === 'creating', + ) + if (active) { + setSession(active) + setState(active.status === 'active' ? 'active' : 'connecting') + } + } + }, [existingSessions, state]) + + // Poll session status while connecting + useEffect(() => { + if (state !== 'connecting' || !session) return + + const poll = setInterval(async () => { + try { + const updated = await remoteWinboxApi.get(tenantId, deviceId, session.session_id) + setSession(updated) + if (updated.status === 'active') { + setState('active') + } else if (updated.status === 'failed') { + setState('failed') + setError('Session failed to provision') + } else if (updated.status === 'terminated') { + setState('terminated') + } + } catch { + // ignore transient polling errors + } + }, 2000) + + pollRef.current = poll + return () => clearInterval(poll) + }, [state, session, tenantId, deviceId]) + + // Countdown timer for session expiry + useEffect(() => { + if (state !== 'active' || !session?.expires_at) { + setCountdown(null) + return + } + + const tick = () => { + const remaining = Math.max(0, new Date(session.expires_at).getTime() - Date.now()) + if (remaining <= 0) { + setCountdown('Expired') + setState('terminated') + return + } + const mins = Math.floor(remaining / 60000) + const secs = Math.floor((remaining % 60000) / 1000) + setCountdown(`${mins}:${secs.toString().padStart(2, '0')}`) + } + tick() + const interval = setInterval(tick, 1000) + return () => clearInterval(interval) + }, [state, session?.expires_at]) + + const createMutation = 
useMutation({ + mutationFn: () => remoteWinboxApi.create(tenantId, deviceId), + onSuccess: (data) => { + setSession(data) + if (data.status === 'active') { + setState('active') + } else { + setState('connecting') + } + }, + onError: (err: any) => { + setState('failed') + setError(err.response?.data?.detail || 'Failed to create session') + }, + }) + + const closeMutation = useMutation({ + mutationFn: () => { + if (!session) throw new Error('No session') + return remoteWinboxApi.delete(tenantId, deviceId, session.session_id) + }, + onSuccess: () => { + setState('idle') + setSession(null) + setError(null) + queryClient.invalidateQueries({ queryKey: ['remote-winbox-sessions', tenantId, deviceId] }) + }, + onError: (err: any) => { + setState('failed') + setError(err.response?.data?.detail || 'Failed to close session') + }, + }) + + const handleOpen = useCallback(() => { + setState('requesting') + setError(null) + createMutation.mutate() + }, [createMutation]) + + const handleClose = useCallback(() => { + setState('closing') + closeMutation.mutate() + }, [closeMutation]) + + const handleRetry = useCallback(() => { + setSession(null) + setError(null) + handleOpen() + }, [handleOpen]) + + const handleReset = useCallback(async () => { + try { + const sessions = await remoteWinboxApi.list(tenantId, deviceId) + for (const s of sessions) { + if (s.status === 'active' || s.status === 'creating' || s.status === 'grace') { + await remoteWinboxApi.delete(tenantId, deviceId, s.session_id) + } + } + } catch { + // ignore cleanup errors + } + setState('idle') + setSession(null) + setError(null) + queryClient.invalidateQueries({ queryKey: ['remote-winbox-sessions', tenantId, deviceId] }) + }, [tenantId, deviceId, queryClient]) + + // Build iframe URL: load Xpra HTML5 client directly via nginx /xpra/{port}/ proxy + // path= tells the Xpra HTML5 client where to open the WebSocket connection + const iframeSrc = session?.session_id && session?.xpra_ws_port + ? 
`/xpra/${session.xpra_ws_port}/index.html?path=/xpra/${session.xpra_ws_port}/&keyboard=false&floating_menu=false&sharing=false&clipboard=false` + : null + + // Idle / Failed / Terminated states — show button + if (state === 'idle' || state === 'failed' || state === 'terminated') { + return ( +
+
+ + +
+ {state === 'failed' && error && ( +
+

{error}

+
+ )} + {state === 'terminated' && ( +

Session ended

+ )} +
+ ) + } + + // Requesting / Connecting — spinner + if (state === 'requesting' || state === 'connecting') { + return ( +
+
+ +

+ {state === 'requesting' ? 'Requesting session...' : 'Provisioning WinBox container...'} +

+
+

This may take a few seconds

+
+ ) + } + + // Closing + if (state === 'closing') { + return ( +
+
+ +

Closing session...

+
+
+ ) + } + + // Active — show iframe + if (state === 'active' && iframeSrc) { + return ( +
+ {/* Header bar */} +
+
+ + Remote WinBox + {countdown && ( + + Expires in {countdown} + + )} +
+
+ + +
+
+ {/* Xpra iframe */} +