feat: implement Remote WinBox worker, API, frontend integration, OpenBao persistence, and supporting docs

This commit is contained in:
Jason Staack
2026-03-14 09:05:14 -05:00
parent 7af08276ea
commit 970501e453
86 changed files with 3440 additions and 3764 deletions

View File

@@ -6,7 +6,7 @@
# docker compose --profile full build api && docker compose --profile full build poller && docker compose --profile full build frontend
# docker compose --profile full up -d
# open http://localhost:3000
# Login: admin@mikrotik-portal.dev / changeme-in-production
# Login: admin@the-other-dude.dev / changeme-in-production
# Environment (dev | staging | production)
ENVIRONMENT=dev
@@ -37,8 +37,13 @@ NATS_URL=nats://nats:4222
JWT_SECRET_KEY=dev-jwt-secret-do-not-use-in-production-replace-me
CREDENTIAL_ENCRYPTION_KEY=LLLjnfBZTSycvL2U07HDSxUeTtLxb9cZzryQl0R9E4w=
# OpenBao unseal key (generated on first run - see init.sh output)
BAO_UNSEAL_KEY=
# OpenBao root token (generated on first run - replaces dev-openbao-token)
# OPENBAO_TOKEN=
# First admin bootstrap (dev only)
FIRST_ADMIN_EMAIL=admin@mikrotik-portal.dev
FIRST_ADMIN_EMAIL=admin@the-other-dude.dev
FIRST_ADMIN_PASSWORD=changeme-in-production
# CORS (comma-separated origins)

View File

@@ -30,7 +30,7 @@ JWT_SECRET_KEY=CHANGE_ME_STAGING
CREDENTIAL_ENCRYPTION_KEY=CHANGE_ME_STAGING
# First admin bootstrap
FIRST_ADMIN_EMAIL=admin@mikrotik-portal.staging
FIRST_ADMIN_EMAIL=admin@the-other-dude.staging
FIRST_ADMIN_PASSWORD=CHANGE_ME_STAGING
# CORS (staging URL)

4
.gitignore vendored
View File

@@ -38,3 +38,7 @@ Thumbs.db
# Playwright MCP logs
.playwright-mcp/
# Local-only planning and design docs
.planning/
docs/superpowers/

View File

@@ -22,7 +22,7 @@ KNOWN_INSECURE_DEFAULTS: dict[str, list[str]] = {
],
"OPENBAO_TOKEN": [
"dev-openbao-token",
"CHANGE_ME_IN_PRODUCTION",
"",
],
}
@@ -43,7 +43,8 @@ def validate_production_settings(settings: "Settings") -> None:
f"FATAL: {field} uses a known insecure default in '{settings.ENVIRONMENT}' environment.\n"
f"Generate a secure value and set it in your .env.prod file.\n"
f"For JWT_SECRET_KEY: python -c \"import secrets; print(secrets.token_urlsafe(64))\"\n"
f"For CREDENTIAL_ENCRYPTION_KEY: python -c \"import secrets, base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\"",
f"For CREDENTIAL_ENCRYPTION_KEY: python -c \"import secrets, base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\"\n"
f"For OPENBAO_TOKEN: use the token from your OpenBao server (not the dev token)",
file=sys.stderr,
)
sys.exit(1)
@@ -92,7 +93,7 @@ class Settings(BaseSettings):
# OpenBao Transit (KMS for per-tenant credential encryption)
OPENBAO_ADDR: str = "http://localhost:8200"
OPENBAO_TOKEN: str = "dev-openbao-token"
OPENBAO_TOKEN: str = ""
# First admin bootstrap
FIRST_ADMIN_EMAIL: Optional[str] = None
@@ -119,7 +120,7 @@ class Settings(BaseSettings):
SMTP_USER: Optional[str] = None
SMTP_PASSWORD: Optional[str] = None
SMTP_USE_TLS: bool = False
SMTP_FROM_ADDRESS: str = "noreply@mikrotik-portal.local"
SMTP_FROM_ADDRESS: str = "noreply@the-other-dude.local"
# Password reset
PASSWORD_RESET_TOKEN_EXPIRE_MINUTES: int = 30

View File

@@ -1,7 +1,8 @@
"""FastAPI application entry point."""
import asyncio
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from typing import AsyncGenerator, Optional
import structlog
from fastapi import FastAPI
@@ -232,11 +233,80 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
except Exception as exc:
logger.warning("retention scheduler could not start (API will run without it)", error=str(exc))
# Start Remote WinBox session reconciliation loop (60s interval).
# Detects orphaned sessions (worker lost them) and cleans up Redis + tunnels.
winbox_reconcile_task: Optional[asyncio.Task] = None # type: ignore[type-arg]
try:
from app.routers.winbox_remote import _get_redis as _wb_get_redis, _close_tunnel
from app.services.winbox_remote import get_session as _wb_worker_get, health_check as _wb_health
async def _winbox_reconcile_loop() -> None:
    """Scan Redis for winbox-remote:* keys and reconcile with worker.

    Runs forever (until cancelled) with a 60s sleep between passes.
    For each session record in a live state, asks the worker whether it
    still knows the session; if not, the tunnel is closed and the Redis
    key deleted. Undecodable or id-less records are deleted outright.

    NOTE(review): logger here is used %-style while the surrounding
    startup code passes structlog keyword args — confirm the configured
    logger accepts positional format args.
    """
    # Local alias avoids any dependence on module-level import ordering.
    import json as _json
    while True:
        try:
            await asyncio.sleep(60)
            rd = await _wb_get_redis()
            cursor = "0"
            # Cursor-based SCAN over every session key, 100 at a time.
            while True:
                cursor, keys = await rd.scan(
                    cursor=cursor, match="winbox-remote:*", count=100
                )
                for key in keys:
                    raw = await rd.get(key)
                    if raw is None:
                        # Key expired between SCAN and GET — nothing to do.
                        continue
                    try:
                        sess = _json.loads(raw)
                    except Exception:
                        # Corrupt record: drop it rather than retry forever.
                        await rd.delete(key)
                        continue
                    sess_status = sess.get("status")
                    if sess_status not in ("creating", "active", "grace"):
                        # Terminal states need no reconciliation.
                        continue
                    session_id = sess.get("session_id")
                    if not session_id:
                        # Record without an id cannot be reconciled — drop it.
                        await rd.delete(key)
                        continue
                    # Health-check against worker
                    worker_info = await _wb_worker_get(session_id)
                    if worker_info is None:
                        # Worker lost the session — clean up
                        logger.warning(
                            "reconcile: worker lost session %s, cleaning up",
                            session_id,
                        )
                        tunnel_id = sess.get("tunnel_id")
                        if tunnel_id:
                            await _close_tunnel(tunnel_id)
                        await rd.delete(key)
                if cursor == "0" or cursor == 0:
                    # redis-py may return the terminal cursor as str or int.
                    break
        except asyncio.CancelledError:
            # Graceful shutdown: exit the loop instead of re-raising.
            break
        except Exception as exc:
            # Never let one bad pass kill the reconciler.
            logger.warning("winbox reconcile loop error: %s", exc)
winbox_reconcile_task = asyncio.create_task(_winbox_reconcile_loop())
except Exception as exc:
logger.warning("winbox reconcile loop could not start (non-fatal)", error=str(exc))
logger.info("startup complete, ready to serve requests")
yield
# Shutdown
logger.info("shutting down TOD API")
if winbox_reconcile_task and not winbox_reconcile_task.done():
winbox_reconcile_task.cancel()
try:
await winbox_reconcile_task
except asyncio.CancelledError:
pass
await stop_backup_scheduler()
await stop_nats_subscriber(nats_connection)
await stop_metrics_subscriber(metrics_nc)
@@ -311,6 +381,7 @@ def create_app() -> FastAPI:
from app.routers.transparency import router as transparency_router
from app.routers.settings import router as settings_router
from app.routers.remote_access import router as remote_access_router
from app.routers.winbox_remote import router as winbox_remote_router
app.include_router(auth_router, prefix="/api")
app.include_router(tenants_router, prefix="/api")
@@ -339,6 +410,7 @@ def create_app() -> FastAPI:
app.include_router(transparency_router, prefix="/api")
app.include_router(settings_router, prefix="/api")
app.include_router(remote_access_router, prefix="/api")
app.include_router(winbox_remote_router, prefix="/api")
# Health check endpoints
@app.get("/health", tags=["health"])

View File

@@ -164,6 +164,67 @@ async def get_current_user(
)
async def get_current_user_ws(
    websocket: "WebSocket",
) -> CurrentUser:
    """
    WebSocket authentication helper.

    Extracts JWT from the ``access_token`` cookie or ``token`` query parameter,
    decodes it, and returns a :class:`CurrentUser`. Unlike :func:`get_current_user`
    this does **not** touch the database (no RLS tenant context) because WebSocket
    handlers typically manage their own DB sessions.

    Raises:
        WebSocketException 1008: If no token is provided or the token is invalid.
    """
    # Fix: the original imported WebSocket and WebSocketState from starlette
    # here, but neither name is used in the body (the parameter annotation is
    # a string forward reference resolved at module scope), so the dead import
    # has been removed.
    from fastapi import WebSocketException

    # 1. Try cookie
    token: Optional[str] = websocket.cookies.get("access_token")
    # 2. Fall back to query param
    if not token:
        token = websocket.query_params.get("token")
    if not token:
        raise WebSocketException(code=1008, reason="Not authenticated")

    try:
        payload = verify_token(token, expected_type="access")
    except HTTPException:
        # verify_token signals failure with HTTPException; translate it into
        # the WebSocket policy-violation close code.
        raise WebSocketException(code=1008, reason="Invalid or expired token")

    user_id_str = payload.get("sub")
    tenant_id_str = payload.get("tenant_id")
    role = payload.get("role")
    if not user_id_str or not role:
        raise WebSocketException(code=1008, reason="Invalid token payload")
    try:
        user_id = uuid.UUID(user_id_str)
    except ValueError:
        raise WebSocketException(code=1008, reason="Invalid token payload")

    tenant_id: Optional[uuid.UUID] = None
    if tenant_id_str:
        try:
            tenant_id = uuid.UUID(tenant_id_str)
        except ValueError:
            # Malformed tenant claim is treated as "no tenant"; non-admin
            # roles are then rejected by the check below.
            pass
    if role != "super_admin" and tenant_id is None:
        raise WebSocketException(code=1008, reason="Invalid token: no tenant context")

    return CurrentUser(
        user_id=user_id,
        tenant_id=tenant_id,
        role=role,
    )
async def get_optional_current_user(
request: Request,
credentials: Annotated[Optional[HTTPAuthorizationCredentials], Depends(bearer_scheme)] = None,

View File

@@ -817,7 +817,7 @@ async def get_emergency_kit_template(
io.BytesIO(pdf_bytes),
media_type="application/pdf",
headers={
"Content-Disposition": 'attachment; filename="MikroTik-Portal-Emergency-Kit.pdf"',
"Content-Disposition": 'attachment; filename="The-Other-Dude-Emergency-Kit.pdf"',
},
)

View File

@@ -29,6 +29,7 @@ from app.schemas.remote_access import (
TunnelStatusItem,
WinboxSessionResponse,
)
from app.schemas.winbox_remote import RemoteWinboxSessionItem
from app.middleware.rate_limit import limiter
from app.services.audit_service import log_action
from sqlalchemy import select
@@ -329,4 +330,26 @@ async def list_sessions(
logger.warning("tunnel.status.list NATS request failed: %s", exc)
# Return empty list rather than error — poller may be unavailable
return ActiveSessionsResponse(winbox_tunnels=tunnels, ssh_sessions=[])
# Query Redis for remote winbox (browser) sessions for this device
remote_winbox: list[RemoteWinboxSessionItem] = []
try:
rd = await _get_redis()
pattern = f"winbox-remote:{device_id}:*"
cursor, keys = await rd.scan(0, match=pattern, count=100)
while keys or cursor:
for key in keys:
raw = await rd.get(key)
if raw:
data = json.loads(raw)
remote_winbox.append(RemoteWinboxSessionItem(**data))
if not cursor:
break
cursor, keys = await rd.scan(cursor, match=pattern, count=100)
except Exception as exc:
logger.warning("Redis winbox-remote scan failed: %s", exc)
return ActiveSessionsResponse(
winbox_tunnels=tunnels,
ssh_sessions=[],
remote_winbox_sessions=remote_winbox,
)

View File

@@ -7,6 +7,7 @@ Transit encryption for passwords. Falls back to .env values.
import logging
from typing import Optional
import redis.asyncio as aioredis
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy import text
@@ -153,3 +154,20 @@ async def test_smtp(
return await send_test_email(data.to, config)
return conn_result
@router.delete("/winbox-sessions")
async def clear_winbox_sessions(user=Depends(require_role("super_admin"))):
    """Clear all WinBox remote session and rate-limit keys from Redis.

    Super-admin only. Opens a dedicated Redis connection, deletes every key
    matching either prefix, and reports how many keys were removed.
    """
    rd = aioredis.from_url(settings.REDIS_URL, decode_responses=True)
    try:
        removed = 0
        for pattern in ("winbox-remote:*", "winbox-remote-rate:*"):
            # Collect matches via non-blocking SCAN, then delete in one call.
            matched = [key async for key in rd.scan_iter(match=pattern)]
            if matched:
                removed += await rd.delete(*matched)
        return {"status": "ok", "deleted": removed}
    finally:
        # Always release the ad-hoc connection, even on error.
        await rd.aclose()

View File

@@ -0,0 +1,781 @@
"""
Remote WinBox (Browser) endpoints — Xpra-based in-browser WinBox sessions.
All routes are tenant-scoped under /api/tenants/{tenant_id}/devices/{device_id}.
RBAC: operator+ required for all endpoints.
"""
import asyncio
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Optional
import httpx
import nats
import redis.asyncio as aioredis
from fastapi import (
APIRouter,
Depends,
HTTPException,
Request,
WebSocket,
WebSocketDisconnect,
status,
)
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import get_db
from app.middleware.rbac import require_operator_or_above
from app.middleware.rate_limit import limiter
from app.middleware.tenant_context import CurrentUser, get_current_user
from app.models.device import Device
from app.schemas.winbox_remote import (
RemoteWinboxCreateRequest,
RemoteWinboxSessionResponse,
RemoteWinboxState,
RemoteWinboxStatusResponse,
RemoteWinboxTerminateResponse,
)
from app.services.audit_service import log_action
from app.services.winbox_remote import (
WorkerCapacityError,
WorkerLaunchError,
create_session as worker_create_session,
get_session as worker_get_session,
terminate_session as worker_terminate_session,
)
logger = logging.getLogger(__name__)
router = APIRouter(tags=["winbox-remote"])
REDIS_PREFIX = "winbox-remote:"
RATE_PREFIX = "winbox-remote-rate:"
# ---------------------------------------------------------------------------
# Lazy NATS and Redis clients (same pattern as remote_access.py)
# ---------------------------------------------------------------------------
_nc: Optional[nats.aio.client.Client] = None
_redis: Optional[aioredis.Redis] = None
async def _get_nats() -> nats.aio.client.Client:
    """Return the module-wide NATS client, (re)connecting on demand."""
    global _nc
    needs_connect = _nc is None or _nc.is_closed
    if needs_connect:
        _nc = await nats.connect(settings.NATS_URL)
    return _nc
async def _get_redis() -> aioredis.Redis:
    """Return the module-wide Redis client, creating it on first use."""
    global _redis
    if _redis is not None:
        return _redis
    _redis = aioredis.from_url(settings.REDIS_URL, decode_responses=True)
    return _redis
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _source_ip(request: Request) -> Optional[str]:
    """Best-effort client IP: prefer the X-Real-IP header, else the socket peer."""
    header_ip = request.headers.get("x-real-ip")
    if header_ip:
        return header_ip
    client = request.client
    return client.host if client else None
async def _get_device(db: AsyncSession, tenant_id: uuid.UUID, device_id: uuid.UUID) -> Device:
    """Fetch a device scoped to its tenant; raise 404 when absent."""
    query = select(Device).where(Device.id == device_id, Device.tenant_id == tenant_id)
    found = (await db.execute(query)).scalar_one_or_none()
    if found is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Device not found")
    return found
async def _check_tenant_access(
    current_user: CurrentUser, tenant_id: uuid.UUID, db: AsyncSession
) -> None:
    """Ensure the caller may act on *tenant_id*.

    Super admins may access any tenant, but the RLS tenant context must be
    pinned on the DB session first. Everyone else must belong to the tenant,
    otherwise a 403 is raised.
    """
    if current_user.is_super_admin:
        from app.database import set_tenant_context
        await set_tenant_context(db, str(tenant_id))
    elif current_user.tenant_id != tenant_id:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied: you do not belong to this tenant.",
        )
async def _check_rate_limit(user_id: uuid.UUID) -> None:
    """Allow max 3 session creates per 5 minutes per user.

    Implementation: INCR a per-user counter; the first increment also arms
    a 300s TTL so the window resets automatically.

    Raises:
        HTTPException 429: when the counter exceeds 3 within the window.

    NOTE(review): if the process dies between INCR and EXPIRE the key is
    left without a TTL and the user stays limited forever — confirm whether
    this needs an atomic script or EXPIRE ... NX.
    """
    rd = await _get_redis()
    key = f"{RATE_PREFIX}{user_id}"
    count = await rd.incr(key)
    if count == 1:
        # First hit in a fresh window: arm the 5-minute expiry.
        await rd.expire(key, 300)
    if count > 3:
        raise HTTPException(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            detail="Too many session requests. Try again later.",
        )
async def _get_session_from_redis(session_id: str) -> Optional[dict]:
    """Load a session record from Redis; None when the key is missing."""
    rd = await _get_redis()
    raw = await rd.get(f"{REDIS_PREFIX}{session_id}")
    return None if raw is None else json.loads(raw)
async def _save_session_to_redis(session_id: str, data: dict, ttl: int = 14400) -> None:
    """Persist a session record under its prefixed key with a TTL in seconds."""
    serialized = json.dumps(data, default=str)
    rd = await _get_redis()
    await rd.setex(f"{REDIS_PREFIX}{session_id}", ttl, serialized)
async def _delete_session_from_redis(session_id: str) -> None:
    """Remove a session record; a no-op when the key is already gone."""
    client = await _get_redis()
    await client.delete(f"{REDIS_PREFIX}{session_id}")
async def _open_tunnel(
    device_id: uuid.UUID, tenant_id: uuid.UUID, user_id: uuid.UUID
) -> dict:
    """Open a TCP tunnel to device port 8291 via NATS request-reply.

    Sends a ``tunnel.open`` request to the tunnel service and returns its
    decoded JSON reply (callers read ``tunnel_id`` and ``local_port``).

    Raises:
        HTTPException 503: if NATS is unreachable, the reply is not valid
            JSON, or the reply carries an ``error`` field.
    """
    payload = json.dumps({
        "device_id": str(device_id),
        "tenant_id": str(tenant_id),
        "user_id": str(user_id),
        "target_port": 8291,  # WinBox service port on the device
    }).encode()
    try:
        nc = await _get_nats()
        msg = await nc.request("tunnel.open", payload, timeout=10)
    except Exception as exc:
        logger.error("NATS tunnel.open failed: %s", exc)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Tunnel service unavailable",
        )
    try:
        data = json.loads(msg.data)
    except Exception:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Invalid response from tunnel service",
        )
    if "error" in data:
        # Service-side failure — surface its own message to the caller.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=data["error"]
        )
    return data
async def _close_tunnel(tunnel_id: str) -> None:
    """Close a tunnel via NATS — idempotent.

    Failures are swallowed by design (the tunnel may already be closed, or
    the tunnel service may be briefly unreachable), but are now logged at
    debug level so repeated failures remain traceable instead of vanishing
    into a bare ``pass``.
    """
    try:
        nc = await _get_nats()
        payload = json.dumps({"tunnel_id": tunnel_id}).encode()
        await nc.request("tunnel.close", payload, timeout=10)
    except Exception as exc:
        # Best-effort by design — keep the call idempotent, but leave a trace.
        logger.debug("tunnel.close for %s failed (ignored): %s", tunnel_id, exc)
# ---------------------------------------------------------------------------
# POST — Create a Remote WinBox (Browser) session
# ---------------------------------------------------------------------------
@router.post(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions",
    response_model=RemoteWinboxSessionResponse,
    summary="Create a Remote WinBox browser session",
    dependencies=[Depends(require_operator_or_above)],
)
@limiter.limit("10/minute")
async def create_winbox_remote_session(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    request: Request,
    body: RemoteWinboxCreateRequest = RemoteWinboxCreateRequest(),
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> RemoteWinboxSessionResponse:
    """
    Create an Xpra-based WinBox session accessible via WebSocket in the browser.

    Flow: auth -> tenant check -> device exists -> duplicate check -> rate limit ->
    credential decrypt -> tunnel open -> worker create -> Redis save -> audit log.
    Full rollback on failure.

    Raises:
        HTTPException 404: device not found in this tenant.
        HTTPException 409: caller already has a live session for this device.
        HTTPException 429: per-user create rate limit exceeded.
        HTTPException 503: credentials, tunnel service, or worker unavailable.
    """
    await _check_tenant_access(current_user, tenant_id, db)
    device = await _get_device(db, tenant_id, device_id)
    source_ip = _source_ip(request)

    # Check for duplicate active session for this user+device
    rd = await _get_redis()
    cursor = "0"
    while True:
        cursor, keys = await rd.scan(cursor=cursor, match=f"{REDIS_PREFIX}*", count=100)
        for key in keys:
            raw = await rd.get(key)
            if raw is None:
                continue
            try:
                sess = json.loads(raw)
            except Exception:
                # Undecodable entry — skip it here rather than fail the create.
                continue
            if (
                sess.get("device_id") == str(device_id)
                and sess.get("user_id") == str(current_user.user_id)
                and sess.get("status") in ("creating", "active", "grace")
            ):
                # Verify the worker actually has this session — if not, clean up
                # the stale Redis entry instead of blocking the user.
                stale_sid = sess.get("session_id", "")
                try:
                    worker_info = await worker_get_session(stale_sid)
                except Exception:
                    worker_info = None
                if worker_info is None:
                    logger.warning(
                        "Cleaning stale Redis session %s (worker 404)", stale_sid
                    )
                    tunnel_id = sess.get("tunnel_id")
                    if tunnel_id:
                        await _close_tunnel(tunnel_id)
                    await _delete_session_from_redis(stale_sid)
                    continue
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail="Active session already exists for this device",
                )
        if cursor == "0" or cursor == 0:
            # redis-py may return the terminal cursor as str or int.
            break

    # Rate limit
    await _check_rate_limit(current_user.user_id)

    # Decrypt device credentials
    try:
        from app.services.crypto import decrypt_credentials_hybrid

        creds_json = await decrypt_credentials_hybrid(
            transit_ciphertext=device.encrypted_credentials_transit,
            legacy_ciphertext=device.encrypted_credentials,
            tenant_id=str(tenant_id),
            legacy_key=settings.get_encryption_key_bytes(),
        )
        creds = json.loads(creds_json)
        username = creds.get("username", "")
        password = creds.get("password", "")
    except Exception as exc:
        logger.error("Failed to decrypt credentials for device %s: %s", device_id, exc)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Unable to retrieve device credentials",
        )

    # Open tunnel to device
    tunnel_data = None
    session_id = str(uuid.uuid4())
    now = datetime.now(timezone.utc)
    try:
        tunnel_data = await _open_tunnel(device_id, tenant_id, current_user.user_id)
        tunnel_id = tunnel_data.get("tunnel_id", "")
        tunnel_port = tunnel_data.get("local_port")
        # Allocated listener ports are expected inside the 49000-49100 range.
        if not isinstance(tunnel_port, int) or not (49000 <= tunnel_port <= 49100):
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail="Invalid port allocation from tunnel service",
            )

        # Create session on worker
        # Tunnel listener runs on the poller container, reachable via Docker DNS
        try:
            worker_resp = await worker_create_session(
                session_id=session_id,
                device_ip="tod_poller",
                device_port=tunnel_port,
                username=username,
                password=password,
                idle_timeout_seconds=body.idle_timeout_seconds,
                max_lifetime_seconds=body.max_lifetime_seconds,
            )
        except WorkerCapacityError:
            # Roll back the tunnel before surfacing the capacity error.
            await _close_tunnel(tunnel_id)
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail="No capacity for new sessions",
            )
        except WorkerLaunchError as exc:
            await _close_tunnel(tunnel_id)
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail=f"Session launch failed: {exc}",
            )
        finally:
            # Zero credentials
            username = ""  # noqa: F841
            password = ""  # noqa: F841

        # Fall back to "now" when the worker omits expiry timestamps.
        expires_at = datetime.fromisoformat(
            worker_resp.get("expires_at", now.isoformat())
        )
        max_expires_at = datetime.fromisoformat(
            worker_resp.get("max_expires_at", now.isoformat())
        )

        # Save session to Redis
        session_data = {
            "session_id": session_id,
            "tenant_id": str(tenant_id),
            "device_id": str(device_id),
            "user_id": str(current_user.user_id),
            "tunnel_id": tunnel_id,
            "tunnel_port": tunnel_port,
            "status": RemoteWinboxState.active.value,
            "created_at": now.isoformat(),
            "expires_at": expires_at.isoformat(),
            "max_expires_at": max_expires_at.isoformat(),
            "idle_timeout_seconds": body.idle_timeout_seconds,
            "max_lifetime_seconds": body.max_lifetime_seconds,
            "xpra_ws_port": worker_resp.get("xpra_ws_port"),
        }
        # TTL outlives the session by 60s so status reads don't race expiry.
        await _save_session_to_redis(session_id, session_data, ttl=body.max_lifetime_seconds + 60)

        # Audit log (fire-and-forget)
        try:
            await log_action(
                db,
                tenant_id,
                current_user.user_id,
                "winbox_remote_session_create",
                resource_type="device",
                resource_id=str(device_id),
                device_id=device_id,
                details={"session_id": session_id, "source_ip": source_ip},
                ip_address=source_ip,
            )
        except Exception:
            pass

        ws_path = (
            f"/api/tenants/{tenant_id}/devices/{device_id}"
            f"/winbox-remote-sessions/{session_id}/ws"
        )
        return RemoteWinboxSessionResponse(
            session_id=uuid.UUID(session_id),
            websocket_path=ws_path,
            expires_at=expires_at,
            max_expires_at=max_expires_at,
            idle_timeout_seconds=body.idle_timeout_seconds,
            max_lifetime_seconds=body.max_lifetime_seconds,
            xpra_ws_port=worker_resp.get("xpra_ws_port"),
        )
    except HTTPException:
        raise
    except Exception as exc:
        # Full rollback
        logger.error("Unexpected error creating winbox remote session: %s", exc)
        if tunnel_data and tunnel_data.get("tunnel_id"):
            await _close_tunnel(tunnel_data["tunnel_id"])
        await _delete_session_from_redis(session_id)
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Session creation failed",
        )
# ---------------------------------------------------------------------------
# GET — Session status
# ---------------------------------------------------------------------------
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}",
    response_model=RemoteWinboxStatusResponse,
    summary="Get Remote WinBox session status",
    dependencies=[Depends(require_operator_or_above)],
)
async def get_winbox_remote_session(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> RemoteWinboxStatusResponse:
    """Return the current status of one Remote WinBox session.

    Responds 404 both when the session is unknown and when it belongs to a
    different tenant or device, so existence is never leaked across tenants.
    """
    await _check_tenant_access(current_user, tenant_id, db)
    record = await _get_session_from_redis(str(session_id))
    not_found = HTTPException(
        status_code=status.HTTP_404_NOT_FOUND, detail="Session not found"
    )
    if record is None:
        raise not_found
    if record.get("tenant_id") != str(tenant_id) or record.get("device_id") != str(device_id):
        raise not_found
    return RemoteWinboxStatusResponse(
        session_id=uuid.UUID(record["session_id"]),
        status=RemoteWinboxState(record.get("status", "active")),
        created_at=datetime.fromisoformat(record["created_at"]),
        expires_at=datetime.fromisoformat(record["expires_at"]),
        max_expires_at=datetime.fromisoformat(record["max_expires_at"]),
        idle_timeout_seconds=record.get("idle_timeout_seconds", 600),
        max_lifetime_seconds=record.get("max_lifetime_seconds", 7200),
        xpra_ws_port=record.get("xpra_ws_port"),
    )
# ---------------------------------------------------------------------------
# GET — List sessions for a device
# ---------------------------------------------------------------------------
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions",
    response_model=list[RemoteWinboxStatusResponse],
    summary="List Remote WinBox sessions for a device",
    dependencies=[Depends(require_operator_or_above)],
)
async def list_winbox_remote_sessions(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> list[RemoteWinboxStatusResponse]:
    """List every Remote WinBox session stored in Redis for this device.

    Performs a cursor-based SCAN over all session keys and filters by
    tenant + device; undecodable entries are skipped silently.
    """
    await _check_tenant_access(current_user, tenant_id, db)

    def _to_response(entry: dict) -> RemoteWinboxStatusResponse:
        # Map a raw Redis record onto the response schema with defaults.
        return RemoteWinboxStatusResponse(
            session_id=uuid.UUID(entry["session_id"]),
            status=RemoteWinboxState(entry.get("status", "active")),
            created_at=datetime.fromisoformat(entry["created_at"]),
            expires_at=datetime.fromisoformat(entry["expires_at"]),
            max_expires_at=datetime.fromisoformat(entry["max_expires_at"]),
            idle_timeout_seconds=entry.get("idle_timeout_seconds", 600),
            max_lifetime_seconds=entry.get("max_lifetime_seconds", 7200),
            xpra_ws_port=entry.get("xpra_ws_port"),
        )

    matches: list[RemoteWinboxStatusResponse] = []
    rd = await _get_redis()
    cursor = "0"
    while True:
        cursor, keys = await rd.scan(cursor=cursor, match=f"{REDIS_PREFIX}*", count=100)
        for key in keys:
            raw = await rd.get(key)
            if raw is None:
                continue
            try:
                entry = json.loads(raw)
            except Exception:
                continue
            same_tenant = entry.get("tenant_id") == str(tenant_id)
            same_device = entry.get("device_id") == str(device_id)
            if same_tenant and same_device:
                matches.append(_to_response(entry))
        if cursor in ("0", 0):
            # Terminal SCAN cursor may arrive as str or int.
            break
    return matches
# ---------------------------------------------------------------------------
# DELETE — Terminate session (idempotent)
# ---------------------------------------------------------------------------
@router.delete(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}",
    response_model=RemoteWinboxTerminateResponse,
    summary="Terminate a Remote WinBox session",
    dependencies=[Depends(require_operator_or_above)],
)
async def terminate_winbox_remote_session(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
    request: Request,
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> RemoteWinboxTerminateResponse:
    """Tear down a session's worker process, tunnel, and Redis record.

    Idempotent: terminating an unknown or already-terminated session
    succeeds with an explanatory reason instead of erroring.
    """
    await _check_tenant_access(current_user, tenant_id, db)
    caller_ip = _source_ip(request)
    record = await _get_session_from_redis(str(session_id))
    if record is None:
        # Idempotent path: nothing left to clean up.
        return RemoteWinboxTerminateResponse(
            session_id=session_id,
            status=RemoteWinboxState.terminated,
            reason="Session already terminated or not found",
        )
    if record.get("tenant_id") != str(tenant_id):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="Session not found"
        )
    # Teardown mirrors creation rollback: worker -> tunnel -> redis -> audit.
    await worker_terminate_session(str(session_id))
    active_tunnel = record.get("tunnel_id")
    if active_tunnel:
        await _close_tunnel(active_tunnel)
    await _delete_session_from_redis(str(session_id))
    try:
        await log_action(
            db,
            tenant_id,
            current_user.user_id,
            "winbox_remote_session_terminate",
            resource_type="device",
            resource_id=str(device_id),
            device_id=device_id,
            details={"session_id": str(session_id), "source_ip": caller_ip},
            ip_address=caller_ip,
        )
    except Exception:
        # Audit logging is best-effort; never fail the termination over it.
        pass
    return RemoteWinboxTerminateResponse(
        session_id=session_id,
        status=RemoteWinboxState.terminated,
        reason="Terminated by user",
    )
# ---------------------------------------------------------------------------
# HTTP Proxy — Serve Xpra HTML5 client files from worker
# ---------------------------------------------------------------------------
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}"
    "/winbox-remote-sessions/{session_id}/xpra/{path:path}",
    summary="Proxy Xpra HTML5 client files",
    dependencies=[Depends(require_operator_or_above)],
)
@router.get(
    "/tenants/{tenant_id}/devices/{device_id}"
    "/winbox-remote-sessions/{session_id}/xpra",
    summary="Proxy Xpra HTML5 client (root)",
    dependencies=[Depends(require_operator_or_above)],
)
async def proxy_xpra_html(
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
    request: Request,
    path: str = "",
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Reverse-proxy HTTP requests to the Xpra HTML5 server inside the worker.

    Fix: the original annotated the return type as ``None`` even though the
    handler returns a ``starlette.responses.Response``; the misleading
    annotation has been removed (neither route declares a response_model,
    so route behavior is unchanged).

    Raises:
        HTTPException 404: unknown session, or session belongs to another
            tenant/device.
        HTTPException 503: the session record has no Xpra port.
        HTTPException 502: the worker's Xpra HTTP server is unreachable.
    """
    from starlette.responses import Response

    await _check_tenant_access(current_user, tenant_id, db)
    sess = await _get_session_from_redis(str(session_id))
    if sess is None:
        raise HTTPException(status_code=404, detail="Session not found")
    if sess.get("tenant_id") != str(tenant_id) or sess.get("device_id") != str(device_id):
        raise HTTPException(status_code=404, detail="Session not found")
    xpra_ws_port = sess.get("xpra_ws_port")
    if not xpra_ws_port:
        raise HTTPException(status_code=503, detail="Xpra port unavailable")

    # Proxy the request to Xpra's built-in HTTP server (worker container
    # resolved via Docker DNS).
    target_url = f"http://tod_winbox_worker:{xpra_ws_port}/{path}"
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(10.0)) as client:
            proxy_resp = await client.get(
                target_url,
                params=dict(request.query_params),
            )
    except Exception as exc:
        logger.error("Xpra HTTP proxy error: %s", exc)
        raise HTTPException(status_code=502, detail="Xpra server unreachable")

    # Forward body and status; only relay headers that are safe to pass
    # through (content negotiation / caching), dropping everything else.
    return Response(
        content=proxy_resp.content,
        status_code=proxy_resp.status_code,
        headers={
            k: v for k, v in proxy_resp.headers.items()
            if k.lower() in ("content-type", "cache-control", "content-encoding")
        },
    )
# ---------------------------------------------------------------------------
# WebSocket — Proxy browser <-> Xpra worker
# ---------------------------------------------------------------------------
@router.websocket(
    "/tenants/{tenant_id}/devices/{device_id}/winbox-remote-sessions/{session_id}/ws"
)
async def winbox_remote_ws_proxy(
    websocket: WebSocket,
    tenant_id: uuid.UUID,
    device_id: uuid.UUID,
    session_id: uuid.UUID,
) -> None:
    """
    Bidirectional WebSocket proxy between the browser and the worker's Xpra
    WebSocket. Authentication via access_token cookie or query param.

    1. Authenticate via cookie/query param token
    2. Validate session in Redis (ownership, status, expiry)
    3. Resolve Xpra WebSocket port from worker
    4. Accept browser WebSocket upgrade
    5. Proxy bidirectionally until close

    Close codes are application-defined: 4001 auth failure, 4003 access
    denied, 4004 session not found/not usable.
    """
    # --- Auth: extract token from cookie or query param ---
    token = websocket.cookies.get("access_token") or websocket.query_params.get("token")
    if not token:
        await websocket.close(code=4001, reason="Authentication required")
        return
    from app.services.auth import verify_token
    try:
        payload = verify_token(token, expected_type="access")
    except Exception:
        await websocket.close(code=4001, reason="Invalid token")
        return
    user_id_str = payload.get("sub")
    user_tenant_str = payload.get("tenant_id")
    role = payload.get("role")
    if not user_id_str or not role:
        await websocket.close(code=4001, reason="Invalid token payload")
        return
    # Tenant access check
    if role != "super_admin":
        if user_tenant_str != str(tenant_id):
            await websocket.close(code=4003, reason="Tenant access denied")
            return
    # --- Session validation ---
    sess = await _get_session_from_redis(str(session_id))
    if sess is None:
        await websocket.close(code=4004, reason="Session not found")
        return
    if sess.get("tenant_id") != str(tenant_id) or sess.get("device_id") != str(device_id):
        await websocket.close(code=4004, reason="Session not found")
        return
    # Ownership check: user must own the session (or be super_admin)
    if role != "super_admin" and sess.get("user_id") != user_id_str:
        await websocket.close(code=4003, reason="Not your session")
        return
    sess_status = sess.get("status")
    if sess_status not in ("active", "grace"):
        await websocket.close(code=4004, reason=f"Session not active (status={sess_status})")
        return
    # Check max expiry
    max_expires = datetime.fromisoformat(sess["max_expires_at"])
    if datetime.now(timezone.utc) > max_expires:
        await websocket.close(code=4004, reason="Session expired")
        return
    # Resolve Xpra WebSocket port from worker
    xpra_ws_port = sess.get("xpra_ws_port")
    if not xpra_ws_port:
        # Redis record lacks the port (e.g. older record) — ask the worker.
        worker_info = await worker_get_session(str(session_id))
        if not worker_info:
            await websocket.close(code=4004, reason="Worker session not found")
            return
        xpra_ws_port = worker_info.get("xpra_ws_port") or worker_info.get("ws_port")
        if not xpra_ws_port:
            await websocket.close(code=4004, reason="Xpra port unavailable")
            return
    # Update last_client_connect_at in Redis (best-effort only).
    sess["last_client_connect_at"] = datetime.now(timezone.utc).isoformat()
    try:
        await _save_session_to_redis(str(session_id), sess)
    except Exception:
        pass
    # Accept browser WebSocket
    await websocket.accept()
    # Connect to worker Xpra WebSocket
    import websockets
    worker_ws_url = f"ws://tod_winbox_worker:{xpra_ws_port}"
    try:
        async with websockets.connect(worker_ws_url) as worker_ws:
            async def browser_to_worker() -> None:
                # Forward binary frames from the browser to the worker.
                try:
                    while True:
                        data = await websocket.receive_bytes()
                        await worker_ws.send(data)
                except WebSocketDisconnect:
                    pass
                except Exception:
                    pass
            async def worker_to_browser() -> None:
                # Forward worker frames back, preserving binary vs text.
                try:
                    async for message in worker_ws:
                        if isinstance(message, bytes):
                            await websocket.send_bytes(message)
                        else:
                            await websocket.send_text(message)
                except Exception:
                    pass
            # Run both directions concurrently
            done, pending = await asyncio.wait(
                [
                    asyncio.create_task(browser_to_worker()),
                    asyncio.create_task(worker_to_browser()),
                ],
                return_when=asyncio.FIRST_COMPLETED,
            )
            # When either side closes, cancel the surviving direction.
            for task in pending:
                task.cancel()
    except Exception as exc:
        logger.warning("WebSocket proxy error for session %s: %s", session_id, exc)
    finally:
        # Ensure the browser socket is closed even if the proxy failed early.
        try:
            await websocket.close()
        except Exception:
            pass

View File

@@ -1,5 +1,7 @@
from pydantic import BaseModel, Field
from app.schemas.winbox_remote import RemoteWinboxSessionItem
class WinboxSessionResponse(BaseModel):
tunnel_id: str
@@ -37,3 +39,4 @@ class SSHSessionStatusItem(BaseModel):
class ActiveSessionsResponse(BaseModel):
    """Combined listing of every active remote-access session for a device.

    Aggregates WinBox TCP tunnels, SSH terminal sessions, and Remote WinBox
    (browser/Xpra) sessions into a single response payload. All lists default
    to empty so the endpoint can degrade gracefully when a backend (e.g. the
    poller or worker) is unreachable.
    """

    # Native WinBox TCP tunnels proxied through the poller.
    winbox_tunnels: list[TunnelStatusItem] = []
    # Browser-based SSH terminal sessions.
    ssh_sessions: list[SSHSessionStatusItem] = []
    # Browser-based Remote WinBox (Xpra) sessions served by the worker.
    remote_winbox_sessions: list[RemoteWinboxSessionItem] = []

View File

@@ -0,0 +1,63 @@
"""Request/response schemas for Remote WinBox (Browser) sessions."""
import uuid
from datetime import datetime
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class RemoteWinboxState(str, Enum):
    """Lifecycle states of a Remote WinBox (browser) session.

    Inherits from ``str`` so values serialize as plain strings in JSON.
    """

    creating = "creating"        # worker is launching the Xpra/WinBox process
    active = "active"            # session is running and accepting clients
    grace = "grace"              # client disconnected; session kept briefly
    terminating = "terminating"  # shutdown requested, cleanup in progress
    terminated = "terminated"    # session fully torn down
    failed = "failed"            # launch or runtime failure
class RemoteWinboxCreateRequest(BaseModel):
    """Client-supplied timeout overrides when creating a session."""

    # Seconds of inactivity before the session is reclaimed (1 min – 1 hour).
    idle_timeout_seconds: int = Field(default=600, ge=60, le=3600)
    # Hard cap on total session lifetime (5 min – 4 hours).
    max_lifetime_seconds: int = Field(default=7200, ge=300, le=14400)
class RemoteWinboxSessionResponse(BaseModel):
    """Response returned after successfully creating a session."""

    session_id: uuid.UUID
    # Newly created sessions report "active" unless overridden by the caller.
    status: RemoteWinboxState = RemoteWinboxState.active
    # Path the browser should open a WebSocket to for the Xpra stream.
    websocket_path: str
    # Idle-based expiry; refreshed while the client stays connected.
    expires_at: datetime
    # Absolute expiry — session is terminated at this time regardless of activity.
    max_expires_at: datetime
    idle_timeout_seconds: int
    max_lifetime_seconds: int
    # Xpra WebSocket port on the worker; None if not yet known.
    xpra_ws_port: Optional[int] = None
class RemoteWinboxStatusResponse(BaseModel):
    """Full status snapshot of an existing session."""

    session_id: uuid.UUID
    status: RemoteWinboxState
    created_at: datetime
    # Idle-based expiry; refreshed while the client stays connected.
    expires_at: datetime
    # Absolute expiry — session is terminated at this time regardless of activity.
    max_expires_at: datetime
    idle_timeout_seconds: int
    max_lifetime_seconds: int
    # Xpra WebSocket port on the worker; None if not yet known.
    xpra_ws_port: Optional[int] = None
class RemoteWinboxTerminateResponse(BaseModel):
    """Acknowledgement returned when a session is terminated."""

    session_id: uuid.UUID
    status: RemoteWinboxState
    # Human-readable cause of termination (e.g. user request, idle timeout).
    reason: str
class RemoteWinboxDuplicateDetail(BaseModel):
    """Error payload returned when the device already has an active session."""

    detail: str = "Active session exists"
    # Status of the pre-existing session so the client can reattach to it.
    session: RemoteWinboxStatusResponse
class RemoteWinboxSessionItem(BaseModel):
    """Used in the combined active sessions list.

    A trimmed-down view of a session — only the fields needed to render a
    row in the active-sessions listing.
    """

    session_id: uuid.UUID
    status: RemoteWinboxState
    created_at: datetime
    expires_at: datetime

View File

@@ -120,7 +120,7 @@ async def _send_email(channel: dict, alert_event: dict, device_hostname: str) ->
user=channel.get("smtp_user"),
password=smtp_password,
use_tls=channel.get("smtp_use_tls", False),
from_address=channel.get("from_address") or "alerts@mikrotik-portal.local",
from_address=channel.get("from_address") or "alerts@the-other-dude.local",
)
to = channel.get("to_address")

View File

@@ -43,7 +43,7 @@ from app.services.push_tracker import record_push, clear_push
logger = logging.getLogger(__name__)
# Name of the panic-revert scheduler installed on the RouterOS device
_PANIC_REVERT_SCHEDULER = "mikrotik-portal-panic-revert"
_PANIC_REVERT_SCHEDULER = "the-other-dude-panic-revert"
# Name of the pre-push binary backup saved on device flash
_PRE_PUSH_BACKUP = "portal-pre-push"
# Name of the RSC file used for /import on device

View File

@@ -35,7 +35,7 @@ logger = logging.getLogger(__name__)
_env = SandboxedEnvironment()
# Names used on the RouterOS device during template push
_PANIC_REVERT_SCHEDULER = "mikrotik-portal-template-revert"
_PANIC_REVERT_SCHEDULER = "the-other-dude-template-revert"
_PRE_PUSH_BACKUP = "portal-template-pre-push"
_TEMPLATE_RSC = "portal-template.rsc"

View File

@@ -0,0 +1,126 @@
"""HTTP client for the winbox-worker container.
Provides async helpers to create, terminate, query, and health-check
Remote WinBox (Xpra) sessions running inside the worker container.
All communication uses the internal Docker network.
"""
import logging
from typing import Any, Optional
import httpx
logger = logging.getLogger(__name__)

# Worker control API, reachable only over the internal Docker network.
WORKER_BASE_URL = "http://tod_winbox_worker:9090"
# Identifies the caller to the worker as the internal API service.
_HEADERS = {"X-Internal-Service": "api"}
# 15 s overall request timeout with a 5 s connect budget.
_TIMEOUT = httpx.Timeout(15.0, connect=5.0)
class WorkerCapacityError(Exception):
    """Worker has no capacity for new sessions."""
    # Raised when the worker responds 503; maps to an HTTP 503 upstream.
class WorkerLaunchError(Exception):
    """Worker failed to launch a session."""
    # Raised for any non-capacity failure while creating a session.
async def create_session(
    session_id: str,
    device_ip: str,
    device_port: int,
    username: str,
    password: str,
    idle_timeout_seconds: int,
    max_lifetime_seconds: int,
) -> dict[str, Any]:
    """POST /sessions — ask the worker to launch an Xpra+WinBox session.

    Credentials are zeroed from locals after the request is sent.

    Args:
        session_id: UUID string identifying the new session.
        device_ip: Address the worker should tunnel WinBox traffic to.
        device_port: Target port on the device.
        username: Device credential (zeroed after the request).
        password: Device credential (zeroed after the request).
        idle_timeout_seconds: Idle timeout the worker should enforce.
        max_lifetime_seconds: Hard lifetime cap the worker should enforce.

    Returns:
        The worker's JSON response (session metadata, including ports).

    Raises:
        WorkerCapacityError: the worker replied 503 (no capacity).
        WorkerLaunchError: any other HTTP error status, or a transport
            failure reaching the worker.
    """
    payload = {
        "session_id": session_id,
        "tunnel_host": device_ip,
        "tunnel_port": device_port,
        "username": username,
        "password": password,
        "idle_timeout_seconds": idle_timeout_seconds,
        "max_lifetime_seconds": max_lifetime_seconds,
    }
    try:
        async with httpx.AsyncClient(
            base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=_TIMEOUT
        ) as client:
            resp = await client.post("/sessions", json=payload)
    except httpx.HTTPError as exc:
        # Previously transport errors (connect failure, timeout) escaped as
        # raw httpx exceptions, violating the documented contract above.
        # Normalize them to WorkerLaunchError so callers have one error type.
        raise WorkerLaunchError(f"Worker request failed: {exc}") from exc
    finally:
        # Zero credentials in the payload dict and unbind the plaintext
        # locals so they do not linger in this frame.
        payload["username"] = ""
        payload["password"] = ""
        del username, password  # noqa: F821 — local unbind
    if resp.status_code == 503:
        raise WorkerCapacityError(resp.text)
    if resp.status_code >= 400:
        raise WorkerLaunchError(f"Worker returned {resp.status_code}: {resp.text}")
    return resp.json()
async def terminate_session(session_id: str) -> bool:
    """DELETE /sessions/{session_id} — idempotent (ignores 404).

    Returns True if the worker acknowledged termination, False if 404
    (already gone) or if the worker reported any other error.
    """
    async with httpx.AsyncClient(
        base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=_TIMEOUT
    ) as client:
        response = await client.delete(f"/sessions/{session_id}")
    status = response.status_code
    if status < 400:
        return True
    # 404 means the session was already gone — treat silently as "not ours".
    if status != 404:
        logger.error("Worker terminate error %s: %s", status, response.text)
    return False
async def get_session(session_id: str) -> Optional[dict[str, Any]]:
    """GET /sessions/{session_id} — returns None if 404."""
    async with httpx.AsyncClient(
        base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=_TIMEOUT
    ) as client:
        response = await client.get(f"/sessions/{session_id}")
    code = response.status_code
    if code < 400:
        return response.json()
    # 404 is an expected "not found"; anything else is logged as an error.
    if code != 404:
        logger.error("Worker get_session error %s: %s", code, response.text)
    return None
async def list_sessions() -> list[dict[str, Any]]:
    """GET /sessions — return all sessions known to the worker.

    Returns an empty list on worker error or unexpected payload shape.
    """
    async with httpx.AsyncClient(
        base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=_TIMEOUT
    ) as client:
        response = await client.get("/sessions")
    if response.status_code >= 400:
        logger.error(
            "Worker list_sessions error %s: %s", response.status_code, response.text
        )
        return []
    body = response.json()
    # Guard against a non-list body so callers can always iterate safely.
    return body if isinstance(body, list) else []
async def health_check() -> bool:
    """GET /healthz — returns True if the worker is healthy.

    Any transport failure (connect error, timeout) yields False rather
    than raising, so this is safe to call from readiness probes.
    """
    try:
        async with httpx.AsyncClient(
            base_url=WORKER_BASE_URL, headers=_HEADERS, timeout=httpx.Timeout(5.0)
        ) as client:
            response = await client.get("/healthz")
    except Exception:
        return False
    return response.status_code == 200

View File

@@ -3,7 +3,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "mikrotik-portal-backend"
name = "the-other-dude-backend"
version = "9.0.1"
description = "MikroTik Fleet Management Portal - Backend API"
requires-python = ">=3.12"

View File

@@ -16,7 +16,7 @@ async def test_recovery_commits_reachable_device_with_scheduler():
push_op.device_id = uuid4()
push_op.tenant_id = uuid4()
push_op.status = "pending_verification"
push_op.scheduler_name = "mikrotik-portal-panic-revert"
push_op.scheduler_name = "the-other-dude-panic-revert"
push_op.started_at = datetime.now(timezone.utc) - timedelta(minutes=10)
device = MagicMock()
@@ -71,7 +71,7 @@ async def test_recovery_marks_unreachable_device_failed():
push_op.device_id = uuid4()
push_op.tenant_id = uuid4()
push_op.status = "pending_verification"
push_op.scheduler_name = "mikrotik-portal-panic-revert"
push_op.scheduler_name = "the-other-dude-panic-revert"
push_op.started_at = datetime.now(timezone.utc) - timedelta(minutes=10)
device = MagicMock()

View File

@@ -21,7 +21,7 @@ services:
APP_USER_DATABASE_URL: postgresql+asyncpg://app_user:app_password@postgres:5432/mikrotik
REDIS_URL: redis://redis:6379/0
NATS_URL: nats://nats:4222
FIRST_ADMIN_EMAIL: ${FIRST_ADMIN_EMAIL:-admin@mikrotik-portal.dev}
FIRST_ADMIN_EMAIL: ${FIRST_ADMIN_EMAIL:-admin@the-other-dude.dev}
FIRST_ADMIN_PASSWORD: ${FIRST_ADMIN_PASSWORD:-changeme-in-production}
CREDENTIAL_ENCRYPTION_KEY: ${CREDENTIAL_ENCRYPTION_KEY:?Set CREDENTIAL_ENCRYPTION_KEY in .env}
JWT_SECRET_KEY: ${JWT_SECRET_KEY:?Set JWT_SECRET_KEY in .env}
@@ -62,6 +62,7 @@ services:
memory: 512M
networks:
- tod
- tod_remote_worker
poller:
build:
@@ -115,6 +116,17 @@ services:
memory: 256M
networks:
- tod
- tod_remote_worker
winbox-worker:
environment:
LOG_LEVEL: debug
MAX_CONCURRENT_SESSIONS: 5
deploy:
resources:
limits:
memory: 512M
restart: on-failure
frontend:
build:

View File

@@ -12,9 +12,27 @@ services:
ENVIRONMENT: production
LOG_LEVEL: info
GUNICORN_WORKERS: "2"
command: ["gunicorn", "app.main:app", "--config", "gunicorn.conf.py"]
WIREGUARD_CONFIG_PATH: /data/wireguard
WIREGUARD_GATEWAY: wireguard
cap_add:
- NET_ADMIN
user: root
command: >
sh -c "
if [ -n \"$$WIREGUARD_GATEWAY\" ]; then
apt-get update -qq && apt-get install -y -qq iproute2 >/dev/null 2>&1 || true;
GW_IP=$$(getent hosts $$WIREGUARD_GATEWAY 2>/dev/null | awk '{print $$1}');
[ -z \"$$GW_IP\" ] && GW_IP=$$WIREGUARD_GATEWAY;
ip route add 10.10.0.0/16 via $$GW_IP 2>/dev/null || true;
echo VPN route: 10.10.0.0/16 via $$GW_IP;
fi;
exec su -s /bin/sh appuser -c 'gunicorn app.main:app --config gunicorn.conf.py'
"
ports:
- "8001:8000"
volumes:
- ./docker-data/git-store:/data/git-store
- ./docker-data/wireguard:/data/wireguard
depends_on:
postgres:
condition: service_healthy
@@ -22,6 +40,8 @@ services:
condition: service_healthy
nats:
condition: service_healthy
openbao:
condition: service_healthy
deploy:
resources:
limits:
@@ -34,6 +54,7 @@ services:
max-file: "3"
networks:
- tod
- tod_remote_worker
poller:
build:
@@ -44,6 +65,7 @@ services:
environment:
ENVIRONMENT: production
LOG_LEVEL: info
DATABASE_URL: postgres://poller_user:poller_password@postgres:5432/mikrotik
TUNNEL_PORT_MIN: 49000
TUNNEL_PORT_MAX: 49100
TUNNEL_IDLE_TIMEOUT: 300
@@ -65,6 +87,8 @@ services:
condition: service_healthy
nats:
condition: service_healthy
openbao:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "wget --spider -q http://localhost:8080/healthz || exit 1"]
interval: 30s
@@ -82,6 +106,32 @@ services:
max-file: "3"
networks:
- tod
- tod_remote_worker
openbao:
env_file: .env.prod
environment:
BAO_ADDR: "http://127.0.0.1:8200"
BAO_UNSEAL_KEY: "${BAO_UNSEAL_KEY}"
BAO_TOKEN: "${OPENBAO_TOKEN}"
ports: []
restart: unless-stopped
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
winbox-worker:
environment:
LOG_LEVEL: info
MAX_CONCURRENT_SESSIONS: 10
logging:
driver: json-file
options:
max-size: "10m"
max-file: "3"
restart: unless-stopped
frontend:
build:
@@ -89,7 +139,7 @@ services:
dockerfile: infrastructure/docker/Dockerfile.frontend
container_name: tod_frontend
ports:
- "80:80"
- "3000:80"
depends_on:
- api
deploy:

View File

@@ -87,28 +87,28 @@ services:
command:
- -c
- |
# Start OpenBao in background
bao server -dev -dev-listen-address=0.0.0.0:8200 &
bao server -config=/etc/openbao/config.hcl &
BAO_PID=$$!
# Wait for ready and run init
sleep 2
/init/init.sh
# Wait for OpenBao process
wait $$BAO_PID
environment:
BAO_DEV_ROOT_TOKEN_ID: dev-openbao-token
BAO_DEV_LISTEN_ADDRESS: "0.0.0.0:8200"
BAO_ADDR: "http://127.0.0.1:8200"
BAO_UNSEAL_KEY: "${BAO_UNSEAL_KEY:-}"
BAO_TOKEN: "${OPENBAO_TOKEN:-}"
ports:
- "8200:8200"
volumes:
- ./infrastructure/openbao/config.hcl:/etc/openbao/config.hcl:ro
- ./infrastructure/openbao/init.sh:/init/init.sh:ro
cap_add:
- IPC_LOCK
- openbao_data:/openbao/data
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:8200/v1/sys/health | grep -q '\"sealed\":false' || exit 1"]
interval: 5s
timeout: 3s
retries: 5
retries: 12
start_period: 30s
restart: unless-stopped
deploy:
resources:
limits:
@@ -159,6 +159,33 @@ services:
limits:
memory: 64M
winbox-worker:
build:
context: ./winbox-worker
platform: linux/amd64
container_name: tod_winbox_worker
environment:
IDLE_TIMEOUT: 600
MAX_LIFETIME: 7200
MAX_CONCURRENT_SESSIONS: 10
LOG_LEVEL: info
XDG_RUNTIME_DIR: /run/user/1001
ports:
- "10100-10119:10100-10119"
deploy:
resources:
limits:
memory: 1G
networks:
- tod
- tod_remote_worker
volumes:
openbao_data:
networks:
tod:
driver: bridge
tod_remote_worker:
driver: bridge
internal: true

View File

@@ -71,7 +71,7 @@ TOD uses Pydantic Settings for configuration. All values can be set via environm
| `SMTP_USER` | *(none)* | SMTP authentication username |
| `SMTP_PASSWORD` | *(none)* | SMTP authentication password |
| `SMTP_USE_TLS` | `false` | Enable STARTTLS for SMTP connections |
| `SMTP_FROM_ADDRESS` | `noreply@mikrotik-portal.local` | Sender address for outbound emails |
| `SMTP_FROM_ADDRESS` | `noreply@the-other-dude.local` | Sender address for outbound emails |
### Firmware

File diff suppressed because it is too large Load Diff

View File

@@ -1,841 +0,0 @@
# Remote Access Design — WinBox Tunnels + SSH Terminal (v9.5)
## Overview
Add remote WinBox and SSH terminal access to TOD. Users connect to RouterOS devices behind NAT through the TOD controller without direct network access to the router.
- **WinBox**: TCP tunnel through the poller container. User's native WinBox app connects to `127.0.0.1:<port>`.
- **SSH Terminal**: Browser-based xterm.js terminal. WebSocket to poller, which bridges to SSH PTY on the router.
### Device Type Scope
- **WinBox tunnels**: RouterOS devices only (WinBox is MikroTik-specific, port 8291)
- **SSH terminal**: All device types that support SSH (RouterOS and future `linuxrtr` devices)
- The frontend should show/hide the "Open WinBox" button based on device type. The "SSH Terminal" button renders for all SSH-capable device types.
## System Architecture
```
┌─────────────────────────────────┐
│ User's Machine │
│ │
│ Browser (TOD UI) │
│ ├─ xterm.js SSH terminal │
│ └─ "Open WinBox" button │
│ │
│ WinBox app │
│ └─ connects 127.0.0.1:491xx │
└──────────┬──────────┬───────────┘
│ │
WebSocket TCP (WinBox)
/ws/ssh/ 127.0.0.1:49000-49100
│ │
┌────────────────────────────────────┼──────────┼────────────────┐
│ Docker Network: tod │ │ │
│ │ │ │
│ ┌──────────────┐ │ │ │
│ │ nginx │──────────────────┘ │ │
│ │ port 3000 │ (proxy /ws/ssh → poller) │ │
│ │ │ (proxy /api → api) │ │
│ └──────┬───────┘ │ │
│ │ │ │
│ ┌──────▼───────┐ NATS ┌───────────────▼──────────┐ │
│ │ API │◄───────────►│ Poller │ │
│ │ FastAPI │ │ Go │ │
│ │ │ │ ├─ tunnel manager │ │
│ │ - RBAC │ session │ │ (TCP proxy :49000+) │ │
│ │ - audit log │ tokens │ ├─ SSH relay │ │
│ │ - session │ (Redis) │ │ (WebSocket ↔ PTY) │ │
│ │ tokens │ │ ├─ device poller │ │
│ └──────────────┘ │ └─ cmd responder │ │
│ └───────────────┬───────────┘ │
│ │ │
│ ┌───────────────▼───────────┐ │
│ │ WireGuard │ │
│ │ 10.10.0.1/24 │ │
│ │ port 51820/udp │ │
│ └───────────────┬───────────┘ │
└───────────────────────────────────────────────┼────────────────┘
┌─────────────────────┼──────────────┐
│ │ │
RouterOS RouterOS RouterOS
(direct IP) (VPN peer) (VPN peer)
:8291 :22 10.10.0.x 10.10.0.y
:8291 :22 :8291 :22
```
**Key data paths:**
- **WinBox**: Browser click → API (auth+audit) → NATS → Poller allocates port → Docker maps `127.0.0.1:491xx` → Poller TCP proxy → WireGuard → Router:8291
- **SSH**: Browser click → API (auth+audit+token) → Browser opens WebSocket → nginx → Poller validates token → SSH+PTY → Router:22
- **Auth boundary**: API handles all RBAC and audit logging. Poller validates single-use session tokens but never does primary auth.
## RBAC
Roles allowed for remote access: `operator`, `admin`, `super_admin`.
`viewer` role receives 403 Forbidden. The API is the enforcement point; frontend hides buttons for viewers but does not rely on that for security.
Every remote access operation produces an audit log entry:
- `user_id`, `tenant_id`, `device_id`, `session_type`, `source_ip`, `timestamp`
- SSH sessions additionally log `start_time` and `end_time`
## Poller: Tunnel Manager
New package: `poller/internal/tunnel/`
### Data Structures
```go
type TunnelManager struct {
mu sync.Mutex
tunnels map[string]*Tunnel // keyed by tunnel ID (uuid)
portPool *PortPool // tracks available ports 49000-49100
idleTime time.Duration // 5 minutes
deviceStore *store.DeviceStore // DB lookup for device connection details
credCache *vault.CredentialCache
}
type Tunnel struct {
ID string
DeviceID string
TenantID string
UserID string
LocalPort int
RemoteAddr string // router IP:8291
CreatedAt time.Time
LastActive int64 // atomic, unix nanoseconds
listener net.Listener
cancel context.CancelFunc
conns sync.WaitGroup
activeConns int64 // atomic counter
}
```
### LastActive Concurrency
`LastActive` stored as `int64` (unix nanoseconds) using atomic operations:
- Write: `atomic.StoreInt64(&t.LastActive, time.Now().UnixNano())`
- Read: `time.Since(time.Unix(0, atomic.LoadInt64(&t.LastActive)))`
### Port Pool
```go
type PortPool struct {
mu sync.Mutex
ports []bool // true = in use
base int // 49000
}
```
- `Allocate()` returns next free port or error if exhausted
- `Release()` marks port as free
- Before allocation, attempt bind to verify port is actually free (handles stale Docker mappings after restart)
- All operations protected by mutex
### Tunnel Lifecycle
1. NATS message arrives on `tunnel.open`
2. Manager looks up device from database via `DeviceStore.GetDevice(deviceID)` to obtain encrypted credentials and connection details (same pattern as `CmdResponder`)
3. Decrypts device credentials via credential cache
4. Allocates port from pool (verify bind succeeds)
5. Starts TCP listener on `127.0.0.1:<port>` (never `0.0.0.0`)
6. Returns allocated port via NATS reply
7. For each incoming TCP connection:
- `t.conns.Add(1)`, increment `activeConns`
- Dial `router_ip:8291` through WireGuard (10s timeout)
- If dial fails: close client connection, decrement counter, do not update LastActive
- Bidirectional proxy with context cancellation (see below)
- On exit: decrement `activeConns`, `t.conns.Done()`
8. Background goroutine checks every 30s:
- If idle > 5 minutes AND `activeConns == 0`: close tunnel
9. Never close a tunnel while WinBox has an active socket
### TCP Proxy (per connection)
```go
func (t *Tunnel) handleConn(tunnelCtx context.Context, clientConn net.Conn) {
defer t.conns.Done()
defer atomic.AddInt64(&t.activeConns, -1)
routerConn, err := net.DialTimeout("tcp", t.RemoteAddr, 10*time.Second)
if err != nil {
clientConn.Close()
return
}
ctx, cancel := context.WithCancel(tunnelCtx) // derived from tunnel context for shutdown propagation
defer cancel() // ensure context cleanup on all exit paths
go func() {
io.Copy(routerConn, newActivityReader(clientConn, &t.LastActive))
cancel()
}()
go func() {
io.Copy(clientConn, newActivityReader(routerConn, &t.LastActive))
cancel()
}()
<-ctx.Done()
clientConn.Close()
routerConn.Close()
}
```
`activityReader` wraps `io.Reader` and calls `atomic.StoreInt64` on every `Read()`.
### Tunnel Shutdown Order
```go
func (t *Tunnel) Close() {
t.listener.Close() // 1. stop accepting new connections
t.cancel() // 2. cancel context
t.conns.Wait() // 3. wait for active connections
// 4. release port (done by manager)
// 5. delete from manager map (done by manager)
}
```
### NATS Subjects
- `tunnel.open` — Request: `{device_id, tenant_id, user_id, target_port}` → Reply: `{tunnel_id, local_port}`
- `tunnel.close` — Request: `{tunnel_id}` → Reply: `{ok}`
- `tunnel.status` — Request: `{tunnel_id}` → Reply: `{active, local_port, connected_clients, idle_seconds}`
- `tunnel.status.list` — Request: `{device_id}` → Reply: list of active tunnels
### Logging
Structured JSON logs for: tunnel creation, port allocation, client connection, client disconnect, idle timeout, tunnel close. Fields: `tunnel_id`, `device_id`, `tenant_id`, `local_port`, `remote_addr`.
## Poller: SSH Relay
New package: `poller/internal/sshrelay/`
### Data Structures
```go
type Server struct {
redis *redis.Client
credCache *vault.CredentialCache
deviceStore *store.DeviceStore
sessions map[string]*Session
mu sync.Mutex
idleTime time.Duration // 15 minutes
maxSessions int // 200
maxPerUser int // 10
maxPerDevice int // 20
}
type Session struct {
ID string // uuid
DeviceID string
TenantID string
UserID string
SourceIP string
StartTime time.Time
LastActive int64 // atomic, unix nanoseconds
sshClient *ssh.Client
sshSession *ssh.Session
ptyCols int
ptyRows int
cancel context.CancelFunc
}
```
### HTTP Server
Runs on port 8080 inside the container (configurable via `SSH_RELAY_PORT`). Not exposed to host — only accessible through nginx on Docker network.
Endpoints:
- `/ws/ssh?token=<token>` — WebSocket upgrade for SSH terminal
- `/healthz` — Health check (returns `{"status":"ok"}`)
### Connection Flow
1. Browser opens `ws://host/ws/ssh?token=<session_token>`
2. nginx proxies to poller `:8080/ws/ssh`
3. Poller validates single-use token via Redis `GETDEL`
4. Token must contain: `device_id`, `tenant_id`, `user_id`, `source_ip`, `cols`, `rows`, `created_at`
5. Verify `tenant_id` matches device's tenant
6. Check session limits (200 total, 10 per user, 20 per device) — reject with close frame if exceeded
7. Upgrade to WebSocket with hardening:
- `SetReadLimit(1 << 20)` (1MB)
- Read deadline management
- Ping/pong keepalive
- Origin validation
8. Decrypt device credentials via credential cache
9. SSH dial to router (port 22, password auth, `InsecureIgnoreHostKey`)
- Log host key fingerprint on first connect
- If dial fails: close WebSocket with error message, clean up
10. Open SSH session, request PTY (`xterm-256color`, initial cols/rows from token)
11. Obtain stdin, stdout, stderr pipes
12. Start shell
13. Bridge WebSocket ↔ SSH PTY
### WebSocket Message Protocol
- **Binary frames**: Terminal data — forwarded directly to/from SSH PTY
- **Text frames**: JSON control messages
```
{"type": "resize", "cols": 120, "rows": 40}
{"type": "ping"}
```
Resize validation: `cols > 0 && cols <= 500 && rows > 0 && rows <= 200`. Reject invalid values.
### Bridge Function
```go
func bridge(ctx context.Context, cancel context.CancelFunc,
wsConn, sshSession, stdin, stdout, stderr, lastActive *int64) {
// WebSocket → SSH stdin
go func() {
defer cancel()
for {
msgType, data, err := wsConn.Read(ctx)
if err != nil { return }
atomic.StoreInt64(lastActive, time.Now().UnixNano())
if msgType == websocket.TextMessage {
var ctrl ControlMsg
if json.Unmarshal(data, &ctrl) != nil { continue }
if ctrl.Type == "resize" {
// validate bounds
if ctrl.Cols > 0 && ctrl.Cols <= 500 && ctrl.Rows > 0 && ctrl.Rows <= 200 {
sshSession.WindowChange(ctrl.Rows, ctrl.Cols)
}
}
continue
}
stdin.Write(data)
}
}()
// SSH stdout → WebSocket
go func() {
defer cancel()
buf := make([]byte, 4096)
for {
n, err := stdout.Read(buf)
if err != nil { return }
atomic.StoreInt64(lastActive, time.Now().UnixNano())
wsConn.Write(ctx, websocket.BinaryMessage, buf[:n])
}
}()
// SSH stderr → WebSocket (merged into same stream)
go func() {
defer cancel() // stderr EOF also triggers cleanup
io.Copy(wsWriterAdapter(wsConn), stderr)
}()
<-ctx.Done()
}
```
### Session Cleanup Order
1. Cancel context (triggers bridge shutdown)
2. Close WebSocket
3. Close SSH session
4. Close SSH client
5. Remove session from server map (under mutex)
6. Publish audit event via NATS: `audit.session.end` with payload `{session_id, user_id, tenant_id, device_id, start_time, end_time, source_ip, reason}`
### Audit End-Time Pipeline
The API subscribes to the NATS subject `audit.session.end` (durable consumer, same pattern as existing NATS subscribers in `backend/app/services/nats_subscribers.py`). When a message arrives, the subscriber calls `log_action("ssh_session_end", ...)` with the session details including `end_time` and duration. This uses the existing self-committing audit service — no new persistence mechanism needed.
### Idle Timeout
Per-session goroutine, every 30s:
```
idle := time.Since(time.Unix(0, atomic.LoadInt64(&sess.LastActive)))
if idle > 15 minutes:
cancel()
```
### Source IP
Extracted from `X-Real-IP` header (set by nginx from `$remote_addr`), fallback to `X-Forwarded-For` last entry before nginx, fallback to `r.RemoteAddr`. Using `X-Real-IP` as primary avoids client-spoofed `X-Forwarded-For` entries.
### Logging
Structured JSON logs for: session start, session end (with duration and reason: disconnect/idle/error). Fields: `session_id`, `device_id`, `tenant_id`, `user_id`, `source_ip`.
## API: Remote Access Endpoints
New router: `backend/app/routers/remote_access.py`
### WinBox Tunnel
```
POST /api/tenants/{tenant_id}/devices/{device_id}/winbox-session
RBAC: operator+
```
Flow:
1. Validate JWT, require `operator+`
2. Verify device exists, belongs to tenant, is active (not disabled/deleted)
3. Return 404 if not found, 403 if tenant mismatch (never leak cross-tenant existence)
4. Extract source IP from `X-Real-IP` header (preferred, set by nginx), fallback to `request.client.host`
5. Audit log: `log_action("winbox_tunnel_open", ...)`
6. NATS request to `tunnel.open` (10s timeout)
7. If timeout or error: return 503
8. Validate returned port is in range 49000–49100
9. Response:
```json
{
"tunnel_id": "uuid",
"host": "127.0.0.1",
"port": 49023,
"winbox_uri": "winbox://127.0.0.1:49023",
"idle_timeout_seconds": 300
}
```
`host` is always hardcoded to `"127.0.0.1"` — never overridden by poller response.
Rate limit: 10 requests/min per user.
### SSH Session Token
```
POST /api/tenants/{tenant_id}/devices/{device_id}/ssh-session
RBAC: operator+
Body: {"cols": 80, "rows": 24}
```
Flow:
1. Validate JWT, require `operator+`
2. Verify device exists, belongs to tenant, is active
3. Check session limits (10 per user, 20 per device) — return 429 if exceeded
4. Audit log: `log_action("ssh_session_open", ...)`
5. Generate token: `secrets.token_urlsafe(32)`
6. Store in Redis with SETEX (atomic), 120s TTL. Key format: `ssh:token:<token_value>`
```json
{
"device_id": "uuid",
"tenant_id": "uuid",
"user_id": "uuid",
"source_ip": "1.2.3.4",
"cols": 80,
"rows": 24,
"created_at": 1710288000
}
```
7. Response:
```json
{
"token": "...",
"websocket_url": "/ws/ssh?token=<token>",
"idle_timeout_seconds": 900
}
```
Rate limit: 10 requests/min per user.
Input validation: `cols` 1–500, `rows` 1–200.
### Tunnel Close
```
DELETE /api/tenants/{tenant_id}/devices/{device_id}/winbox-session/{tunnel_id}
RBAC: operator+
```
Idempotent — returns 200 even if tunnel already closed. Audit log recorded.
### Active Sessions
```
GET /api/tenants/{tenant_id}/devices/{device_id}/sessions
RBAC: operator+
```
NATS request to poller. If poller doesn't respond within 10s, return empty session lists (degrade gracefully).
### Schemas
```python
class WinboxSessionResponse(BaseModel):
tunnel_id: str
host: str = "127.0.0.1"
port: int
winbox_uri: str
idle_timeout_seconds: int = 300
class SSHSessionRequest(BaseModel):
cols: int = Field(default=80, gt=0, le=500)
rows: int = Field(default=24, gt=0, le=200)
class SSHSessionResponse(BaseModel):
token: str
websocket_url: str
idle_timeout_seconds: int = 900
```
### Error Responses
- 403: insufficient role or tenant mismatch
- 404: device not found
- 429: session or rate limit exceeded
- 503: poller unavailable or port range exhausted
## Frontend: Remote Access UI
### Dependencies
New: `@xterm/xterm` (v5+), `@xterm/addon-fit`, `@xterm/addon-web-links`. No other new dependencies.
### Device Page
Remote access buttons render in the device header for `operator+` roles:
```
┌──────────────────────────────────────────┐
│ site-branch-01 Online ● │
│ 10.10.0.5 RB4011 RouterOS 7.16 │
│ │
│ [ Open WinBox ] [ SSH Terminal ] │
│ │
└──────────────────────────────────────────┘
```
### WinBox Button
States: `idle`, `requesting`, `ready`, `closing`, `error`.
On click:
1. Mutation: `POST .../winbox-session`
2. On success, display:
```
WinBox tunnel ready
Connect to: 127.0.0.1:49023
[ Copy Address ] [ Close Tunnel ]
Tunnel closes after 5 min of inactivity
```
3. Attempt deep link on Windows only (detect via `navigator.userAgent`): `window.open("winbox://127.0.0.1:49023")` — must fire directly inside the click handler chain (no setTimeout) to avoid browser blocking. On macOS/Linux, skip the deep link attempt and rely on the copy-address fallback.
4. Copy button with clipboard fallback for HTTP environments (textarea + `execCommand("copy")`)
5. Navigating away does not close the tunnel — backend idle timeout handles cleanup
6. Close button disabled while DELETE request is in flight
### SSH Terminal
Two phases:
**Phase 1 — Token acquisition:**
```
POST .../ssh-session { cols, rows }
→ { token, websocket_url }
```
**Phase 2 — Terminal session:**
```typescript
const term = new Terminal({
cursorBlink: true,
fontFamily: 'Geist Mono, monospace',
fontSize: 14,
scrollback: 2000,
convertEol: true,
theme: darkMode ? darkTheme : lightTheme
})
const fitAddon = new FitAddon()
term.loadAddon(fitAddon)
term.open(containerRef)
// fit after font load
fitAddon.fit()
```
WebSocket scheme derived dynamically: `location.protocol === "https:" ? "wss" : "ws"`
**Data flow:**
- User keystroke → `term.onData` → `ws.send(binaryFrame)` → poller → SSH stdin
- Router output → SSH stdout → poller → `ws.onmessage` → `term.write(new Uint8Array(data))`
- Resize → `term.onResize` → throttled (75ms) → `ws.send(JSON.stringify({type:"resize", cols, rows}))`
**WebSocket lifecycle:**
- `onopen`: `term.write("Connecting to router...\r\n")`
- `onmessage`: binary → `term.write`, text → parse control
- `onclose`: display "Session closed." in red, disable input, show Reconnect button
- `onerror`: display "Connection error." in red
- Abnormal close codes (1006, 1008, 1011) display appropriate messages
**Reconnect**: Always requests a new token. Never reuses WebSocket or token.
**Cleanup on unmount:**
```typescript
useEffect(() => {
return () => {
term?.dispose()
ws?.close()
}
}, [])
```
**Terminal UI:**
```
┌──────────────────────────────────────────────────┐
│ SSH: site-branch-01 [ Disconnect ] │
├──────────────────────────────────────────────────┤
│ │
│ [admin@site-branch-01] > │
│ │
└──────────────────────────────────────────────────┘
SSH session active — idle timeout: 15 min
```
- Inline on device page by default, expandable to full viewport
- Auto-expand to full viewport on screens < 900px width
- Dark/light theme maps to existing Tailwind HSL tokens (no hardcoded hex)
- `tabindex=0` on terminal container for keyboard focus
- Active session indicator when sessions list returns data
### API Client Extension
```typescript
const remoteAccessApi = {
openWinbox: (tenantId: string, deviceId: string) =>
client.post<WinboxSessionResponse>(
`/tenants/${tenantId}/devices/${deviceId}/winbox-session`
),
closeWinbox: (tenantId: string, deviceId: string, tunnelId: string) =>
client.delete(
`/tenants/${tenantId}/devices/${deviceId}/winbox-session/${tunnelId}`
),
openSSH: (tenantId: string, deviceId: string, req: SSHSessionRequest) =>
client.post<SSHSessionResponse>(
`/tenants/${tenantId}/devices/${deviceId}/ssh-session`, req
),
getSessions: (tenantId: string, deviceId: string) =>
client.get<ActiveSessionsResponse>(
`/tenants/${tenantId}/devices/${deviceId}/sessions`
),
}
```
## Infrastructure
### nginx — WebSocket Proxy
Add to `infrastructure/docker/nginx-spa.conf`:
```nginx
# WebSocket upgrade mapping (top-level, outside server block)
map $http_upgrade $connection_upgrade {
default upgrade;
'' close;
}
# Inside server block:
location /ws/ssh {
resolver 127.0.0.11 valid=10s ipv6=off;
set $poller_upstream http://poller:8080;
proxy_pass $poller_upstream;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header Host $host;
proxy_read_timeout 1800s;
proxy_send_timeout 1800s;
proxy_buffering off;
proxy_request_buffering off;
proxy_busy_buffers_size 512k;
proxy_buffers 8 512k;
}
```
**CSP**: The existing `connect-src 'self'` should be sufficient for same-origin WebSocket connections in modern browsers (CSP `self` matches same-origin `ws://` and `wss://`). For maximum compatibility across all environments, explicitly add `ws: wss:` to the `connect-src` directive. HTTPS-only deployments can restrict to just `wss:`.
### Docker Compose
**Poller service additions — apply to these specific files:**
- `docker-compose.override.yml` (dev): ports, environment, ulimits, healthcheck
- `docker-compose.prod.yml` (production): ports, environment, ulimits, healthcheck, increased memory limit
- `docker-compose.staging.yml` (staging): same as prod
```yaml
poller:
ports:
- "127.0.0.1:49000-49100:49000-49100"
ulimits:
nofile:
soft: 8192
hard: 8192
environment:
TUNNEL_PORT_MIN: 49000
TUNNEL_PORT_MAX: 49100
TUNNEL_IDLE_TIMEOUT: 300
SSH_RELAY_PORT: 8080
SSH_IDLE_TIMEOUT: 900
SSH_MAX_SESSIONS: 200
SSH_MAX_PER_USER: 10
SSH_MAX_PER_DEVICE: 20
healthcheck:
test: ["CMD-SHELL", "wget --spider -q http://localhost:8080/healthz || exit 1"]
interval: 30s
timeout: 3s
retries: 3
```
**Production memory limit**: Increase poller from 256MB to 384–512MB.
**Redis dependency**: Ensure `depends_on: redis: condition: service_started`.
**Docker proxy note**: The 101-port range mapping creates individual `docker-proxy` processes. For production, set `"userland-proxy": false` in `/etc/docker/daemon.json` to use iptables-based forwarding instead, which avoids spawning 101 proxy processes and improves startup time.
### Poller HTTP Server
```go
httpServer := &http.Server{
Addr: ":" + cfg.SSHRelayPort,
Handler: sshrelay.NewServer(redisClient, credCache).Handler(),
}
go httpServer.ListenAndServe()
// Graceful shutdown with 5s timeout
httpServer.Shutdown(ctx)
```
### New Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `TUNNEL_PORT_MIN` | `49000` | Start of WinBox tunnel port range |
| `TUNNEL_PORT_MAX` | `49100` | End of WinBox tunnel port range |
| `TUNNEL_IDLE_TIMEOUT` | `300` | WinBox tunnel idle timeout (seconds) |
| `SSH_RELAY_PORT` | `8080` | Internal HTTP/WebSocket port for SSH relay |
| `SSH_IDLE_TIMEOUT` | `900` | SSH session idle timeout (seconds) |
| `SSH_MAX_SESSIONS` | `200` | Max concurrent SSH sessions per poller |
| `SSH_MAX_PER_USER` | `10` | Max concurrent SSH sessions per user |
| `SSH_MAX_PER_DEVICE` | `20` | Max concurrent SSH sessions per device |
### Graceful Shutdown
When poller container shuts down:
1. Stop accepting new tunnels and SSH sessions
2. Close HTTP/WebSocket server (5s timeout)
3. Gracefully terminate SSH sessions
4. Close all tunnel listeners
5. Wait for active connections
6. Release tunnel ports
## Testing Strategy
### Unit Tests
**Poller (Go):**
- Port pool: allocation, release, reuse after close, concurrent access, exhaustion, bind failure retry
- Tunnel manager: lifecycle, idle detection with zero active connections, multiple concurrent connections on same tunnel, cleanup when listener creation fails
- TCP proxy: activity tracking (atomic), bidirectional shutdown, dial failure cleanup
- SSH relay: token validation (valid/expired/reused/wrong tenant), session limits, resize parsing and validation, malformed control messages, invalid JSON frames, binary frame size limits, resize flood protection, cleanup on SSH dial failure, cleanup on abrupt WebSocket close
**Backend (Python):**
- RBAC: viewer gets 403, operator gets 200
- Device validation: wrong tenant gets 404, disabled device rejected
- Token generation: stored in Redis with correct TTL
- Rate limiting: 11th request gets 429
- Session limits: exceed per-user/per-device limits gets 429
- Source IP extraction from X-Forwarded-For
- NATS timeout returns 503
- Redis unavailable during token storage
- Malformed request payloads rejected
### Integration Tests
- **Tunnel end-to-end**: API → NATS → poller allocates port → verify listening on 127.0.0.1 → TCP connect → data forwarded to mock router
- **SSH end-to-end**: API issues token → WebSocket → poller validates → SSH to mock SSHD → verify keystroke round-trip and resize
- **Token lifecycle**: consumed on first use, second use rejected, expired token rejected
- **Idle timeout**: open tunnel, no traffic, verify closes after 5min; open SSH, no activity, verify closes after 15min
- **Concurrent sessions**: 10 SSH from same user succeeds, 11th rejected
- **Tunnel stress**: 50 concurrent tunnels, verify unique ports, verify cleanup
- **SSH stress**: many simultaneous WebSocket sessions, verify limits and stability
- **Router unreachable**: SSH dial fails, WebSocket closes with error, no zombie session
- **Poller restart**: sessions terminate, frontend shows disconnect, reconnect works
- **Backward compatibility**: existing polling, config push, NATS subjects unchanged
### Security Tests
- Token replay: reuse consumed token → rejected
- Cross-tenant: user from tenant A accesses device from tenant B → rejected
- Malformed token: invalid base64, wrong length → rejected without panic
### Resource Leak Detection
During integration testing, monitor: open file descriptors, goroutine count, memory usage. Verify SSH sessions and tunnels release all resources after closure.
### Manual Testing
- WinBox tunnel to router behind WireGuard — full WinBox functionality
- SSH terminal — tab completion, arrow keys, command history, line wrapping after resize
- Deep link `winbox://` on Windows — auto-launch
- Copy address fallback on macOS/Linux
- Navigate away with open tunnel — stays open, closes on idle
- Poller restart — frontend handles disconnect, reconnect works
- Multiple SSH terminals to different devices simultaneously
- Dark/light mode terminal theme
- Chrome, Firefox, Safari — WebSocket stability, clipboard, deep link, resize
### Observability Verification
Verify structured JSON logs exist with correct fields for: tunnel created/closed, port allocated, SSH session started/ended (with duration and reason), idle timeout events.
## Rollout Sequence
1. Deploy poller changes to staging (tunnel manager, SSH relay, HTTP server, NATS subjects)
2. Deploy infrastructure changes (docker-compose ports, nginx WebSocket config, CSP, ulimits)
3. Validate tunnels and SSH relay in staging
4. Deploy API endpoints (remote access router, session tokens, audit logging, rate limiting)
5. Deploy frontend (WinBox button, SSH terminal, API client)
6. Update documentation (ARCHITECTURE, DEPLOYMENT, SECURITY, CONFIGURATION, README)
7. Tag as v9.5 with release notes covering: WinBox remote access, browser SSH terminal, new env vars, port range requirement
Never deploy frontend before backend endpoints exist.
## Out of Scope
- WinBox protocol reimplementation in browser
- SSH key authentication (password only, matching existing credential model)
- Session recording/playback
- File transfer through SSH terminal
- Multi-user shared terminal sessions

View File

@@ -1415,7 +1415,7 @@ open http://localhost</code></pre>
<tr><td><code>SMTP_USER</code></td><td><em>(none)</em></td><td>SMTP authentication username</td></tr>
<tr><td><code>SMTP_PASSWORD</code></td><td><em>(none)</em></td><td>SMTP authentication password</td></tr>
<tr><td><code>SMTP_USE_TLS</code></td><td><code>false</code></td><td>Enable STARTTLS for SMTP connections</td></tr>
<tr><td><code>SMTP_FROM_ADDRESS</code></td><td><code>noreply@mikrotik-portal.local</code></td><td>Sender address for outbound emails</td></tr>
<tr><td><code>SMTP_FROM_ADDRESS</code></td><td><code>noreply@the-other-dude.local</code></td><td>Sender address for outbound emails</td></tr>
</tbody>
</table>

View File

@@ -0,0 +1,295 @@
import { useState, useEffect, useCallback, useRef } from 'react'
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
import { Globe, X, Loader2, RefreshCw, Maximize2, Minimize2 } from 'lucide-react'
import { remoteWinboxApi, type RemoteWinBoxSession } from '@/lib/api'
interface RemoteWinBoxButtonProps {
  tenantId: string
  deviceId: string
}

// UI state machine for the session lifecycle. 'failed' and 'terminated' both
// fall back to rendering the launch button (with an explanatory message).
type State = 'idle' | 'requesting' | 'connecting' | 'active' | 'closing' | 'terminated' | 'failed'

/**
 * Launch button + embedded viewer for a browser-based remote WinBox session.
 *
 * Clicking the button POSTs a session-create request, then polls the session
 * every 2s until it reports 'active', at which point the Xpra HTML5 client is
 * embedded in an iframe served through the nginx /xpra/{port}/ proxy. While
 * active, a 1s timer renders a countdown to `expires_at` and locally flips the
 * state to 'terminated' once it elapses.
 */
export function RemoteWinBoxButton({ tenantId, deviceId }: RemoteWinBoxButtonProps) {
  const [state, setState] = useState<State>('idle')
  const [session, setSession] = useState<RemoteWinBoxSession | null>(null)
  const [error, setError] = useState<string | null>(null)
  // Whether the active viewer fills the viewport instead of rendering inline.
  const [expanded, setExpanded] = useState(false)
  // "m:ss" string until expiry, or null while not active.
  const [countdown, setCountdown] = useState<string | null>(null)
  // NOTE(review): pollRef is assigned but never read — interval cleanup is
  // already handled by the polling effect's return function; consider removing.
  const pollRef = useRef<ReturnType<typeof setInterval> | null>(null)
  const queryClient = useQueryClient()

  // Check for existing active sessions on mount
  const { data: existingSessions } = useQuery({
    queryKey: ['remote-winbox-sessions', tenantId, deviceId],
    queryFn: () => remoteWinboxApi.list(tenantId, deviceId),
    refetchOnWindowFocus: false,
  })

  // Adopt a pre-existing session (e.g. after a page reload) instead of
  // creating a duplicate — but only while this component is still idle.
  useEffect(() => {
    if (existingSessions && state === 'idle') {
      const active = existingSessions.find(
        (s) => s.status === 'active' || s.status === 'creating',
      )
      if (active) {
        setSession(active)
        setState(active.status === 'active' ? 'active' : 'connecting')
      }
    }
  }, [existingSessions, state])

  // Poll session status while connecting
  useEffect(() => {
    if (state !== 'connecting' || !session) return
    const poll = setInterval(async () => {
      try {
        const updated = await remoteWinboxApi.get(tenantId, deviceId, session.session_id)
        setSession(updated)
        if (updated.status === 'active') {
          setState('active')
        } else if (updated.status === 'failed') {
          setState('failed')
          setError('Session failed to provision')
        } else if (updated.status === 'terminated') {
          setState('terminated')
        }
      } catch {
        // ignore transient polling errors
      }
    }, 2000)
    pollRef.current = poll
    return () => clearInterval(poll)
  }, [state, session, tenantId, deviceId])

  // Countdown timer for session expiry
  useEffect(() => {
    if (state !== 'active' || !session?.expires_at) {
      setCountdown(null)
      return
    }
    const tick = () => {
      const remaining = Math.max(0, new Date(session.expires_at).getTime() - Date.now())
      if (remaining <= 0) {
        // Expiry is enforced locally as well as (presumably) server-side.
        setCountdown('Expired')
        setState('terminated')
        return
      }
      const mins = Math.floor(remaining / 60000)
      const secs = Math.floor((remaining % 60000) / 1000)
      setCountdown(`${mins}:${secs.toString().padStart(2, '0')}`)
    }
    tick()
    const interval = setInterval(tick, 1000)
    return () => clearInterval(interval)
  }, [state, session?.expires_at])

  const createMutation = useMutation({
    mutationFn: () => remoteWinboxApi.create(tenantId, deviceId),
    onSuccess: (data) => {
      setSession(data)
      // Backend may return an already-active session immediately.
      if (data.status === 'active') {
        setState('active')
      } else {
        setState('connecting')
      }
    },
    onError: (err: any) => {
      setState('failed')
      setError(err.response?.data?.detail || 'Failed to create session')
    },
  })

  const closeMutation = useMutation({
    mutationFn: () => {
      if (!session) throw new Error('No session')
      return remoteWinboxApi.delete(tenantId, deviceId, session.session_id)
    },
    onSuccess: () => {
      setState('idle')
      setSession(null)
      setError(null)
      queryClient.invalidateQueries({ queryKey: ['remote-winbox-sessions', tenantId, deviceId] })
    },
    onError: (err: any) => {
      setState('failed')
      setError(err.response?.data?.detail || 'Failed to close session')
    },
  })

  const handleOpen = useCallback(() => {
    setState('requesting')
    setError(null)
    createMutation.mutate()
  }, [createMutation])

  const handleClose = useCallback(() => {
    setState('closing')
    closeMutation.mutate()
  }, [closeMutation])

  // NOTE(review): handleRetry is not referenced by any render path below —
  // retry currently goes through handleOpen/handleReset; consider removing.
  const handleRetry = useCallback(() => {
    setSession(null)
    setError(null)
    handleOpen()
  }, [handleOpen])

  // Best-effort teardown of every live session for this device, then reset
  // local state regardless of individual delete outcomes.
  const handleReset = useCallback(async () => {
    try {
      const sessions = await remoteWinboxApi.list(tenantId, deviceId)
      for (const s of sessions) {
        if (s.status === 'active' || s.status === 'creating' || s.status === 'grace') {
          await remoteWinboxApi.delete(tenantId, deviceId, s.session_id)
        }
      }
    } catch {
      // ignore cleanup errors
    }
    setState('idle')
    setSession(null)
    setError(null)
    queryClient.invalidateQueries({ queryKey: ['remote-winbox-sessions', tenantId, deviceId] })
  }, [tenantId, deviceId, queryClient])

  // Build iframe URL: load Xpra HTML5 client directly via nginx /xpra/{port}/ proxy
  // path= tells the Xpra HTML5 client where to open the WebSocket connection
  const iframeSrc = session?.session_id && session?.xpra_ws_port
    ? `/xpra/${session.xpra_ws_port}/index.html?path=/xpra/${session.xpra_ws_port}/&keyboard=false&floating_menu=false&sharing=false&clipboard=false`
    : null

  // Idle / Failed / Terminated states — show button
  if (state === 'idle' || state === 'failed' || state === 'terminated') {
    return (
      <div>
        <div className="flex items-center gap-2">
          <button
            onClick={handleOpen}
            disabled={createMutation.isPending}
            className="inline-flex items-center gap-2 px-4 py-2 rounded-md bg-primary text-primary-foreground hover:bg-primary/90 disabled:opacity-50"
          >
            {createMutation.isPending ? (
              <Loader2 className="h-4 w-4 animate-spin" />
            ) : (
              <Globe className="h-4 w-4" />
            )}
            {createMutation.isPending ? 'Starting...' : 'Remote WinBox'}
          </button>
          <button
            onClick={handleReset}
            className="inline-flex items-center gap-2 px-4 py-2 rounded-md border border-input bg-background hover:bg-accent hover:text-accent-foreground"
            title="Reset all remote WinBox sessions for this device"
          >
            <RefreshCw className="h-4 w-4" />
            Reset
          </button>
        </div>
        {state === 'failed' && error && (
          <div className="mt-2 flex items-center gap-2">
            <p className="text-sm text-destructive">{error}</p>
          </div>
        )}
        {state === 'terminated' && (
          <p className="mt-2 text-sm text-muted-foreground">Session ended</p>
        )}
      </div>
    )
  }

  // Requesting / Connecting — spinner
  if (state === 'requesting' || state === 'connecting') {
    return (
      <div className="rounded-md border p-4 space-y-2">
        <div className="flex items-center gap-2">
          <Loader2 className="h-4 w-4 animate-spin" />
          <p className="text-sm font-medium">
            {state === 'requesting' ? 'Requesting session...' : 'Provisioning WinBox container...'}
          </p>
        </div>
        <p className="text-xs text-muted-foreground">This may take a few seconds</p>
      </div>
    )
  }

  // Closing
  if (state === 'closing') {
    return (
      <div className="rounded-md border p-4">
        <div className="flex items-center gap-2">
          <Loader2 className="h-4 w-4 animate-spin" />
          <p className="text-sm font-medium">Closing session...</p>
        </div>
      </div>
    )
  }

  // Active — show iframe
  if (state === 'active' && iframeSrc) {
    return (
      <div
        className={
          expanded
            ? 'fixed inset-0 z-50 bg-background flex flex-col'
            : 'rounded-md border flex flex-col'
        }
      >
        {/* Header bar */}
        <div className="flex items-center justify-between px-3 py-2 border-b bg-muted/50">
          <div className="flex items-center gap-2">
            <Globe className="h-4 w-4 text-primary" />
            <span className="text-sm font-medium">Remote WinBox</span>
            {countdown && (
              <span className="text-xs text-muted-foreground">
                Expires in {countdown}
              </span>
            )}
          </div>
          <div className="flex items-center gap-1">
            <button
              onClick={() => setExpanded(!expanded)}
              className="p-1.5 rounded hover:bg-accent"
              title={expanded ? 'Minimize' : 'Maximize'}
            >
              {expanded ? (
                <Minimize2 className="h-4 w-4" />
              ) : (
                <Maximize2 className="h-4 w-4" />
              )}
            </button>
            <button
              onClick={handleClose}
              disabled={closeMutation.isPending}
              className="p-1.5 rounded hover:bg-accent disabled:opacity-50"
              title="Close session"
            >
              <X className="h-4 w-4" />
            </button>
          </div>
        </div>
        {/* Xpra iframe */}
        <iframe
          src={iframeSrc}
          className={expanded ? 'flex-1 w-full' : 'w-full h-[600px]'}
          style={{ border: 'none' }}
          allow="clipboard-read; clipboard-write"
          title="Remote WinBox Session"
        />
      </div>
    )
  }

  // Active but no iframe URL (missing xpra_ws_port) — show reset option
  return (
    <div className="rounded-md border p-4 space-y-2">
      <p className="text-sm text-destructive">Session active but display unavailable</p>
      <button
        onClick={handleReset}
        className="inline-flex items-center gap-2 px-3 py-1.5 rounded-md border border-input bg-background hover:bg-accent text-sm"
      >
        <RefreshCw className="h-3 w-3" />
        Reset
      </button>
    </div>
  )
}

View File

@@ -3,9 +3,9 @@ import { Link, useNavigate } from '@tanstack/react-router'
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
import { useAuth, isSuperAdmin, isTenantAdmin } from '@/lib/auth'
import { authApi } from '@/lib/api'
import { getSMTPSettings, updateSMTPSettings, testSMTPSettings } from '@/lib/settingsApi'
import { getSMTPSettings, updateSMTPSettings, testSMTPSettings, clearWinboxSessions } from '@/lib/settingsApi'
import { SMTP_PRESETS } from '@/lib/smtpPresets'
import { Settings, User, Shield, Info, Key, Lock, ChevronRight, Download, Trash2, AlertTriangle, Mail } from 'lucide-react'
import { Settings, User, Shield, Info, Key, Lock, ChevronRight, Download, Trash2, AlertTriangle, Mail, Monitor } from 'lucide-react'
import { Button } from '@/components/ui/button'
import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTitle } from '@/components/ui/dialog'
import { Input } from '@/components/ui/input'
@@ -149,6 +149,34 @@ export function SettingsPage() {
</div>
)}
{/* Maintenance — super_admin only */}
{isSuperAdmin(user) && (
<div className="rounded-lg border border-border bg-surface px-4 py-3 space-y-1">
<SectionHeader icon={Monitor} title="Maintenance" />
<div className="flex items-center justify-between py-2">
<div>
<span className="text-sm text-text-primary">Clear WinBox Sessions</span>
<p className="text-xs text-text-muted">Remove stale sessions and rate limits from Redis</p>
</div>
<Button
variant="outline"
size="sm"
onClick={async () => {
try {
const result = await clearWinboxSessions()
toast.success(`Cleared ${result.deleted} key${result.deleted !== 1 ? 's' : ''} from Redis`)
} catch {
toast.error('Failed to clear WinBox sessions')
}
}}
>
<Trash2 className="h-3.5 w-3.5 mr-1.5" />
Clear
</Button>
</div>
</div>
)}
{/* System Email (SMTP) — super_admin only */}
{isSuperAdmin(user) && <SMTPSettingsSection />}

View File

@@ -968,6 +968,59 @@ export const remoteAccessApi = {
.then((r) => r.data),
}
// ─── Remote WinBox (Browser) ─────────────────────────────────────────────────

// Server-side representation of a browser-based remote WinBox session as
// returned by the winbox-remote-sessions endpoints.
export interface RemoteWinBoxSession {
  session_id: string
  // Lifecycle states; 'failed' is terminal on provisioning error.
  status: 'creating' | 'active' | 'grace' | 'terminating' | 'terminated' | 'failed'
  websocket_path?: string
  // Per-session Xpra WebSocket port, used to build the /xpra/{port}/ iframe URL.
  xpra_ws_port?: number
  idle_timeout_seconds: number
  max_lifetime_seconds: number
  // ISO timestamps. expires_at drives the client-side countdown;
  // max_expires_at is the hard lifetime cap.
  expires_at: string
  max_expires_at: string
  created_at?: string
}

// CRUD client for remote WinBox sessions scoped to a tenant/device pair.
export const remoteWinboxApi = {
  // Create a session; optional overrides for idle/lifetime timeouts.
  create: (tenantId: string, deviceId: string, opts?: {
    idle_timeout_seconds?: number
    max_lifetime_seconds?: number
  }) =>
    api
      .post<RemoteWinBoxSession>(
        `/api/tenants/${tenantId}/devices/${deviceId}/winbox-remote-sessions`,
        opts || {},
      )
      .then((r) => r.data),
  // Fetch a single session by id (used for status polling).
  get: (tenantId: string, deviceId: string, sessionId: string) =>
    api
      .get<RemoteWinBoxSession>(
        `/api/tenants/${tenantId}/devices/${deviceId}/winbox-remote-sessions/${sessionId}`,
      )
      .then((r) => r.data),
  // List all sessions for the device (any status).
  list: (tenantId: string, deviceId: string) =>
    api
      .get<RemoteWinBoxSession[]>(
        `/api/tenants/${tenantId}/devices/${deviceId}/winbox-remote-sessions`,
      )
      .then((r) => r.data),
  // Terminate a session.
  delete: (tenantId: string, deviceId: string, sessionId: string) =>
    api
      .delete(
        `/api/tenants/${tenantId}/devices/${deviceId}/winbox-remote-sessions/${sessionId}`,
      )
      .then((r) => r.data),
  // Build an absolute ws:// or wss:// URL (matching the page scheme) for a
  // server-provided session path.
  getWebSocketUrl: (sessionPath: string) => {
    const proto = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
    return `${proto}//${window.location.host}${sessionPath}`
  },
}
// ─── Config History ─────────────────────────────────────────────────────────
export interface ConfigChangeEntry {

View File

@@ -10,7 +10,7 @@
* - localStorage and sessionStorage are NEVER used for any key material.
*/
const DB_NAME = 'mikrotik-portal-keys';
const DB_NAME = 'the-other-dude-keys';
const DB_VERSION = 1;
const STORE_NAME = 'secret-keys';

View File

@@ -28,6 +28,11 @@ export async function updateSMTPSettings(data: {
await api.put('/api/settings/smtp', data)
}
/**
 * Purge stale remote WinBox session keys and rate-limit entries from Redis.
 * Returns the backend's status string and the number of deleted keys.
 */
export async function clearWinboxSessions(): Promise<{ status: string; deleted: number }> {
  const { data } = await api.delete('/api/settings/winbox-sessions')
  return data
}
export async function testSMTPSettings(data: {
to: string
smtp_host?: string

View File

@@ -57,6 +57,7 @@ import { useSimpleConfigMode } from '@/hooks/useSimpleConfig'
import { SimpleModeToggle } from '@/components/simple-config/SimpleModeToggle'
import { SimpleConfigView } from '@/components/simple-config/SimpleConfigView'
import { WinBoxButton } from '@/components/fleet/WinBoxButton'
import { RemoteWinBoxButton } from '@/components/fleet/RemoteWinBoxButton'
import { SSHTerminal } from '@/components/fleet/SSHTerminal'
export const Route = createFileRoute(
@@ -456,7 +457,10 @@ function DeviceDetailPage() {
{user?.role !== 'viewer' && (
<div className="flex gap-2">
{device.routeros_version !== null && (
<WinBoxButton tenantId={tenantId} deviceId={deviceId} />
<>
<WinBoxButton tenantId={tenantId} deviceId={deviceId} />
<RemoteWinBoxButton tenantId={tenantId} deviceId={deviceId} />
</>
)}
<SSHTerminal tenantId={tenantId} deviceId={deviceId} deviceName={device.hostname} />
</div>

View File

@@ -8,7 +8,7 @@ setup('authenticate', async ({ page }) => {
// Use legacy-auth test user (no SRP/Secret Key required)
await page.getByLabel(/email/i).fill(
process.env.TEST_ADMIN_EMAIL || 'e2e-test@mikrotik-portal.dev'
process.env.TEST_ADMIN_EMAIL || 'e2e-test@the-other-dude.dev'
)
await page.getByLabel(/password/i).fill(
process.env.TEST_ADMIN_PASSWORD || 'admin123'

View File

@@ -28,7 +28,7 @@ test.describe('Login Flow', () => {
const loginPage = new LoginPage(page)
await loginPage.goto()
await loginPage.login(
process.env.TEST_ADMIN_EMAIL || 'e2e-test@mikrotik-portal.dev',
process.env.TEST_ADMIN_EMAIL || 'e2e-test@the-other-dude.dev',
process.env.TEST_ADMIN_PASSWORD || 'admin123'
)
// Legacy auth user may trigger SRP upgrade dialog -- handle it

View File

@@ -23,7 +23,7 @@ server {
# CSP for React SPA with Tailwind CSS and Leaflet maps
# worker-src required for SRP key derivation Web Worker (Safari won't fall back to script-src)
add_header Content-Security-Policy "default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https://*.tile.openstreetmap.org; font-src 'self'; connect-src 'self' ws: wss:; worker-src 'self'; frame-ancestors 'none'; base-uri 'self'; form-action 'self';" always;
add_header Content-Security-Policy "default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https://*.tile.openstreetmap.org; font-src 'self'; connect-src 'self' ws: wss:; worker-src 'self'; frame-ancestors 'self'; base-uri 'self'; form-action 'self';" always;
# Proxy API requests to the backend service
# The api container is reachable via Docker internal DNS as "api" on port 8000
@@ -68,6 +68,29 @@ server {
proxy_buffers 8 512k;
}
# Proxy Xpra HTML5 client requests to the winbox-worker container
# URL shape: /xpra/{port}/{path} — the captured port selects the per-session
# Xpra server inside the worker; the remaining path is forwarded verbatim.
location ~ ^/xpra/(\d+)/(.*) {
# Re-resolve the worker's Docker DNS name (Docker embedded DNS at 127.0.0.11)
# every 10s so container restarts with a new IP don't require an nginx reload.
resolver 127.0.0.11 valid=10s ipv6=off;
set $xpra_port $1;
set $xpra_path $2;
# Using a variable in proxy_pass forces runtime DNS resolution via the
# resolver above (a literal hostname would be resolved only at startup).
set $worker_upstream winbox-worker;
proxy_pass http://$worker_upstream:$xpra_port/$xpra_path$is_args$args;
# HTTP/1.1 plus Upgrade/Connection headers are required for the Xpra WebSocket.
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
proxy_read_timeout 300s;
# Stream frames immediately — buffering would add latency to the remote UI.
proxy_buffering off;
# Xpra HTML5 client uses inline event handlers and eval — override the
# strict server-level CSP. Adding any add_header in a location block
# replaces all inherited server-level add_header directives in nginx.
add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval' ws: wss: data: blob:; frame-ancestors 'self';" always;
add_header X-Content-Type-Options "nosniff" always;
}
# Serve static assets with long cache headers
# Note: add_header in a location block clears parent-block headers,
# so we re-add the essential security header for static assets.

View File

@@ -1,7 +1,7 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "mikrotik-portal.name" -}}
{{- define "the-other-dude.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
@@ -10,7 +10,7 @@ Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "mikrotik-portal.fullname" -}}
{{- define "the-other-dude.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
@@ -26,16 +26,16 @@ If release name contains chart name it will be used as a full name.
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "mikrotik-portal.chart" -}}
{{- define "the-other-dude.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels applied to all resources.
*/}}
{{- define "mikrotik-portal.labels" -}}
helm.sh/chart: {{ include "mikrotik-portal.chart" . }}
{{ include "mikrotik-portal.selectorLabels" . }}
{{- define "the-other-dude.labels" -}}
helm.sh/chart: {{ include "the-other-dude.chart" . }}
{{ include "the-other-dude.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
@@ -45,81 +45,81 @@ app.kubernetes.io/managed-by: {{ .Release.Service }}
{{/*
Selector labels used in Deployments/Services to match pods.
*/}}
{{- define "mikrotik-portal.selectorLabels" -}}
app.kubernetes.io/name: {{ include "mikrotik-portal.name" . }}
{{- define "the-other-dude.selectorLabels" -}}
app.kubernetes.io/name: {{ include "the-other-dude.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
API component labels
*/}}
{{- define "mikrotik-portal.apiLabels" -}}
{{ include "mikrotik-portal.labels" . }}
{{- define "the-other-dude.apiLabels" -}}
{{ include "the-other-dude.labels" . }}
app.kubernetes.io/component: api
{{- end }}
{{/*
API selector labels
*/}}
{{- define "mikrotik-portal.apiSelectorLabels" -}}
{{ include "mikrotik-portal.selectorLabels" . }}
{{- define "the-other-dude.apiSelectorLabels" -}}
{{ include "the-other-dude.selectorLabels" . }}
app.kubernetes.io/component: api
{{- end }}
{{/*
Frontend component labels
*/}}
{{- define "mikrotik-portal.frontendLabels" -}}
{{ include "mikrotik-portal.labels" . }}
{{- define "the-other-dude.frontendLabels" -}}
{{ include "the-other-dude.labels" . }}
app.kubernetes.io/component: frontend
{{- end }}
{{/*
Frontend selector labels
*/}}
{{- define "mikrotik-portal.frontendSelectorLabels" -}}
{{ include "mikrotik-portal.selectorLabels" . }}
{{- define "the-other-dude.frontendSelectorLabels" -}}
{{ include "the-other-dude.selectorLabels" . }}
app.kubernetes.io/component: frontend
{{- end }}
{{/*
PostgreSQL component labels
*/}}
{{- define "mikrotik-portal.postgresLabels" -}}
{{ include "mikrotik-portal.labels" . }}
{{- define "the-other-dude.postgresLabels" -}}
{{ include "the-other-dude.labels" . }}
app.kubernetes.io/component: postgres
{{- end }}
{{/*
PostgreSQL selector labels
*/}}
{{- define "mikrotik-portal.postgresSelectorLabels" -}}
{{ include "mikrotik-portal.selectorLabels" . }}
{{- define "the-other-dude.postgresSelectorLabels" -}}
{{ include "the-other-dude.selectorLabels" . }}
app.kubernetes.io/component: postgres
{{- end }}
{{/*
Redis component labels
*/}}
{{- define "mikrotik-portal.redisLabels" -}}
{{ include "mikrotik-portal.labels" . }}
{{- define "the-other-dude.redisLabels" -}}
{{ include "the-other-dude.labels" . }}
app.kubernetes.io/component: redis
{{- end }}
{{/*
Redis selector labels
*/}}
{{- define "mikrotik-portal.redisSelectorLabels" -}}
{{ include "mikrotik-portal.selectorLabels" . }}
{{- define "the-other-dude.redisSelectorLabels" -}}
{{ include "the-other-dude.selectorLabels" . }}
app.kubernetes.io/component: redis
{{- end }}
{{/*
Create the name of the service account to use.
*/}}
{{- define "mikrotik-portal.serviceAccountName" -}}
{{- define "the-other-dude.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "mikrotik-portal.fullname" .) .Values.serviceAccount.name }}
{{- default (include "the-other-dude.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
@@ -129,9 +129,9 @@ Create the name of the service account to use.
Database URL for the API service (constructed from service names).
Uses external URL if postgres.enabled=false.
*/}}
{{- define "mikrotik-portal.databaseUrl" -}}
{{- define "the-other-dude.databaseUrl" -}}
{{- if .Values.postgres.enabled }}
{{- printf "postgresql+asyncpg://%s:%s@%s-postgres:%d/%s" .Values.postgres.auth.username .Values.secrets.dbPassword (include "mikrotik-portal.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database }}
{{- printf "postgresql+asyncpg://%s:%s@%s-postgres:%d/%s" .Values.postgres.auth.username .Values.secrets.dbPassword (include "the-other-dude.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database }}
{{- else }}
{{- .Values.postgres.externalUrl }}
{{- end }}
@@ -140,9 +140,9 @@ Uses external URL if postgres.enabled=false.
{{/*
App user database URL (RLS enforced).
*/}}
{{- define "mikrotik-portal.appUserDatabaseUrl" -}}
{{- define "the-other-dude.appUserDatabaseUrl" -}}
{{- if .Values.postgres.enabled }}
{{- printf "postgresql+asyncpg://%s:%s@%s-postgres:%d/%s" .Values.postgres.auth.appUsername .Values.secrets.dbAppPassword (include "mikrotik-portal.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database }}
{{- printf "postgresql+asyncpg://%s:%s@%s-postgres:%d/%s" .Values.postgres.auth.appUsername .Values.secrets.dbAppPassword (include "the-other-dude.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database }}
{{- else }}
{{- .Values.postgres.externalUrl }}
{{- end }}
@@ -151,9 +151,9 @@ App user database URL (RLS enforced).
{{/*
Sync database URL for Alembic migrations.
*/}}
{{- define "mikrotik-portal.syncDatabaseUrl" -}}
{{- define "the-other-dude.syncDatabaseUrl" -}}
{{- if .Values.postgres.enabled }}
{{- printf "postgresql+psycopg2://%s:%s@%s-postgres:%d/%s" .Values.postgres.auth.username .Values.secrets.dbPassword (include "mikrotik-portal.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database }}
{{- printf "postgresql+psycopg2://%s:%s@%s-postgres:%d/%s" .Values.postgres.auth.username .Values.secrets.dbPassword (include "the-other-dude.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database }}
{{- else }}
{{- .Values.postgres.externalUrl | replace "asyncpg" "psycopg2" }}
{{- end }}
@@ -162,9 +162,9 @@ Sync database URL for Alembic migrations.
{{/*
Redis URL (constructed from service name).
*/}}
{{- define "mikrotik-portal.redisUrl" -}}
{{- define "the-other-dude.redisUrl" -}}
{{- if .Values.redis.enabled }}
{{- printf "redis://%s-redis:%d/0" (include "mikrotik-portal.fullname" .) (int .Values.redis.service.port) }}
{{- printf "redis://%s-redis:%d/0" (include "the-other-dude.fullname" .) (int .Values.redis.service.port) }}
{{- else }}
{{- .Values.redis.externalUrl | default "redis://localhost:6379/0" }}
{{- end }}

View File

@@ -1,18 +1,18 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-api
name: {{ include "the-other-dude.fullname" . }}-api
labels:
{{- include "mikrotik-portal.apiLabels" . | nindent 4 }}
{{- include "the-other-dude.apiLabels" . | nindent 4 }}
spec:
replicas: {{ .Values.api.replicaCount }}
selector:
matchLabels:
{{- include "mikrotik-portal.apiSelectorLabels" . | nindent 6 }}
{{- include "the-other-dude.apiSelectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "mikrotik-portal.apiSelectorLabels" . | nindent 8 }}
{{- include "the-other-dude.apiSelectorLabels" . | nindent 8 }}
spec:
securityContext:
runAsNonRoot: true
@@ -29,28 +29,28 @@ spec:
# Load non-sensitive config from ConfigMap
envFrom:
- configMapRef:
name: {{ include "mikrotik-portal.fullname" . }}-config
name: {{ include "the-other-dude.fullname" . }}-config
# Load secrets as individual environment variables
env:
- name: JWT_SECRET_KEY
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: JWT_SECRET_KEY
- name: CREDENTIAL_ENCRYPTION_KEY
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: CREDENTIAL_ENCRYPTION_KEY
- name: FIRST_ADMIN_EMAIL
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: FIRST_ADMIN_EMAIL
- name: FIRST_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: FIRST_ADMIN_PASSWORD
livenessProbe:
httpGet:

View File

@@ -1,9 +1,9 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-api
name: {{ include "the-other-dude.fullname" . }}-api
labels:
{{- include "mikrotik-portal.apiLabels" . | nindent 4 }}
{{- include "the-other-dude.apiLabels" . | nindent 4 }}
spec:
type: {{ .Values.api.service.type }}
ports:
@@ -12,4 +12,4 @@ spec:
protocol: TCP
name: http
selector:
{{- include "mikrotik-portal.apiSelectorLabels" . | nindent 4 }}
{{- include "the-other-dude.apiSelectorLabels" . | nindent 4 }}

View File

@@ -1,15 +1,15 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-config
name: {{ include "the-other-dude.fullname" . }}-config
labels:
{{- include "mikrotik-portal.labels" . | nindent 4 }}
{{- include "the-other-dude.labels" . | nindent 4 }}
data:
DATABASE_URL: {{ include "mikrotik-portal.databaseUrl" . | quote }}
SYNC_DATABASE_URL: {{ include "mikrotik-portal.syncDatabaseUrl" . | quote }}
APP_USER_DATABASE_URL: {{ include "mikrotik-portal.appUserDatabaseUrl" . | quote }}
REDIS_URL: {{ include "mikrotik-portal.redisUrl" . | quote }}
NATS_URL: {{ printf "nats://%s-nats:%d" (include "mikrotik-portal.fullname" .) (int .Values.nats.service.port) | quote }}
DATABASE_URL: {{ include "the-other-dude.databaseUrl" . | quote }}
SYNC_DATABASE_URL: {{ include "the-other-dude.syncDatabaseUrl" . | quote }}
APP_USER_DATABASE_URL: {{ include "the-other-dude.appUserDatabaseUrl" . | quote }}
REDIS_URL: {{ include "the-other-dude.redisUrl" . | quote }}
NATS_URL: {{ printf "nats://%s-nats:%d" (include "the-other-dude.fullname" .) (int .Values.nats.service.port) | quote }}
JWT_ALGORITHM: "HS256"
JWT_ACCESS_TOKEN_EXPIRE_MINUTES: {{ .Values.api.env.jwtAccessTokenExpireMinutes | quote }}
JWT_REFRESH_TOKEN_EXPIRE_DAYS: {{ .Values.api.env.jwtRefreshTokenExpireDays | quote }}

View File

@@ -1,18 +1,18 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-frontend
name: {{ include "the-other-dude.fullname" . }}-frontend
labels:
{{- include "mikrotik-portal.frontendLabels" . | nindent 4 }}
{{- include "the-other-dude.frontendLabels" . | nindent 4 }}
spec:
replicas: {{ .Values.frontend.replicaCount }}
selector:
matchLabels:
{{- include "mikrotik-portal.frontendSelectorLabels" . | nindent 6 }}
{{- include "the-other-dude.frontendSelectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "mikrotik-portal.frontendSelectorLabels" . | nindent 8 }}
{{- include "the-other-dude.frontendSelectorLabels" . | nindent 8 }}
spec:
containers:
- name: frontend
@@ -42,9 +42,9 @@ spec:
apiVersion: v1
kind: Service
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-frontend
name: {{ include "the-other-dude.fullname" . }}-frontend
labels:
{{- include "mikrotik-portal.frontendLabels" . | nindent 4 }}
{{- include "the-other-dude.frontendLabels" . | nindent 4 }}
spec:
type: {{ .Values.frontend.service.type }}
ports:
@@ -53,4 +53,4 @@ spec:
protocol: TCP
name: http
selector:
{{- include "mikrotik-portal.frontendSelectorLabels" . | nindent 4 }}
{{- include "the-other-dude.frontendSelectorLabels" . | nindent 4 }}

View File

@@ -2,9 +2,9 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: {{ include "mikrotik-portal.fullname" . }}
name: {{ include "the-other-dude.fullname" . }}
labels:
{{- include "mikrotik-portal.labels" . | nindent 4 }}
{{- include "the-other-dude.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
@@ -16,11 +16,11 @@ spec:
{{- if .Values.ingress.tls.enabled }}
tls:
- hosts:
- {{ .Values.ingress.host | default "mikrotik-portal.local" | quote }}
secretName: {{ .Values.ingress.tls.secretName | default (printf "%s-tls" (include "mikrotik-portal.fullname" .)) | quote }}
- {{ .Values.ingress.host | default "the-other-dude.local" | quote }}
secretName: {{ .Values.ingress.tls.secretName | default (printf "%s-tls" (include "the-other-dude.fullname" .)) | quote }}
{{- end }}
rules:
- host: {{ .Values.ingress.host | default "mikrotik-portal.local" | quote }}
- host: {{ .Values.ingress.host | default "the-other-dude.local" | quote }}
http:
paths:
# API routes — send /api/* to the FastAPI service
@@ -28,7 +28,7 @@ spec:
pathType: Prefix
backend:
service:
name: {{ include "mikrotik-portal.fullname" . }}-api
name: {{ include "the-other-dude.fullname" . }}-api
port:
number: {{ .Values.api.service.port }}
# Docs routes — proxy /docs and /redoc to API as well
@@ -36,14 +36,14 @@ spec:
pathType: Prefix
backend:
service:
name: {{ include "mikrotik-portal.fullname" . }}-api
name: {{ include "the-other-dude.fullname" . }}-api
port:
number: {{ .Values.api.service.port }}
- path: /redoc
pathType: Prefix
backend:
service:
name: {{ include "mikrotik-portal.fullname" . }}-api
name: {{ include "the-other-dude.fullname" . }}-api
port:
number: {{ .Values.api.service.port }}
# Frontend SPA — all other routes go to nginx
@@ -51,7 +51,7 @@ spec:
pathType: Prefix
backend:
service:
name: {{ include "mikrotik-portal.fullname" . }}-frontend
name: {{ include "the-other-dude.fullname" . }}-frontend
port:
number: {{ .Values.frontend.service.port }}
{{- end }}

View File

@@ -4,9 +4,9 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-nats-headless
name: {{ include "the-other-dude.fullname" . }}-nats-headless
labels:
{{- include "mikrotik-portal.labels" . | nindent 4 }}
{{- include "the-other-dude.labels" . | nindent 4 }}
app.kubernetes.io/component: nats
spec:
clusterIP: None
@@ -15,16 +15,16 @@ spec:
port: 4222
targetPort: 4222
selector:
{{- include "mikrotik-portal.selectorLabels" . | nindent 4 }}
{{- include "the-other-dude.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: nats
---
# NATS ClusterIP service for client access
apiVersion: v1
kind: Service
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-nats
name: {{ include "the-other-dude.fullname" . }}-nats
labels:
{{- include "mikrotik-portal.labels" . | nindent 4 }}
{{- include "the-other-dude.labels" . | nindent 4 }}
app.kubernetes.io/component: nats
spec:
type: ClusterIP
@@ -36,28 +36,28 @@ spec:
port: 8222
targetPort: 8222
selector:
{{- include "mikrotik-portal.selectorLabels" . | nindent 4 }}
{{- include "the-other-dude.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: nats
---
# NATS JetStream StatefulSet (needs stable storage)
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-nats
name: {{ include "the-other-dude.fullname" . }}-nats
labels:
{{- include "mikrotik-portal.labels" . | nindent 4 }}
{{- include "the-other-dude.labels" . | nindent 4 }}
app.kubernetes.io/component: nats
spec:
replicas: 1
serviceName: {{ include "mikrotik-portal.fullname" . }}-nats-headless
serviceName: {{ include "the-other-dude.fullname" . }}-nats-headless
selector:
matchLabels:
{{- include "mikrotik-portal.selectorLabels" . | nindent 6 }}
{{- include "the-other-dude.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: nats
template:
metadata:
labels:
{{- include "mikrotik-portal.selectorLabels" . | nindent 8 }}
{{- include "the-other-dude.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: nats
spec:
containers:

View File

@@ -2,20 +2,20 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-poller
name: {{ include "the-other-dude.fullname" . }}-poller
labels:
{{- include "mikrotik-portal.labels" . | nindent 4 }}
{{- include "the-other-dude.labels" . | nindent 4 }}
app.kubernetes.io/component: poller
spec:
replicas: {{ .Values.poller.replicaCount }}
selector:
matchLabels:
{{- include "mikrotik-portal.selectorLabels" . | nindent 6 }}
{{- include "the-other-dude.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: poller
template:
metadata:
labels:
{{- include "mikrotik-portal.selectorLabels" . | nindent 8 }}
{{- include "the-other-dude.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: poller
spec:
containers:
@@ -26,32 +26,32 @@ spec:
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: POLLER_DATABASE_URL
- name: CREDENTIAL_ENCRYPTION_KEY
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: CREDENTIAL_ENCRYPTION_KEY
- name: NATS_URL
valueFrom:
configMapKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-config
name: {{ include "the-other-dude.fullname" . }}-config
key: NATS_URL
- name: REDIS_URL
valueFrom:
configMapKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-config
name: {{ include "the-other-dude.fullname" . }}-config
key: REDIS_URL
- name: POLL_INTERVAL_SECONDS
valueFrom:
configMapKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-config
name: {{ include "the-other-dude.fullname" . }}-config
key: POLL_INTERVAL_SECONDS
- name: LOG_LEVEL
valueFrom:
configMapKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-config
name: {{ include "the-other-dude.fullname" . }}-config
key: POLLER_LOG_LEVEL
resources:
requests:

View File

@@ -2,19 +2,19 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-postgres
name: {{ include "the-other-dude.fullname" . }}-postgres
labels:
{{- include "mikrotik-portal.postgresLabels" . | nindent 4 }}
{{- include "the-other-dude.postgresLabels" . | nindent 4 }}
spec:
serviceName: {{ include "mikrotik-portal.fullname" . }}-postgres
serviceName: {{ include "the-other-dude.fullname" . }}-postgres
replicas: 1
selector:
matchLabels:
{{- include "mikrotik-portal.postgresSelectorLabels" . | nindent 6 }}
{{- include "the-other-dude.postgresSelectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "mikrotik-portal.postgresSelectorLabels" . | nindent 8 }}
{{- include "the-other-dude.postgresSelectorLabels" . | nindent 8 }}
spec:
containers:
- name: postgres
@@ -32,14 +32,14 @@ spec:
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: DB_PASSWORD
- name: APP_USER
value: {{ .Values.postgres.auth.appUsername | quote }}
- name: APP_USER_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
key: DB_APP_PASSWORD
volumeMounts:
- name: postgres-data
@@ -74,7 +74,7 @@ spec:
volumes:
- name: init-scripts
configMap:
name: {{ include "mikrotik-portal.fullname" . }}-postgres-init
name: {{ include "the-other-dude.fullname" . }}-postgres-init
volumeClaimTemplates:
- metadata:
name: postgres-data
@@ -90,9 +90,9 @@ spec:
apiVersion: v1
kind: Service
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-postgres
name: {{ include "the-other-dude.fullname" . }}-postgres
labels:
{{- include "mikrotik-portal.postgresLabels" . | nindent 4 }}
{{- include "the-other-dude.postgresLabels" . | nindent 4 }}
spec:
type: ClusterIP
clusterIP: None
@@ -102,14 +102,14 @@ spec:
protocol: TCP
name: postgres
selector:
{{- include "mikrotik-portal.postgresSelectorLabels" . | nindent 4 }}
{{- include "the-other-dude.postgresSelectorLabels" . | nindent 4 }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-postgres-init
name: {{ include "the-other-dude.fullname" . }}-postgres-init
labels:
{{- include "mikrotik-portal.postgresLabels" . | nindent 4 }}
{{- include "the-other-dude.postgresLabels" . | nindent 4 }}
data:
init.sql: |
-- Create non-superuser app_user role for RLS enforcement

View File

@@ -2,18 +2,18 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-redis
name: {{ include "the-other-dude.fullname" . }}-redis
labels:
{{- include "mikrotik-portal.redisLabels" . | nindent 4 }}
{{- include "the-other-dude.redisLabels" . | nindent 4 }}
spec:
replicas: 1
selector:
matchLabels:
{{- include "mikrotik-portal.redisSelectorLabels" . | nindent 6 }}
{{- include "the-other-dude.redisSelectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "mikrotik-portal.redisSelectorLabels" . | nindent 8 }}
{{- include "the-other-dude.redisSelectorLabels" . | nindent 8 }}
spec:
containers:
- name: redis
@@ -45,9 +45,9 @@ spec:
apiVersion: v1
kind: Service
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-redis
name: {{ include "the-other-dude.fullname" . }}-redis
labels:
{{- include "mikrotik-portal.redisLabels" . | nindent 4 }}
{{- include "the-other-dude.redisLabels" . | nindent 4 }}
spec:
type: ClusterIP
ports:
@@ -56,5 +56,5 @@ spec:
protocol: TCP
name: redis
selector:
{{- include "mikrotik-portal.redisSelectorLabels" . | nindent 4 }}
{{- include "the-other-dude.redisSelectorLabels" . | nindent 4 }}
{{- end }}

View File

@@ -1,9 +1,9 @@
apiVersion: v1
kind: Secret
metadata:
name: {{ include "mikrotik-portal.fullname" . }}-secrets
name: {{ include "the-other-dude.fullname" . }}-secrets
labels:
{{- include "mikrotik-portal.labels" . | nindent 4 }}
{{- include "the-other-dude.labels" . | nindent 4 }}
type: Opaque
stringData:
JWT_SECRET_KEY: {{ .Values.secrets.jwtSecretKey | quote }}
@@ -12,4 +12,4 @@ stringData:
FIRST_ADMIN_PASSWORD: {{ .Values.secrets.firstAdminPassword | quote }}
DB_PASSWORD: {{ .Values.secrets.dbPassword | quote }}
DB_APP_PASSWORD: {{ .Values.secrets.dbAppPassword | quote }}
POLLER_DATABASE_URL: {{ printf "postgres://poller_user:%s@%s-postgres:%d/%s" .Values.secrets.dbPollerPassword (include "mikrotik-portal.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database | quote }}
POLLER_DATABASE_URL: {{ printf "postgres://poller_user:%s@%s-postgres:%d/%s" .Values.secrets.dbPollerPassword (include "the-other-dude.fullname" .) (int .Values.postgres.service.port) .Values.postgres.auth.database | quote }}

View File

@@ -1,4 +1,4 @@
# Default values for mikrotik-portal.
# Default values for the-other-dude.
# These values should work with `helm install` out of the box for development.
# Production deployments MUST override secrets.jwtSecretKey, secrets.credentialEncryptionKey,
# and secrets.firstAdminPassword.
@@ -10,7 +10,7 @@ api:
replicaCount: 1
image:
repository: mikrotik-portal/api
repository: the-other-dude/api
tag: latest
pullPolicy: IfNotPresent
@@ -54,7 +54,7 @@ frontend:
replicaCount: 1
image:
repository: mikrotik-portal/frontend
repository: the-other-dude/frontend
tag: latest
pullPolicy: IfNotPresent
@@ -161,7 +161,7 @@ poller:
replicaCount: 2
image:
repository: mikrotik-portal/poller
repository: the-other-dude/poller
tag: latest
pullPolicy: IfNotPresent
@@ -191,7 +191,7 @@ ingress:
tls:
enabled: false
# secretName: mikrotik-portal-tls
# secretName: the-other-dude-tls
# -----------------------------------------------------------------------
# Secrets
@@ -206,7 +206,7 @@ secrets:
credentialEncryptionKey: ""
# First admin account (created on first startup)
firstAdminEmail: "admin@mikrotik-portal.local"
firstAdminEmail: "admin@the-other-dude.local"
firstAdminPassword: ""
# PostgreSQL superuser password

View File

@@ -0,0 +1,11 @@
storage "file" {
path = "/openbao/data"
}
listener "tcp" {
address = "0.0.0.0:8200"
tls_disable = true
}
api_addr = "http://127.0.0.1:8200"
ui = false

View File

@@ -1,31 +1,107 @@
#!/bin/sh
#!/bin/sh
# OpenBao Transit initialization script
# Runs after OpenBao starts in dev mode
# Handles first-run init, sealed unseal, and already-unsealed cases
set -e
export BAO_ADDR="http://127.0.0.1:8200"
export BAO_TOKEN="${BAO_DEV_ROOT_TOKEN_ID:-dev-openbao-token}"
# Wait for OpenBao to be ready
# ---------------------------------------------------------------------------
# Wait for OpenBao HTTP listener to accept connections.
# We hit /v1/sys/health which returns 200 (unsealed), 429 (standby),
# 472 (perf-standby), 501 (uninitialized), or 503 (sealed).
# Any HTTP response means the server is up; connection refused means not yet.
# ---------------------------------------------------------------------------
echo "Waiting for OpenBao to start..."
until bao status >/dev/null 2>&1; do
until wget -qO /dev/null http://127.0.0.1:8200/v1/sys/health 2>/dev/null; do
# wget returns 0 only on 2xx; for 4xx/5xx it returns 8.
# But connection refused returns 4. Check if we got ANY HTTP response.
rc=0
wget -S -qO /dev/null http://127.0.0.1:8200/v1/sys/health 2>&1 | grep -q "HTTP/" && break
sleep 0.5
done
echo "OpenBao is ready"
# Enable Transit secrets engine (idempotent - ignores "already enabled" errors)
# ---------------------------------------------------------------------------
# Determine current state via structured output
# ---------------------------------------------------------------------------
STATUS_JSON="$(bao status -format=json 2>/dev/null || true)"
INITIALIZED="$(echo "$STATUS_JSON" | grep '"initialized"' | head -1 | awk -F: '{gsub(/[^a-z]/, "", $2); print $2}')"
SEALED="$(echo "$STATUS_JSON" | grep '"sealed"' | head -1 | awk -F: '{gsub(/[^a-z]/, "", $2); print $2}')"
# ---------------------------------------------------------------------------
# Scenario 1 First run (not initialized)
# ---------------------------------------------------------------------------
if [ "$INITIALIZED" != "true" ]; then
echo "OpenBao is not initialized — running first-time setup..."
INIT_JSON="$(bao operator init -key-shares=1 -key-threshold=1 -format=json)"
UNSEAL_KEY="$(echo "$INIT_JSON" | grep '"unseal_keys_b64"' -A1 | tail -1 | tr -d ' ",[]\r')"
ROOT_TOKEN="$(echo "$INIT_JSON" | grep '"root_token"' | awk -F'"' '{print $4}')"
export BAO_TOKEN="$ROOT_TOKEN"
echo ""
echo "═══════════════════════════════════════════════════════════════"
echo " OPENBAO FIRST-RUN CREDENTIALS — SAVE THESE TO .env"
echo "═══════════════════════════════════════════════════════════════"
echo ""
echo " BAO_UNSEAL_KEY=$UNSEAL_KEY"
echo " OPENBAO_TOKEN=$ROOT_TOKEN"
echo ""
echo " Add both values to your .env file so subsequent starts"
echo " can unseal and authenticate automatically."
echo ""
echo "═══════════════════════════════════════════════════════════════"
echo ""
echo "Unsealing OpenBao..."
bao operator unseal "$UNSEAL_KEY"
# ---------------------------------------------------------------------------
# Scenario 2 Sealed, key provided
# ---------------------------------------------------------------------------
elif [ "$SEALED" = "true" ]; then
if [ -z "$BAO_UNSEAL_KEY" ]; then
echo "ERROR: OpenBao is sealed but BAO_UNSEAL_KEY is not set." >&2
echo " Provide BAO_UNSEAL_KEY in the environment or .env file." >&2
exit 1
fi
echo "OpenBao is sealed — unsealing..."
bao operator unseal "$BAO_UNSEAL_KEY"
# ---------------------------------------------------------------------------
# Scenario 3 Already unsealed
# ---------------------------------------------------------------------------
else
echo "OpenBao is already unsealed"
fi
# ---------------------------------------------------------------------------
# Verify BAO_TOKEN is available for Transit setup
# (Scenario 1 exports it from init output; Scenarios 2/3 inherit from env)
# ---------------------------------------------------------------------------
if [ -z "$BAO_TOKEN" ]; then
echo "ERROR: BAO_TOKEN is not set. Set OPENBAO_TOKEN in .env / .env.prod." >&2
exit 1
fi
export BAO_TOKEN
# ---------------------------------------------------------------------------
# Transit engine + policy setup (idempotent)
# ---------------------------------------------------------------------------
echo "Configuring Transit engine and policies..."
bao secrets enable transit 2>/dev/null || true
echo "Transit engine enabled"
# Create policy for the API backend (full Transit access)
bao policy write api-policy - <<'POLICY'
path "transit/*" {
capabilities = ["create", "read", "update", "delete", "list"]
}
POLICY
# Create policy for the Go poller (encrypt + decrypt only)
bao policy write poller-policy - <<'POLICY'
path "transit/decrypt/tenant_*" {
capabilities = ["update"]

View File

@@ -0,0 +1,71 @@
# The Other Dude — Apache reverse proxy example
#
# Required modules:
# a2enmod proxy proxy_http proxy_wstunnel rewrite ssl headers
#
# This config assumes:
# - TOD frontend runs on FRONTEND_HOST:3000
# - TOD API runs on API_HOST:8001
# - WinBox worker Xpra ports are on WORKER_HOST:10100-10119
#
# Replace tod.example.com and upstream addresses with your values.
<VirtualHost *:80>
ServerName tod.example.com
RewriteEngine On
RewriteRule ^(.*)$ https://%{HTTP_HOST}$1 [R=301,L]
</VirtualHost>
<VirtualHost *:443>
ServerName tod.example.com
SSLEngine on
SSLCertificateFile /etc/ssl/certs/tod.example.com.pem
SSLCertificateKeyFile /etc/ssl/private/tod.example.com.key
# ── Security headers ──────────────────────────────────────────────
Header always set X-Frame-Options "SAMEORIGIN"
Header always set X-Content-Type-Options "nosniff"
Header always set X-XSS-Protection "1; mode=block"
Header always set Referrer-Policy "strict-origin-when-cross-origin"
# ── Xpra (Remote WinBox) ─────────────────────────────────────────
# Must appear BEFORE the general proxy rules.
# WebSocket upgrade is required. Do NOT enable mod_deflate on this path
# — compressing WebSocket binary frames corrupts Xpra mouse/keyboard data.
#
# ProxyPassMatch uses regex to capture the port and forward to the worker.
# Ports 10100-10119 (up to 20 concurrent sessions).
RewriteEngine On
# WebSocket upgrade for Xpra
RewriteCond %{HTTP:Upgrade} =websocket [NC]
RewriteRule ^/xpra/(\d+)/(.*)$ ws://YOUR_TOD_HOST:$1/$2 [P,L]
# Regular HTTP requests for Xpra HTML5 client assets
ProxyPassMatch "^/xpra/(\d+)/(.*)" "http://YOUR_TOD_HOST:$1/$2"
# Relaxed CSP for Xpra HTML5 client (inline scripts + eval)
<LocationMatch "^/xpra/">
Header always set Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval' ws: wss: data: blob:; frame-ancestors 'self';"
SetEnv no-gzip 1
</LocationMatch>
# ── API ───────────────────────────────────────────────────────────
ProxyPass /api/ http://YOUR_TOD_HOST:8001/api/
ProxyPassReverse /api/ http://YOUR_TOD_HOST:8001/api/
ProxyTimeout 300
RequestHeader set X-Forwarded-Proto "https"
<Location /api/>
# Let the API set its own CSP
Header unset Content-Security-Policy
</Location>
# ── Frontend (SPA) ────────────────────────────────────────────────
ProxyPass / http://YOUR_TOD_HOST:3000/
ProxyPassReverse / http://YOUR_TOD_HOST:3000/
ProxyPreserveHost On
</VirtualHost>

View File

@@ -0,0 +1,71 @@
# The Other Dude — Caddy reverse proxy example
#
# This config assumes:
# - TOD frontend runs on FRONTEND_HOST:3000
# - TOD API runs on API_HOST:8001
# - WinBox worker Xpra ports are on WORKER_HOST:10100-10119
#
# Replace tod.example.com and the upstream IPs with your values.
# Caddy handles TLS automatically via Let's Encrypt.
tod.example.com {
log {
output file /var/log/caddy/tod.log {
roll_size 50mb
roll_keep 5
}
format json
}
encode zstd gzip
header {
X-Content-Type-Options "nosniff"
X-Frame-Options "SAMEORIGIN"
X-XSS-Protection "1; mode=block"
Referrer-Policy "strict-origin-when-cross-origin"
-Server
}
# ── Xpra (Remote WinBox) ──────────────────────────────────────────
# Proxies the Xpra HTML5 client to winbox-worker Xpra ports.
# Port range 10100-10119 (up to 20 concurrent sessions).
# Uses scoped compression to avoid corrupting WebSocket binary frames.
@xpra path_regexp xpra ^/xpra/(101[0-1][0-9])/(.*)$
handle @xpra {
# Override parent encode — only compress text assets, NOT WebSocket frames
encode {
gzip
match {
header Content-Type text/*
header Content-Type application/javascript*
header Content-Type application/json*
}
}
uri strip_prefix /xpra/{re.xpra.1}
reverse_proxy {$WORKER_HOST:YOUR_TOD_HOST}:{re.xpra.1} {
header_up Host {host}
header_up X-Real-IP {remote_host}
}
}
# ── API ───────────────────────────────────────────────────────────
handle /api/* {
reverse_proxy http://{$API_HOST:YOUR_TOD_HOST}:8001 {
header_up Host {host}
header_up X-Real-IP {remote_host}
transport http {
dial_timeout 30s
response_header_timeout 60s
}
}
}
# ── Frontend (SPA) ────────────────────────────────────────────────
handle {
reverse_proxy http://{$FRONTEND_HOST:YOUR_TOD_HOST}:3000 {
header_up Host {host}
header_up X-Real-IP {remote_host}
}
}
}

View File

@@ -0,0 +1,77 @@
# The Other Dude — HAProxy reverse proxy example
#
# This config assumes:
# - TOD frontend runs on FRONTEND_HOST:3000
# - TOD API runs on API_HOST:8001
# - WinBox worker Xpra ports are on WORKER_HOST:10100-10119
# - TLS is terminated by HAProxy
#
# Replace tod.example.com and upstream addresses with your values.
#
# IMPORTANT: Do NOT enable compression on the xpra backend —
# compressing WebSocket binary frames corrupts Xpra mouse/keyboard data.
global
log stdout format raw local0
maxconn 4096
defaults
log global
mode http
option httplog
timeout connect 10s
timeout client 300s
timeout server 300s
timeout tunnel 3600s
# ── Frontend ─────────────────────────────────────────────────────────
frontend https
bind *:443 ssl crt /etc/ssl/certs/tod.example.com.pem
bind *:80
redirect scheme https code 301 if !{ ssl_fc }
# Security headers
http-response set-header X-Frame-Options "SAMEORIGIN"
http-response set-header X-Content-Type-Options "nosniff"
http-response set-header Referrer-Policy "strict-origin-when-cross-origin"
# Routing rules (order matters — first match wins)
acl is_xpra path_beg /xpra/
acl is_api path_beg /api/
use_backend xpra if is_xpra
use_backend api if is_api
default_backend frontend
# ── Backends ─────────────────────────────────────────────────────────
backend api
option forwardfor
http-request set-header X-Forwarded-Proto https
server api1 YOUR_TOD_HOST:8001 check
backend frontend
option forwardfor
server fe1 YOUR_TOD_HOST:3000 check
# Xpra backend — uses a Lua or map-based approach to extract the port
# from the URL path. This example covers port 10100; add servers for
# 10101-10119 as needed, or use HAProxy's Lua scripting for dynamic routing.
#
# WARNING: Do NOT add "compression" directives to this backend.
backend xpra
option forwardfor
# Strip /xpra/{port} prefix
http-request set-path %[path,regsub(^/xpra/[0-9]+/,/)]
# Route to the correct port based on URL
# For dynamic port routing, use a map file or Lua script.
# Static example for port 10100:
acl xpra_10100 path_beg /xpra/10100/
use-server xpra10100 if xpra_10100
server xpra10100 YOUR_TOD_HOST:10100 check
# server xpra10101 YOUR_TOD_HOST:10101 check
# ... add through 10119 as needed

View File

@@ -0,0 +1,90 @@
# The Other Dude — nginx reverse proxy example
#
# This config assumes:
# - TOD frontend runs on FRONTEND_HOST:3000
# - TOD API runs on API_HOST:8001
# - WinBox worker Xpra ports are on WORKER_HOST:10100-10119
# - TLS is terminated by nginx (or upstream load balancer)
#
# Replace tod.example.com and upstream addresses with your values.
map $http_upgrade $connection_upgrade {
default upgrade;
'' close;
}
upstream tod_frontend {
server YOUR_TOD_HOST:3000;
}
upstream tod_api {
server YOUR_TOD_HOST:8001;
}
server {
listen 80;
server_name tod.example.com;
return 301 https://$host$request_uri;
}
server {
listen 443 ssl http2;
server_name tod.example.com;
ssl_certificate /etc/ssl/certs/tod.example.com.pem;
ssl_certificate_key /etc/ssl/private/tod.example.com.key;
# ── Security headers ──────────────────────────────────────────────
add_header X-Frame-Options "SAMEORIGIN" always;
add_header X-Content-Type-Options "nosniff" always;
add_header X-XSS-Protection "1; mode=block" always;
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
add_header Content-Security-Policy "default-src 'self'; script-src 'self'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https://*.tile.openstreetmap.org; font-src 'self'; connect-src 'self' ws: wss:; worker-src 'self'; frame-ancestors 'self'; base-uri 'self'; form-action 'self';" always;
# ── API ───────────────────────────────────────────────────────────
location /api/ {
proxy_pass http://tod_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_buffering off;
proxy_read_timeout 300s;
proxy_hide_header Content-Security-Policy;
}
# ── Xpra (Remote WinBox) ─────────────────────────────────────────
# Proxies Xpra HTML5 client to winbox-worker ports 10100-10119.
# WebSocket support is required. Do NOT enable gzip on this location
# — compressing WebSocket binary frames corrupts Xpra mouse/keyboard data.
location ~ ^/xpra/(\d+)/(.*) {
set $xpra_port $1;
set $xpra_path $2;
proxy_pass http://YOUR_TOD_HOST:$xpra_port/$xpra_path$is_args$args;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
proxy_read_timeout 300s;
proxy_buffering off;
# Xpra HTML5 client needs relaxed CSP (inline scripts + eval)
# Adding add_header in a location block replaces all server-level headers in nginx
add_header Content-Security-Policy "default-src 'self' 'unsafe-inline' 'unsafe-eval' ws: wss: data: blob:; frame-ancestors 'self';" always;
add_header X-Content-Type-Options "nosniff" always;
}
# ── Frontend (SPA) ────────────────────────────────────────────────
location / {
proxy_pass http://tod_frontend;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}

View File

@@ -0,0 +1,93 @@
# The Other Dude — Traefik dynamic configuration example
#
# This config assumes:
# - TOD frontend runs on FRONTEND_HOST:3000
# - TOD API runs on API_HOST:8001
# - WinBox worker Xpra ports are on WORKER_HOST:10100-10119
# - Traefik entrypoints: web (80) and websecure (443)
#
# Replace tod.example.com and upstream addresses with your values.
#
# For Docker-based Traefik, labels can replace this file.
# This example uses file provider for clarity.
http:
routers:
# ── Xpra (Remote WinBox) ────────────────────────────────────────
# Must be higher priority than the frontend catch-all.
# Each Xpra port needs its own service since Traefik doesn't
# support dynamic port extraction from path regex.
# Shown for port 10100; duplicate for 10101-10119 as needed.
tod-xpra-10100:
rule: "Host(`tod.example.com`) && PathPrefix(`/xpra/10100/`)"
entryPoints: [websecure]
service: tod-xpra-10100
middlewares: [xpra-strip, xpra-headers]
tls:
certResolver: letsencrypt
priority: 30
# ── API ─────────────────────────────────────────────────────────
tod-api:
rule: "Host(`tod.example.com`) && PathPrefix(`/api/`)"
entryPoints: [websecure]
service: tod-api
middlewares: [security-headers]
tls:
certResolver: letsencrypt
priority: 20
# ── Frontend (SPA) ──────────────────────────────────────────────
tod-frontend:
rule: "Host(`tod.example.com`)"
entryPoints: [websecure]
service: tod-frontend
middlewares: [security-headers]
tls:
certResolver: letsencrypt
priority: 10
services:
tod-xpra-10100:
loadBalancer:
servers:
- url: "http://YOUR_TOD_HOST:10100"
# Add tod-xpra-10101 through tod-xpra-10119 as needed
tod-api:
loadBalancer:
servers:
- url: "http://YOUR_TOD_HOST:8001"
tod-frontend:
loadBalancer:
servers:
- url: "http://YOUR_TOD_HOST:3000"
middlewares:
xpra-strip:
# Strip /xpra/{port} prefix before forwarding
stripPrefixRegex:
regex: ["^/xpra/[0-9]+"]
xpra-headers:
headers:
# Relaxed CSP for Xpra HTML5 client (inline scripts + eval)
customResponseHeaders:
Content-Security-Policy: "default-src 'self' 'unsafe-inline' 'unsafe-eval' ws: wss: data: blob:; frame-ancestors 'self';"
X-Content-Type-Options: "nosniff"
# IMPORTANT: Disable compression for Xpra — compressing WebSocket
# binary frames corrupts mouse/keyboard coordinate data.
security-headers:
headers:
frameDeny: true
contentTypeNosniff: true
browserXssFilter: true
referrerPolicy: "strict-origin-when-cross-origin"
customResponseHeaders:
X-Frame-Options: "SAMEORIGIN"
# IMPORTANT: Disable Traefik's built-in compression for Xpra routes.
# If using --entrypoints.websecure.http.middlewares=compress@...,
# exclude the xpra router or WebSocket binary frames will be corrupted.

View File

@@ -18,14 +18,14 @@ import (
"github.com/bsm/redislock"
"github.com/redis/go-redis/v9"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/config"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/poller"
"github.com/mikrotik-portal/poller/internal/sshrelay"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/tunnel"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/bus"
"github.com/staack/the-other-dude/poller/internal/config"
"github.com/staack/the-other-dude/poller/internal/observability"
"github.com/staack/the-other-dude/poller/internal/poller"
"github.com/staack/the-other-dude/poller/internal/sshrelay"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/tunnel"
"github.com/staack/the-other-dude/poller/internal/vault"
)
func main() {

View File

@@ -1,4 +1,4 @@
module github.com/mikrotik-portal/poller
module github.com/staack/the-other-dude/poller
go 1.25.0

View File

@@ -17,7 +17,7 @@ import (
"github.com/nats-io/nats.go"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/store"
)
// ErrLockNotObtained is returned when a backup lock cannot be acquired

View File

@@ -10,7 +10,7 @@ import (
natsserver "github.com/nats-io/nats-server/v2/server"
"github.com/nats-io/nats.go"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/store"
)
// mockDeviceStore implements a minimal device store for testing.

View File

@@ -18,9 +18,9 @@ import (
"github.com/nats-io/nats.go"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/device"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// CertDeployResponder handles NATS request-reply for certificate deployment.

View File

@@ -16,9 +16,9 @@ import (
"github.com/nats-io/nats.go"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/device"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// CmdResponder handles NATS request-reply for device commands.

View File

@@ -11,7 +11,7 @@ import (
"github.com/nats-io/nats.go"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// CredentialSubscriber listens for credential change events and invalidates

View File

@@ -11,7 +11,7 @@ import (
"github.com/nats-io/nats.go"
"github.com/nats-io/nats.go/jetstream"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/staack/the-other-dude/poller/internal/device"
)
// DeviceStatusEvent is the payload published to NATS JetStream when a device

View File

@@ -11,8 +11,8 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/testutil"
"github.com/staack/the-other-dude/poller/internal/bus"
"github.com/staack/the-other-dude/poller/internal/testutil"
)
func TestPublisher_PublishStatus_Integration(t *testing.T) {

View File

@@ -13,9 +13,9 @@ import (
"github.com/nats-io/nats.go"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/tunnel"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/tunnel"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// TunnelOpenRequest is the JSON payload for a tunnel.open NATS request.

View File

@@ -13,11 +13,11 @@ import (
"github.com/bsm/redislock"
"github.com/redis/go-redis/v9"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/bus"
"github.com/staack/the-other-dude/poller/internal/device"
"github.com/staack/the-other-dude/poller/internal/observability"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// backupDeviceState tracks per-device backup state.

View File

@@ -11,8 +11,8 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/device"
"github.com/staack/the-other-dude/poller/internal/store"
)
// mockSSHHostKeyUpdater implements SSHHostKeyUpdater for testing.

View File

@@ -13,9 +13,9 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/testutil"
"github.com/staack/the-other-dude/poller/internal/bus"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/testutil"
)
// TestPollPublishConsumeCycle_Integration verifies the complete pipeline:

View File

@@ -3,7 +3,7 @@ package poller
import (
"context"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/store"
)
// DeviceFetcher is the subset of store.DeviceStore that the Scheduler needs.

View File

@@ -8,10 +8,10 @@ import (
"github.com/bsm/redislock"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/bus"
"github.com/staack/the-other-dude/poller/internal/observability"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// deviceState tracks per-device circuit breaker and lifecycle state.

View File

@@ -10,8 +10,8 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// mockDeviceFetcher implements DeviceFetcher for testing.

View File

@@ -12,11 +12,11 @@ import (
"github.com/bsm/redislock"
"github.com/redis/go-redis/v9"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/bus"
"github.com/staack/the-other-dude/poller/internal/device"
"github.com/staack/the-other-dude/poller/internal/observability"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// ErrDeviceOffline is returned by PollDevice when a device cannot be reached.

View File

@@ -11,9 +11,9 @@ import (
"time"
"github.com/google/uuid"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/bus"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
"github.com/redis/go-redis/v9"
"golang.org/x/crypto/ssh"
"nhooyr.io/websocket"

View File

@@ -7,8 +7,8 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/testutil"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/testutil"
)
func TestDeviceStore_FetchDevices_Integration(t *testing.T) {

View File

@@ -19,7 +19,7 @@ import (
"github.com/testcontainers/testcontainers-go/modules/redis"
"github.com/testcontainers/testcontainers-go/wait"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/store"
)
// devicesSchema is the minimal DDL needed for integration tests against the

View File

@@ -9,8 +9,8 @@ import (
"time"
"github.com/google/uuid"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
"github.com/staack/the-other-dude/poller/internal/store"
"github.com/staack/the-other-dude/poller/internal/vault"
)
// OpenTunnelResponse is returned by Manager.OpenTunnel.

View File

@@ -14,7 +14,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/staack/the-other-dude/poller/internal/device"
)
// CachedCreds holds decrypted device credentials.

2
winbox-worker/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
# Compiled binary
/worker

77
winbox-worker/Dockerfile Normal file
View File

@@ -0,0 +1,77 @@
# Stage 1: Build Go session manager
FROM golang:1.22-bookworm AS builder
WORKDIR /build
# Copy module files first so the dependency download layer is cached
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# CGO disabled -> static binary, runs on the plain Ubuntu runtime stage
RUN CGO_ENABLED=0 go build -o /winbox-worker ./cmd/worker/
# Stage 2: Runtime with Xpra + WinBox
FROM ubuntu:24.04 AS runtime
ARG WINBOX_VERSION=4.0.1
ARG WINBOX_SHA256=8ec2d08929fd434c4b88881f3354bdf60b057ecd2fb54961dd912df57e326a70
# Install Xpra + X11 deps
# Use distro xpra (works on all architectures including arm64 via emulation)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    unzip \
    xvfb \
    xpra \
    libjs-jquery \
    libjs-jquery-ui \
    libxcb1 \
    libxcb-icccm4 \
    libxcb-image0 \
    libxcb-keysyms1 \
    libxcb-render-util0 \
    libxcb-cursor0 \
    libxcb-shape0 \
    libx11-6 \
    libx11-xcb1 \
    libxkbcommon0 \
    libxkbcommon-x11-0 \
    libgl1 \
    libgl1-mesa-dri \
    libegl1 \
    libegl-mesa0 \
    libfontconfig1 \
    libdbus-1-3 \
    xauth \
    python3-pil \
    && rm -rf /var/lib/apt/lists/*
# Download and verify WinBox binary
# sha256 pin guards against the upstream download changing silently
RUN curl -fsSL -o /tmp/WinBox_Linux.zip \
    "https://download.mikrotik.com/routeros/winbox/${WINBOX_VERSION}/WinBox_Linux.zip" \
    && echo "${WINBOX_SHA256}  /tmp/WinBox_Linux.zip" | sha256sum -c - \
    && mkdir -p /opt/winbox \
    && unzip /tmp/WinBox_Linux.zip -d /opt/winbox \
    && chmod +x /opt/winbox/WinBox \
    && rm /tmp/WinBox_Linux.zip
# Patch Xpra HTML5 client: _poll_clipboard is called on every mouse click
# but never checks clipboard_enabled, causing clipboard permission prompts
RUN sed -i 's/XpraClient.prototype._poll_clipboard = function(e) {/XpraClient.prototype._poll_clipboard = function(e) {\n\tif (!this.clipboard_enabled) { return; }/' \
    /usr/share/xpra/www/js/Client.js
# Create non-root user
RUN groupadd --gid 1001 worker && \
    useradd --uid 1001 --gid worker --create-home worker
# Create session directory and XDG runtime dir
RUN mkdir -p /tmp/winbox-sessions && chown worker:worker /tmp/winbox-sessions && \
    mkdir -p /run/user/1001/xpra && chown -R worker:worker /run/user/1001
# Copy Go binary
COPY --from=builder /winbox-worker /usr/local/bin/winbox-worker
USER worker
# Session-manager HTTP API port
EXPOSE 9090
ENTRYPOINT ["/usr/local/bin/winbox-worker"]

View File

@@ -0,0 +1,174 @@
package main
import (
"context"
"encoding/json"
"log/slog"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"github.com/the-other-dude/winbox-worker/internal/session"
)
func envInt(key string, def int) int {
if v := os.Getenv(key); v != "" {
if n, err := strconv.Atoi(v); err == nil {
return n
}
}
return def
}
func envStr(key, def string) string {
if v := os.Getenv(key); v != "" {
return v
}
return def
}
// main wires up the WinBox session worker: structured JSON logging, a
// session.Manager configured from environment variables, an HTTP API for
// session lifecycle, and graceful shutdown on SIGTERM/SIGINT.
func main() {
	// All logs go to stdout as JSON for container log collection.
	slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelInfo,
	})))
	// Display numbers 100-119 and WS ports 10100-10119 are fixed ranges;
	// capacity and timeouts come from the environment.
	cfg := session.Config{
		MaxSessions: envInt("MAX_CONCURRENT_SESSIONS", 10),
		DisplayMin:  100,
		DisplayMax:  119,
		WSPortMin:   10100,
		WSPortMax:   10119,
		IdleTimeout: envInt("IDLE_TIMEOUT_SECONDS", 600),
		MaxLifetime: envInt("MAX_LIFETIME_SECONDS", 7200),
		WinBoxPath:  envStr("WINBOX_PATH", "/opt/winbox/WinBox"),
		BindAddr:    envStr("BIND_ADDR", "0.0.0.0"),
	}
	mgr := session.NewManager(cfg)
	// Remove any leftovers from a previous worker process before serving.
	mgr.CleanupOrphans()
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Background loop enforcing idle/lifetime timeouts.
	go mgr.RunCleanupLoop(ctx)
	mux := http.NewServeMux()
	// POST /sessions — create a new WinBox session.
	mux.HandleFunc("POST /sessions", func(w http.ResponseWriter, r *http.Request) {
		var req session.CreateRequest
		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
			writeJSON(w, http.StatusBadRequest, session.ErrorResponse{Error: "invalid request body"})
			return
		}
		// Cheap pre-check; CreateSession re-checks under its own lock.
		if !mgr.HasCapacity() {
			writeJSON(w, http.StatusServiceUnavailable, session.ErrorResponse{
				Error:       "capacity",
				MaxSessions: cfg.MaxSessions,
			})
			return
		}
		resp, err := mgr.CreateSession(req)
		// Drop credential copies as soon as the manager is done with them.
		req.Username = ""
		req.Password = ""
		if err != nil {
			slog.Error("create session failed", "err", err)
			if strings.Contains(err.Error(), "capacity") {
				writeJSON(w, http.StatusServiceUnavailable, session.ErrorResponse{
					Error:       "capacity",
					MaxSessions: cfg.MaxSessions,
				})
				return
			}
			// Generic message to the client; details stay in the log.
			writeJSON(w, http.StatusInternalServerError, session.ErrorResponse{Error: "launch failed"})
			return
		}
		writeJSON(w, http.StatusCreated, resp)
	})
	// DELETE /sessions/{id} — terminate a session (idempotent).
	mux.HandleFunc("DELETE /sessions/{id}", func(w http.ResponseWriter, r *http.Request) {
		id := r.PathValue("id")
		if err := mgr.TerminateSession(id); err != nil {
			writeJSON(w, http.StatusInternalServerError, session.ErrorResponse{Error: err.Error()})
			return
		}
		writeJSON(w, http.StatusOK, map[string]string{"status": "terminated"})
	})
	// GET /sessions/{id} — status of a single session.
	mux.HandleFunc("GET /sessions/{id}", func(w http.ResponseWriter, r *http.Request) {
		id := r.PathValue("id")
		resp, err := mgr.GetSession(id)
		if err != nil {
			writeJSON(w, http.StatusNotFound, session.ErrorResponse{Error: "not found"})
			return
		}
		writeJSON(w, http.StatusOK, resp)
	})
	// GET /sessions — list all tracked sessions.
	mux.HandleFunc("GET /sessions", func(w http.ResponseWriter, r *http.Request) {
		writeJSON(w, http.StatusOK, mgr.ListSessions())
	})
	// GET /healthz — liveness plus a capacity snapshot.
	mux.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) {
		writeJSON(w, http.StatusOK, map[string]any{
			"status":    "ok",
			"sessions":  mgr.SessionCount(),
			"capacity":  cfg.MaxSessions,
			"available": cfg.MaxSessions - mgr.SessionCount(),
		})
	})
	handler := provenanceMiddleware(mux)
	listenAddr := envStr("LISTEN_ADDR", ":9090")
	srv := &http.Server{
		Addr:         listenAddr,
		Handler:      handler,
		ReadTimeout:  10 * time.Second,
		WriteTimeout: 30 * time.Second,
	}
	// Graceful shutdown: stop the cleanup loop, tear down every session,
	// then drain the HTTP server with a 30s deadline.
	go func() {
		sigCh := make(chan os.Signal, 1)
		signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT)
		<-sigCh
		slog.Info("shutting down worker")
		cancel()
		for _, s := range mgr.ListSessions() {
			mgr.TerminateSession(s.WorkerSessionID)
		}
		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer shutdownCancel()
		srv.Shutdown(shutdownCtx)
	}()
	slog.Info("winbox-worker starting", "addr", listenAddr, "max_sessions", cfg.MaxSessions)
	if err := srv.ListenAndServe(); err != http.ErrServerClosed {
		slog.Error("server error", "err", err)
		os.Exit(1)
	}
}
func writeJSON(w http.ResponseWriter, code int, v any) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
json.NewEncoder(w).Encode(v)
}
func provenanceMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
svc := r.Header.Get("X-Internal-Service")
if svc == "" && !strings.HasPrefix(r.URL.Path, "/healthz") {
slog.Warn("request missing X-Internal-Service header", "path", r.URL.Path, "remote", r.RemoteAddr)
}
next.ServeHTTP(w, r)
})
}

5
winbox-worker/go.mod Normal file
View File

@@ -0,0 +1,5 @@
module github.com/the-other-dude/winbox-worker
go 1.22
require github.com/google/uuid v1.6.0

2
winbox-worker/go.sum Normal file
View File

@@ -0,0 +1,2 @@
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=

View File

@@ -0,0 +1,375 @@
package session
import (
"context"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"sync"
"syscall"
"time"
"github.com/google/uuid"
)
// Config holds the worker-wide settings for session management.
type Config struct {
	MaxSessions int    // hard cap on concurrent sessions
	DisplayMin  int    // lowest X display number to allocate
	DisplayMax  int    // highest X display number to allocate
	WSPortMin   int    // lowest xpra WebSocket port to allocate
	WSPortMax   int    // highest xpra WebSocket port to allocate
	IdleTimeout int    // seconds
	MaxLifetime int    // seconds
	WinBoxPath  string // path to the WinBox binary
	BindAddr    string // address xpra binds its WebSocket listener to
}

// Manager owns all session state: the session registry plus the pools of
// X display numbers and WebSocket ports. mu guards the sessions map; the
// pools carry their own locks, and cfg is never mutated after creation.
type Manager struct {
	mu       sync.Mutex
	sessions map[string]*Session
	displays *Pool
	wsPorts  *Pool
	cfg      Config
}
// NewManager builds a Manager with an empty session registry and display
// and WebSocket-port pools spanning the ranges given in cfg.
func NewManager(cfg Config) *Manager {
	m := &Manager{
		sessions: map[string]*Session{},
		cfg:      cfg,
	}
	m.displays = NewPool(cfg.DisplayMin, cfg.DisplayMax)
	m.wsPorts = NewPool(cfg.WSPortMin, cfg.WSPortMax)
	return m
}
// HasCapacity reports whether another session can be created without
// exceeding the configured MaxSessions cap.
func (m *Manager) HasCapacity() bool {
	m.mu.Lock()
	active := len(m.sessions)
	m.mu.Unlock()
	return active < m.cfg.MaxSessions
}
// SessionCount returns the number of sessions currently tracked.
func (m *Manager) SessionCount() int {
	m.mu.Lock()
	count := len(m.sessions)
	m.mu.Unlock()
	return count
}
// CreateSession allocates a display and WebSocket port, registers the
// session, creates its private tmp dir, and launches an xpra server that
// runs WinBox against the given tunnel endpoint. On any failure the
// partially-created session is torn down and its resources released.
// Returns the session descriptor once xpra is accepting connections.
func (m *Manager) CreateSession(req CreateRequest) (*CreateResponse, error) {
	// Capacity re-check and resource allocation happen atomically under
	// the manager lock; the lock is dropped before the slow xpra launch.
	m.mu.Lock()
	if len(m.sessions) >= m.cfg.MaxSessions {
		m.mu.Unlock()
		return nil, fmt.Errorf("capacity")
	}
	display, err := m.displays.Allocate()
	if err != nil {
		m.mu.Unlock()
		return nil, fmt.Errorf("no displays available: %w", err)
	}
	wsPort, err := m.wsPorts.Allocate()
	if err != nil {
		// Roll back the display allocation before bailing out.
		m.displays.Release(display)
		m.mu.Unlock()
		return nil, fmt.Errorf("no ws ports available: %w", err)
	}
	// Callers may supply their own session ID; otherwise generate one.
	workerID := req.SessionID
	if workerID == "" {
		workerID = uuid.New().String()
	}
	// Zero request timeouts fall back to the worker-wide defaults.
	idleTimeout := time.Duration(req.IdleTimeoutSec) * time.Second
	if idleTimeout == 0 {
		idleTimeout = time.Duration(m.cfg.IdleTimeout) * time.Second
	}
	maxLifetime := time.Duration(req.MaxLifetimeSec) * time.Second
	if maxLifetime == 0 {
		maxLifetime = time.Duration(m.cfg.MaxLifetime) * time.Second
	}
	sess := &Session{
		ID:          workerID,
		TunnelHost:  req.TunnelHost,
		TunnelPort:  req.TunnelPort,
		Display:     display,
		WSPort:      wsPort,
		State:       StateCreating,
		CreatedAt:   time.Now(),
		IdleTimeout: idleTimeout,
		MaxLifetime: maxLifetime,
	}
	// Register before launching so failure paths can find the session.
	m.sessions[workerID] = sess
	m.mu.Unlock()
	tmpDir, err := CreateSessionTmpDir(workerID)
	if err != nil {
		m.terminateSession(workerID, "tmpdir creation failed")
		return nil, fmt.Errorf("create tmpdir: %w", err)
	}
	sess.mu.Lock()
	sess.TmpDir = tmpDir
	sess.mu.Unlock()
	xpraCfg := XpraConfig{
		Display:    display,
		WSPort:     wsPort,
		BindAddr:   m.cfg.BindAddr,
		TunnelHost: req.TunnelHost,
		TunnelPort: req.TunnelPort,
		Username:   req.Username,
		Password:   req.Password,
		TmpDir:     tmpDir,
		WinBoxPath: m.cfg.WinBoxPath,
	}
	proc, err := StartXpra(xpraCfg)
	// Zero credential copies (Go-side only; /proc and exec args are a known v1 limitation)
	xpraCfg.Username = ""
	xpraCfg.Password = ""
	req.Username = ""
	req.Password = ""
	if err != nil {
		m.terminateSession(workerID, "xpra start failed")
		return nil, fmt.Errorf("xpra start: %w", err)
	}
	sess.mu.Lock()
	sess.XpraPID = proc.Pid
	sess.mu.Unlock()
	// Give xpra up to 10s to open its WebSocket port before declaring
	// the launch failed.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := WaitForXpraReady(ctx, m.cfg.BindAddr, wsPort, 10*time.Second); err != nil {
		m.terminateSession(workerID, "xpra not ready")
		return nil, fmt.Errorf("xpra ready: %w", err)
	}
	sess.mu.Lock()
	sess.State = StateActive
	createdAt := sess.CreatedAt
	sess.mu.Unlock()
	return &CreateResponse{
		WorkerSessionID: workerID,
		Status:          StateActive,
		XpraWSPort:      wsPort,
		ExpiresAt:       createdAt.Add(idleTimeout),
		MaxExpiresAt:    createdAt.Add(maxLifetime),
	}, nil
}
// TerminateSession tears down the session identified by workerID.
// Idempotent: terminating an unknown or already-terminated session
// returns nil.
func (m *Manager) TerminateSession(workerID string) error {
	return m.terminateSession(workerID, "requested")
}
// terminateSession is the single teardown path for a session: mark it
// terminating, kill the xpra process, remove the tmp dir, return the
// display and WS port to their pools, and unregister the session.
// Idempotent; reason is used only for logging.
func (m *Manager) terminateSession(workerID string, reason string) error {
	m.mu.Lock()
	sess, ok := m.sessions[workerID]
	if !ok {
		m.mu.Unlock()
		return nil
	}
	m.mu.Unlock()
	// The state check below makes concurrent calls for the same session
	// a no-op after the first one wins.
	sess.mu.Lock()
	if sess.State == StateTerminating || sess.State == StateTerminated {
		sess.mu.Unlock()
		return nil
	}
	sess.State = StateTerminating
	// Snapshot everything needed for teardown, then drop the session
	// lock before the slow work (process kill, dir removal).
	pid := sess.XpraPID
	tmpDir := sess.TmpDir
	display := sess.Display
	wsPort := sess.WSPort
	sess.mu.Unlock()
	slog.Info("terminating session", "id", workerID, "reason", reason)
	if pid > 0 {
		KillXpraSession(pid)
	}
	if tmpDir != "" {
		if err := CleanupTmpDir(tmpDir); err != nil {
			slog.Warn("tmpdir cleanup failed", "id", workerID, "err", err)
		}
	}
	m.displays.Release(display)
	m.wsPorts.Release(wsPort)
	sess.mu.Lock()
	sess.State = StateTerminated
	sess.mu.Unlock()
	m.mu.Lock()
	delete(m.sessions, workerID)
	m.mu.Unlock()
	return nil
}
// GetSession returns a status snapshot for one session, or an error if
// the workerID is unknown. Idle time is queried after all locks are
// released.
func (m *Manager) GetSession(workerID string) (*StatusResponse, error) {
	m.mu.Lock()
	sess, found := m.sessions[workerID]
	m.mu.Unlock()
	if !found {
		return nil, fmt.Errorf("not found")
	}
	// Copy the fields we need under the session lock, then query idle
	// time without holding any lock.
	sess.mu.Lock()
	resp := &StatusResponse{
		WorkerSessionID: sess.ID,
		Status:          sess.State,
		Display:         sess.Display,
		WSPort:          sess.WSPort,
		CreatedAt:       sess.CreatedAt,
	}
	sess.mu.Unlock()
	resp.IdleSeconds = QueryIdleTime(resp.Display)
	return resp, nil
}
// ListSessions returns a status snapshot for every tracked session.
// It copies session fields in a first pass under the locks, then fills
// in idle time per display in a second pass with no locks held.
func (m *Manager) ListSessions() []StatusResponse {
	m.mu.Lock()
	type sessInfo struct {
		id        string
		state     State
		display   int
		wsPort    int
		createdAt time.Time
	}
	infos := make([]sessInfo, 0, len(m.sessions))
	for _, sess := range m.sessions {
		sess.mu.Lock()
		infos = append(infos, sessInfo{
			id:        sess.ID,
			state:     sess.State,
			display:   sess.Display,
			wsPort:    sess.WSPort,
			createdAt: sess.CreatedAt,
		})
		sess.mu.Unlock()
	}
	m.mu.Unlock()
	result := make([]StatusResponse, 0, len(infos))
	for _, info := range infos {
		result = append(result, StatusResponse{
			WorkerSessionID: info.id,
			Status:          info.state,
			Display:         info.display,
			WSPort:          info.wsPort,
			CreatedAt:       info.createdAt,
			IdleSeconds:     QueryIdleTime(info.display),
		})
	}
	return result
}
// RunCleanupLoop enforces session timeouts every 30 seconds until ctx
// is cancelled. Intended to run in its own goroutine.
func (m *Manager) RunCleanupLoop(ctx context.Context) {
	tick := time.NewTicker(30 * time.Second)
	defer tick.Stop()
	for {
		select {
		case <-tick.C:
			m.checkTimeouts()
		case <-ctx.Done():
			return
		}
	}
}
// checkTimeouts enforces per-session limits on each cleanup tick. Every
// active/grace session is terminated when its max lifetime is exceeded,
// its xpra process is dead, or its idle time passes the idle timeout.
func (m *Manager) checkTimeouts() {
	// Snapshot the IDs first so no lock is held while inspecting and
	// terminating individual sessions.
	m.mu.Lock()
	ids := make([]string, 0, len(m.sessions))
	for id := range m.sessions {
		ids = append(ids, id)
	}
	m.mu.Unlock()
	now := time.Now()
	for _, id := range ids {
		m.mu.Lock()
		sess, ok := m.sessions[id]
		m.mu.Unlock()
		if !ok {
			// Session was terminated since the snapshot was taken.
			continue
		}
		// Copy fields under the session lock, then decide unlocked.
		sess.mu.Lock()
		state := sess.State
		createdAt := sess.CreatedAt
		maxLifetime := sess.MaxLifetime
		idleTimeout := sess.IdleTimeout
		display := sess.Display
		pid := sess.XpraPID
		sess.mu.Unlock()
		if state != StateActive && state != StateGrace {
			continue
		}
		if now.Sub(createdAt) > maxLifetime {
			slog.Info("session max lifetime exceeded", "id", id)
			m.terminateSession(id, "max_lifetime")
			continue
		}
		// Signal 0 probes whether the xpra process still exists.
		if pid > 0 {
			proc, err := os.FindProcess(pid)
			if err != nil || proc.Signal(syscall.Signal(0)) != nil {
				slog.Info("xpra process dead", "id", id)
				m.terminateSession(id, "worker_failure")
				continue
			}
		}
		// Negative idle values are skipped and never treated as idle.
		idleSec := QueryIdleTime(display)
		if idleSec >= 0 && time.Duration(idleSec)*time.Second > idleTimeout {
			slog.Info("session idle timeout", "id", id, "idle_seconds", idleSec)
			m.terminateSession(id, "idle_timeout")
		}
	}
}
// CleanupOrphans removes leftover state from a previous worker process:
// stale per-session tmp dirs under /tmp/winbox-sessions, any xpra
// servers still running, and stale pool allocations. Intended to run
// once at startup, before serving requests. All steps are best-effort;
// failures are logged instead of silently ignored (previously the
// RemoveAll and `xpra stop` errors were swallowed).
func (m *Manager) CleanupOrphans() {
	baseDir := "/tmp/winbox-sessions"
	entries, err := os.ReadDir(baseDir)
	if err != nil {
		if !os.IsNotExist(err) {
			slog.Warn("orphan scan: cannot read dir", "err", err)
		}
		return
	}
	count := 0
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		path := filepath.Join(baseDir, entry.Name())
		slog.Info("cleaning orphan session dir", "path", path)
		// Best-effort removal; a failure here should be visible in logs.
		if err := os.RemoveAll(path); err != nil {
			slog.Warn("orphan dir removal failed", "path", path, "err", err)
		}
		count++
	}
	// Best-effort: stop any xpra servers left over from a crashed worker.
	// A non-zero exit is expected when none are running, so log at Warn.
	if err := exec.Command("xpra", "stop", "--all").Run(); err != nil {
		slog.Warn("xpra stop --all failed (may be none running)", "err", err)
	}
	m.displays.ResetAll()
	m.wsPorts.ResetAll()
	if count > 0 {
		slog.Info("orphan cleanup complete", "cleaned", count)
	}
}

View File

@@ -0,0 +1,107 @@
package session
import "testing"
func TestManagerCapacityCheck(t *testing.T) {
m := NewManager(Config{
MaxSessions: 2,
DisplayMin: 100,
DisplayMax: 105,
WSPortMin: 10100,
WSPortMax: 10105,
IdleTimeout: 600,
MaxLifetime: 7200,
WinBoxPath: "/usr/bin/winbox4",
BindAddr: "0.0.0.0",
})
if m.SessionCount() != 0 {
t.Fatal("expected 0 sessions")
}
if !m.HasCapacity() {
t.Fatal("expected capacity")
}
}
func TestManagerListEmpty(t *testing.T) {
m := NewManager(Config{
MaxSessions: 5,
DisplayMin: 100,
DisplayMax: 105,
WSPortMin: 10100,
WSPortMax: 10105,
IdleTimeout: 600,
MaxLifetime: 7200,
WinBoxPath: "/usr/bin/winbox4",
BindAddr: "0.0.0.0",
})
sessions := m.ListSessions()
if len(sessions) != 0 {
t.Fatalf("expected 0 sessions, got %d", len(sessions))
}
}
func TestTerminateNonExistentIsIdempotent(t *testing.T) {
m := NewManager(Config{
MaxSessions: 2,
DisplayMin: 100,
DisplayMax: 105,
WSPortMin: 10100,
WSPortMax: 10105,
IdleTimeout: 600,
MaxLifetime: 7200,
WinBoxPath: "/usr/bin/winbox4",
BindAddr: "0.0.0.0",
})
// Terminating a non-existent session should return nil (no error)
err := m.TerminateSession("does-not-exist")
if err != nil {
t.Fatalf("expected nil error for non-existent session, got: %v", err)
}
}
func TestGetNonExistentSessionReturnsError(t *testing.T) {
m := NewManager(Config{
MaxSessions: 2,
DisplayMin: 100,
DisplayMax: 105,
WSPortMin: 10100,
WSPortMax: 10105,
IdleTimeout: 600,
MaxLifetime: 7200,
WinBoxPath: "/usr/bin/winbox4",
BindAddr: "0.0.0.0",
})
_, err := m.GetSession("does-not-exist")
if err == nil {
t.Fatal("expected error for non-existent session, got nil")
}
}
func TestCleanupOrphansRunsWithoutError(t *testing.T) {
m := NewManager(Config{
MaxSessions: 2,
DisplayMin: 100,
DisplayMax: 105,
WSPortMin: 10100,
WSPortMax: 10105,
IdleTimeout: 600,
MaxLifetime: 7200,
WinBoxPath: "/usr/bin/winbox4",
BindAddr: "0.0.0.0",
})
// CleanupOrphans should not panic on a fresh manager with no sessions
m.CleanupOrphans()
// After cleanup, manager should still be functional
if !m.HasCapacity() {
t.Fatal("expected capacity after cleanup")
}
if m.SessionCount() != 0 {
t.Fatal("expected 0 sessions after cleanup")
}
sessions := m.ListSessions()
if len(sessions) != 0 {
t.Fatalf("expected empty session list after cleanup, got %d", len(sessions))
}
}

View File

@@ -0,0 +1,60 @@
package session
import (
"fmt"
"sync"
)
// Pool hands out integer IDs (display numbers, ports) from a fixed
// range, tracking which are currently in use. Safe for concurrent use.
type Pool struct {
	mu        sync.Mutex
	available []int        // free IDs, consumed from the front
	inUse     map[int]bool // IDs currently allocated
}
// NewPool creates a pool of the integer IDs min..max inclusive, all
// initially free.
func NewPool(min, max int) *Pool {
	ids := make([]int, 0, max-min+1)
	for id := min; id <= max; id++ {
		ids = append(ids, id)
	}
	return &Pool{
		available: ids,
		inUse:     make(map[int]bool),
	}
}
// Allocate takes the next free ID from the pool, or returns an error
// when the pool is exhausted.
func (p *Pool) Allocate() (int, error) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if len(p.available) == 0 {
		return 0, fmt.Errorf("pool exhausted")
	}
	// Hand out the oldest free ID (FIFO): released IDs rest before reuse.
	id, rest := p.available[0], p.available[1:]
	p.available = rest
	p.inUse[id] = true
	return id, nil
}
// Release returns id to the pool. IDs that were never allocated (or
// were already released) are ignored, so a double release cannot
// corrupt the free list.
func (p *Pool) Release(id int) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if !p.inUse[id] {
		return
	}
	p.available = append(p.available, id)
	delete(p.inUse, id)
}
// Available reports how many IDs are currently free.
func (p *Pool) Available() int {
	p.mu.Lock()
	free := len(p.available)
	p.mu.Unlock()
	return free
}
// ResetAll returns every in-use ID to the free list, leaving the pool
// as full as it was at creation. Used during orphan cleanup at startup.
func (p *Pool) ResetAll() {
	p.mu.Lock()
	defer p.mu.Unlock()
	for id := range p.inUse {
		p.available = append(p.available, id)
	}
	p.inUse = map[int]bool{}
}

View File

@@ -0,0 +1,48 @@
package session
import "testing"
// TestPoolAllocateAndRelease drains the pool, checks exhaustion, and
// verifies a released ID is handed out again.
func TestPoolAllocateAndRelease(t *testing.T) {
	p := NewPool(100, 105)
	allocated := make([]int, 0, 6)
	for i := 0; i < 6; i++ {
		n, err := p.Allocate()
		if err != nil {
			t.Fatalf("allocate %d: %v", i, err)
		}
		allocated = append(allocated, n)
	}
	if _, err := p.Allocate(); err == nil {
		t.Fatal("expected error on exhausted pool")
	}
	p.Release(allocated[0])
	n, err := p.Allocate()
	if err != nil {
		t.Fatalf("re-allocate: %v", err)
	}
	if n != allocated[0] {
		t.Fatalf("expected %d, got %d", allocated[0], n)
	}
}

// TestPoolAvailable checks the free count drops as IDs are allocated.
// Fix: the Allocate error is now checked — silently ignoring it would
// let a broken pool pass this test.
func TestPoolAvailable(t *testing.T) {
	p := NewPool(100, 102)
	if p.Available() != 3 {
		t.Fatalf("expected 3 available, got %d", p.Available())
	}
	if _, err := p.Allocate(); err != nil {
		t.Fatalf("allocate: %v", err)
	}
	if p.Available() != 2 {
		t.Fatalf("expected 2 available, got %d", p.Available())
	}
}

// TestPoolResetAll verifies ResetAll returns every in-use ID to the pool.
func TestPoolResetAll(t *testing.T) {
	p := NewPool(100, 102)
	for i := 0; i < 2; i++ {
		if _, err := p.Allocate(); err != nil {
			t.Fatalf("allocate %d: %v", i, err)
		}
	}
	p.ResetAll()
	if p.Available() != 3 {
		t.Fatalf("expected 3 after reset, got %d", p.Available())
	}
}

View File

@@ -0,0 +1,67 @@
package session
import (
"sync"
"time"
)
type State string
const (
StateCreating State = "creating"
StateActive State = "active"
StateGrace State = "grace"
StateTerminating State = "terminating"
StateTerminated State = "terminated"
StateFailed State = "failed"
)
type Session struct {
mu sync.Mutex
ID string `json:"id"`
TunnelHost string `json:"-"`
TunnelPort int `json:"-"`
Display int `json:"display"`
WSPort int `json:"ws_port"`
State State `json:"state"`
XpraPID int `json:"-"`
WinBoxPID int `json:"-"`
TmpDir string `json:"-"`
CreatedAt time.Time `json:"created_at"`
IdleTimeout time.Duration `json:"-"`
MaxLifetime time.Duration `json:"-"`
}
type CreateRequest struct {
SessionID string `json:"session_id"`
TunnelHost string `json:"tunnel_host"`
TunnelPort int `json:"tunnel_port"`
Username string `json:"username"`
Password string `json:"password"`
DisplayName string `json:"display_name"`
IdleTimeoutSec int `json:"idle_timeout_seconds"`
MaxLifetimeSec int `json:"max_lifetime_seconds"`
}
type CreateResponse struct {
WorkerSessionID string `json:"worker_session_id"`
Status State `json:"status"`
XpraWSPort int `json:"xpra_ws_port"`
ExpiresAt time.Time `json:"expires_at"`
MaxExpiresAt time.Time `json:"max_expires_at"`
}
// StatusResponse describes the current state of one session for status polls.
type StatusResponse struct {
	WorkerSessionID string `json:"worker_session_id"` // session identifier
	Status State `json:"status"` // current lifecycle state
	Display int `json:"display"` // X display number in use
	WSPort int `json:"ws_port"` // xpra websocket port
	CreatedAt time.Time `json:"created_at"` // session creation timestamp
	IdleSeconds int `json:"idle_seconds"` // reported idle time; -1 when unknown (see QueryIdleTime)
}
// ErrorResponse is the JSON error envelope returned on failed requests.
type ErrorResponse struct {
	Error string `json:"error"` // human-readable error message
	MaxSessions int `json:"max_sessions,omitempty"` // set when the worker is at capacity
}

View File

@@ -0,0 +1,161 @@
package session
import (
"context"
"fmt"
"log/slog"
"net"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
)
// XpraConfig carries everything StartXpra needs to launch one headless xpra
// server that wraps a single WinBox client process.
type XpraConfig struct {
	Display int // X display number (rendered as ":<n>")
	WSPort int // websocket port xpra binds for the HTML5 client
	BindAddr string // address the websocket is bound on
	TunnelHost string // tunnel endpoint host handed to WinBox
	TunnelPort int // tunnel endpoint port handed to WinBox
	Username string // WinBox login; ends up on the child command line
	Password string // WinBox password; ends up on the child command line (see StartXpra)
	TmpDir string // per-session scratch dir; becomes HOME and cwd for xpra
	WinBoxPath string // filesystem path to the WinBox executable
}
// StartXpra launches a headless xpra server on cfg.Display whose only child
// is WinBox connected to the configured tunnel endpoint, exposing the session
// over a websocket on cfg.BindAddr:cfg.WSPort. The xpra process is started in
// its own process group so the whole tree can be torn down with one signal
// (see KillXpraSession). stdout/stderr go to <TmpDir>/xpra.log.
//
// Returns the xpra *os.Process (group leader) or an error if the log file
// cannot be created or the process fails to start.
func StartXpra(cfg XpraConfig) (*os.Process, error) {
	display := fmt.Sprintf(":%d", cfg.Display)
	bindWS := fmt.Sprintf("%s:%d", cfg.BindAddr, cfg.WSPort)
	// SECURITY NOTE(review): username/password appear on the WinBox command
	// line and are therefore readable via /proc/<pid>/cmdline by anything
	// else in this PID namespace. Acceptable only while the worker container
	// is single-tenant — revisit if that ever changes.
	winboxCmd := fmt.Sprintf("%s %s:%d %s %s",
		cfg.WinBoxPath, cfg.TunnelHost, cfg.TunnelPort, cfg.Username, cfg.Password)
	args := []string{
		"start", display,
		"--bind-ws=" + bindWS,
		"--html=on",
		"--daemon=no",
		"--start-new-commands=no",
		// Lock down every sharing/IO channel: the session is view+control only.
		"--no-clipboard",
		"--no-printing",
		"--no-file-transfer",
		"--no-notifications",
		"--no-webcam",
		"--no-speaker",
		"--no-microphone",
		"--sharing=no",
		"--opengl=off",
		"--env=XPRA_CLIENT_CAN_SHUTDOWN=0",
		"--xvfb=Xvfb +extension GLX +extension Composite -screen 0 1280x800x24+32 -dpi 96 -nolisten tcp -noreset -auth /home/worker/.Xauthority",
		"--start-child=" + winboxCmd,
	}
	logFile := filepath.Join(cfg.TmpDir, "xpra.log")
	cmd := exec.Command("xpra", args...)
	cmd.Dir = cfg.TmpDir
	f, err := os.Create(logFile)
	if err != nil {
		return nil, fmt.Errorf("create xpra log: %w", err)
	}
	// The child receives its own duplicate of the descriptor when it starts,
	// so the parent's handle must be closed in all paths — otherwise we leak
	// one fd per session (and one on every failed Start).
	defer f.Close()
	cmd.Stdout = f
	cmd.Stderr = f
	cmd.Env = append(os.Environ(),
		"HOME="+cfg.TmpDir,
		"DISPLAY="+display,
		"XPRA_CLIENT_CAN_SHUTDOWN=0",
		"LIBGL_ALWAYS_SOFTWARE=1", // force llvmpipe software rendering; no GPU in the worker
		"GALLIUM_DRIVER=llvmpipe",
	)
	// New process group: lets KillXpraSession signal xpra + Xvfb + WinBox at once.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("xpra start failed: %w", err)
	}
	return cmd.Process, nil
}
func WaitForXpraReady(ctx context.Context, bindAddr string, wsPort int, timeout time.Duration) error {
addr := fmt.Sprintf("%s:%d", bindAddr, wsPort)
deadline := time.After(timeout)
ticker := time.NewTicker(250 * time.Millisecond)
defer ticker.Stop()
for {
select {
case <-deadline:
return fmt.Errorf("xpra not ready after %s", timeout)
case <-ctx.Done():
return ctx.Err()
case <-ticker.C:
conn, err := (&net.Dialer{Timeout: 200 * time.Millisecond}).DialContext(ctx, "tcp", addr)
if err == nil {
conn.Close()
return nil
}
}
}
}
// QueryIdleTime asks `xpra info` for the idle_time of the given display and
// returns it in seconds. It returns -1 when xpra cannot be queried (missing
// binary, dead display, 3s timeout) or the field is absent or unparsable.
func QueryIdleTime(display int) int {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()
	out, err := exec.CommandContext(ctx, "xpra", "info", fmt.Sprintf(":%d", display)).Output()
	if err != nil {
		return -1
	}
	const prefix = "idle_time="
	for _, raw := range strings.Split(string(out), "\n") {
		trimmed := strings.TrimSpace(raw)
		if !strings.HasPrefix(trimmed, prefix) {
			continue
		}
		if secs, convErr := strconv.Atoi(strings.TrimPrefix(trimmed, prefix)); convErr == nil {
			return secs
		}
	}
	return -1
}
// KillXpraSession terminates the xpra process group rooted at pid (started
// with Setpgid, so -pid addresses the whole tree): it sends SIGTERM to the
// group, waits up to 5 seconds for the direct child to be reaped, and
// escalates to SIGKILL on timeout. Returns nil once the group is gone.
func KillXpraSession(pid int) error {
	if err := syscall.Kill(-pid, syscall.SIGTERM); err != nil {
		if err == syscall.ESRCH {
			// Process group no longer exists — already exited and reaped.
			// Nothing to wait for; skip the 5s grace window entirely.
			return nil
		}
		slog.Warn("SIGTERM to xpra process group failed", "pid", pid, "err", err)
	}
	done := make(chan struct{})
	go func() {
		// Wait only reaps our own direct children; for a non-child pid it
		// returns an error immediately, which still closes done — the
		// SIGTERM above has already been delivered either way.
		if proc, err := os.FindProcess(pid); err == nil {
			proc.Wait()
		}
		close(done)
	}()
	select {
	case <-done:
		return nil
	case <-time.After(5 * time.Second):
		slog.Warn("SIGKILL to xpra process group", "pid", pid)
		return syscall.Kill(-pid, syscall.SIGKILL)
	}
}
// CleanupTmpDir removes a per-session scratch directory. The path is
// canonicalized with filepath.Clean BEFORE the containment check, so
// traversal sequences such as "/tmp/winbox-sessions/../etc" (which pass a
// naive prefix test yet resolve outside the root) are refused, as is the
// root directory itself. Removing a path that does not exist is not an
// error (os.RemoveAll semantics).
func CleanupTmpDir(dir string) error {
	const root = "/tmp/winbox-sessions/"
	cleaned := filepath.Clean(dir)
	// After Clean, ".." components are collapsed and trailing slashes are
	// gone, so a prefix match guarantees the target is strictly inside root.
	if dir == "" || !strings.HasPrefix(cleaned, root) {
		return fmt.Errorf("refusing to remove suspicious path: %s", dir)
	}
	return os.RemoveAll(cleaned)
}
func CreateSessionTmpDir(sessionID string) (string, error) {
dir := filepath.Join("/tmp/winbox-sessions", sessionID)
if err := os.MkdirAll(dir, 0700); err != nil {
return "", fmt.Errorf("create tmpdir: %w", err)
}
return dir, nil
}