Files
the-other-dude/docker-compose.override.yml
Jason Staack 1042319a08 perf: fix API CPU saturation at 400+ devices
Root cause: stale NATS JetStream consumers accumulated across API
restarts, causing 13+ consumers to fight over messages in a single
Python async event loop (100% CPU).

Fixes:
- Add performance indexes on devices(tenant_id, hostname),
  devices(tenant_id, status), key_access_log(tenant_id, created_at)
  — drops devices seq_scans from 402k to 6 per interval
- Remove redundant ORDER BY t.name from fleet summary SQL
  (tenant name sort is client-side, was forcing a cross-table sort)
- Bump NATS memory limit from 128MB to 256MB (was at 118/128)
- Increase dev poll interval from 60s to 120s for 400+ device fleet

The stream purge + restart brought API CPU from 100% to 0.3%.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 18:06:40 -05:00

146 lines
4.2 KiB
YAML

# docker-compose.override.yml -- Dev environment (auto-loaded by `docker compose up`)
# Adds application services with hot reload, debug logging, and dev defaults.
services:
  # FastAPI backend — dev override: single worker, debug logging, source
  # bind-mounted for hot reload.
  api:
    build:
      context: .
      dockerfile: infrastructure/docker/Dockerfile.api
    container_name: tod_api
    restart: on-failure
    ports:
      - "8001:8000"
    env_file: .env
    environment:
      ENVIRONMENT: dev
      LOG_LEVEL: debug
      DEBUG: "true"
      GUNICORN_WORKERS: "1"
      DATABASE_URL: postgresql+asyncpg://postgres:postgres@postgres:5432/tod
      SYNC_DATABASE_URL: postgresql+psycopg2://postgres:postgres@postgres:5432/tod
      APP_USER_DATABASE_URL: postgresql+asyncpg://app_user:app_password@postgres:5432/tod
      REDIS_URL: redis://redis:6379/0
      NATS_URL: nats://nats:4222
      FIRST_ADMIN_EMAIL: ${FIRST_ADMIN_EMAIL:-admin@the-other-dude.dev}
      FIRST_ADMIN_PASSWORD: ${FIRST_ADMIN_PASSWORD:-changeme-in-production}
      # Required secrets — interpolation aborts `docker compose up` if unset.
      CREDENTIAL_ENCRYPTION_KEY: ${CREDENTIAL_ENCRYPTION_KEY:?Set CREDENTIAL_ENCRYPTION_KEY in .env}
      JWT_SECRET_KEY: ${JWT_SECRET_KEY:?Set JWT_SECRET_KEY in .env}
      OPENBAO_ADDR: http://openbao:8200
      OPENBAO_TOKEN: dev-openbao-token
      GIT_STORE_PATH: /data/git-store
      WIREGUARD_CONFIG_PATH: /data/wireguard
      WIREGUARD_GATEWAY: wireguard
    # NET_ADMIN + root are needed only for the `ip route add` below; the app
    # itself is dropped back to `appuser` before gunicorn starts.
    cap_add:
      - NET_ADMIN
    user: root
    # Folded scalar: lines join into one `sh -c` string. `$$` is Compose's
    # escape for a literal `$`, so expansion happens in the container shell,
    # not at compose-file interpolation time. Resolves the wireguard service's
    # IP and routes the 10.10.0.0/16 VPN subnet through it, then execs
    # gunicorn as the unprivileged user.
    command: >
      sh -c "
      if [ -n \"$$WIREGUARD_GATEWAY\" ]; then
      apt-get update -qq && apt-get install -y -qq iproute2 >/dev/null 2>&1 || true;
      GW_IP=$$(getent hosts $$WIREGUARD_GATEWAY 2>/dev/null | awk '{print $$1}');
      [ -z \"$$GW_IP\" ] && GW_IP=$$WIREGUARD_GATEWAY;
      ip route add 10.10.0.0/16 via $$GW_IP 2>/dev/null || true;
      echo VPN route: 10.10.0.0/16 via $$GW_IP;
      fi;
      exec su -s /bin/sh appuser -c 'gunicorn app.main:app --config gunicorn.conf.py'
      "
    volumes:
      - ./backend:/app
      - ./docker-data/git-store:/data/git-store
      - ./docker-data/firmware-cache:/data/firmware-cache
      - ./docker-data/wireguard:/data/wireguard
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      nats:
        condition: service_healthy
    deploy:
      resources:
        limits:
          memory: 512M
    networks:
      - tod
      - tod_remote_worker
poller:
build:
context: ./poller
dockerfile: ./Dockerfile
container_name: tod_poller
restart: on-failure
env_file: .env
environment:
ENVIRONMENT: dev
LOG_LEVEL: debug
DATABASE_URL: postgres://poller_user:poller_password@postgres:5432/tod
REDIS_URL: redis://redis:6379/0
NATS_URL: nats://nats:4222
CREDENTIAL_ENCRYPTION_KEY: ${CREDENTIAL_ENCRYPTION_KEY:?Set CREDENTIAL_ENCRYPTION_KEY in .env}
OPENBAO_ADDR: http://openbao:8200
OPENBAO_TOKEN: dev-openbao-token
POLL_INTERVAL_SECONDS: 120
WIREGUARD_GATEWAY: wireguard
TUNNEL_PORT_MIN: 49000
TUNNEL_PORT_MAX: 49004
TUNNEL_IDLE_TIMEOUT: 300
SSH_RELAY_PORT: 8080
SSH_IDLE_TIMEOUT: 900
SSH_MAX_SESSIONS: 200
SSH_MAX_PER_USER: 10
SSH_MAX_PER_DEVICE: 20
ports:
- "49000-49004:49000-49004"
ulimits:
nofile:
soft: 8192
hard: 8192
cap_add:
- NET_ADMIN
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
nats:
condition: service_healthy
healthcheck:
test: ["CMD-SHELL", "wget --spider -q http://localhost:8080/healthz || exit 1"]
interval: 30s
timeout: 3s
retries: 3
deploy:
resources:
limits:
memory: 256M
networks:
- tod
- tod_remote_worker
winbox-worker:
environment:
LOG_LEVEL: debug
MAX_CONCURRENT_SESSIONS: 5
deploy:
resources:
limits:
memory: 512M
restart: on-failure
frontend:
build:
context: .
dockerfile: infrastructure/docker/Dockerfile.frontend
container_name: tod_frontend
ports:
- "3000:80"
depends_on:
- api
deploy:
resources:
limits:
memory: 64M
networks:
- tod