From 1042319a082df2e01876cbae0dd30415ee543886 Mon Sep 17 00:00:00 2001 From: Jason Staack Date: Thu, 19 Mar 2026 18:06:40 -0500 Subject: [PATCH] perf: fix API CPU saturation at 400+ devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: stale NATS JetStream consumers accumulated across API restarts, causing 13+ consumers to fight over messages in a single Python async event loop (100% CPU). Fixes: - Add performance indexes on devices(tenant_id, hostname), devices(tenant_id, status), key_access_log(tenant_id, created_at) — drops devices seq_scans from 402k to 6 per interval - Remove redundant ORDER BY t.name from fleet summary SQL (tenant name sort is client-side, was forcing a cross-table sort) - Bump NATS memory limit from 128MB to 256MB (was at 118/128) - Increase dev poll interval from 60s to 120s for 400+ device fleet The stream purge + restart brought API CPU from 100% to 0.3%. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../versions/036_performance_indexes.py | 51 +++++++++++++++++++ backend/app/routers/metrics.py | 2 +- docker-compose.override.yml | 2 +- docker-compose.yml | 2 +- 4 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 backend/alembic/versions/036_performance_indexes.py diff --git a/backend/alembic/versions/036_performance_indexes.py b/backend/alembic/versions/036_performance_indexes.py new file mode 100644 index 0000000..9b7365c --- /dev/null +++ b/backend/alembic/versions/036_performance_indexes.py @@ -0,0 +1,51 @@ +"""Add performance indexes for fleet dashboard and link discovery queries. + +Revision ID: 036 +Revises: 035 +Create Date: 2026-03-19 + +At ~400 devices, sequential scans on the devices table accounted for 58M +row reads. These indexes cover the hot query paths: fleet summary +(tenant_id + hostname sort), dashboard status counts (tenant_id + status), +and key_access_log time-range queries. 
+""" + +import sqlalchemy as sa +from alembic import op + +revision = "036" +down_revision = "035" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Fleet summary query: SELECT ... FROM devices JOIN tenants ORDER BY hostname + op.create_index( + "idx_devices_tenant_hostname", + "devices", + ["tenant_id", "hostname"], + if_not_exists=True, + ) + + # Dashboard status count queries + op.create_index( + "idx_devices_tenant_status", + "devices", + ["tenant_id", "status"], + if_not_exists=True, + ) + + # key_access_log: growing unbounded, queried by tenant + time range + op.create_index( + "idx_key_access_log_tenant_time", + "key_access_log", + [sa.text("tenant_id"), sa.text("created_at DESC")], + if_not_exists=True, + ) + + +def downgrade() -> None:  # reverse of upgrade(); tolerates already-missing indexes + op.drop_index("idx_key_access_log_tenant_time", table_name="key_access_log", if_exists=True) + op.drop_index("idx_devices_tenant_status", table_name="devices", if_exists=True) + op.drop_index("idx_devices_tenant_hostname", table_name="devices", if_exists=True) diff --git a/backend/app/routers/metrics.py b/backend/app/routers/metrics.py index fce20df..270675e 100644 --- a/backend/app/routers/metrics.py +++ b/backend/app/routers/metrics.py @@ -363,7 +363,7 @@ _FLEET_SUMMARY_SQL = """ d.tenant_id, t.name AS tenant_name FROM devices d JOIN tenants t ON d.tenant_id = t.id - ORDER BY t.name, d.hostname + ORDER BY d.hostname """ diff --git a/docker-compose.override.yml b/docker-compose.override.yml index be4da3a..9bcf228 100644 --- a/docker-compose.override.yml +++ b/docker-compose.override.yml @@ -80,7 +80,7 @@ services: CREDENTIAL_ENCRYPTION_KEY: ${CREDENTIAL_ENCRYPTION_KEY:?Set CREDENTIAL_ENCRYPTION_KEY in .env} OPENBAO_ADDR: http://openbao:8200 OPENBAO_TOKEN: dev-openbao-token - POLL_INTERVAL_SECONDS: 60 + POLL_INTERVAL_SECONDS: 120 WIREGUARD_GATEWAY: wireguard TUNNEL_PORT_MIN: 49000 TUNNEL_PORT_MAX: 49004 diff --git a/docker-compose.yml b/docker-compose.yml index c86ac0c..a264f8e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,7 
+73,7 @@ services: deploy: resources: limits: - memory: 128M + memory: 256M networks: - tod