perf: fix API CPU saturation at 400+ devices
Root cause: stale NATS JetStream consumers accumulated across API restarts, causing 13+ consumers to fight over messages in a single Python async event loop (100% CPU). Fixes: - Add performance indexes on devices(tenant_id, hostname), devices(tenant_id, status), key_access_log(tenant_id, created_at) — drops devices seq_scans from 402k to 6 per interval - Remove redundant ORDER BY t.name from fleet summary SQL (tenant name sort is client-side, was forcing a cross-table sort) - Bump NATS memory limit from 128MB to 256MB (was at 118/128) - Increase dev poll interval from 60s to 120s for 400+ device fleet The stream purge + restart brought API CPU from 100% to 0.3%. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
51
backend/alembic/versions/036_performance_indexes.py
Normal file
51
backend/alembic/versions/036_performance_indexes.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
"""Add performance indexes for fleet dashboard and link discovery queries.
|
||||||
|
|
||||||
|
Revision ID: 036
|
||||||
|
Revises: 035
|
||||||
|
Create Date: 2026-03-19
|
||||||
|
|
||||||
|
At ~400 devices, sequential scans on the devices table accounted for 58M
|
||||||
|
row reads. These indexes cover the hot query paths: fleet summary
|
||||||
|
(tenant_id + hostname sort), dashboard status counts (tenant_id + status),
|
||||||
|
and key_access_log time-range queries.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision = "036"
|
||||||
|
down_revision = "035"
|
||||||
|
branch_labels = None
|
||||||
|
depends_on = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create covering indexes for the fleet-dashboard and audit-log hot paths.

    Each index targets one hot query path observed at ~400 devices.
    ``if_not_exists=True`` makes the migration safe to re-run against
    databases where the indexes were already created out of band.
    """
    # (index name, table, columns) — applied in order.
    index_specs = [
        # Fleet summary query: SELECT ... FROM devices JOIN tenants ORDER BY hostname
        ("idx_devices_tenant_hostname", "devices", ["tenant_id", "hostname"]),
        # Dashboard status count queries
        ("idx_devices_tenant_status", "devices", ["tenant_id", "status"]),
        # key_access_log: growing unbounded, queried by tenant + time range
        (
            "idx_key_access_log_tenant_time",
            "key_access_log",
            [sa.text("tenant_id"), sa.text("created_at DESC")],
        ),
    ]
    for index_name, table_name, columns in index_specs:
        op.create_index(index_name, table_name, columns, if_not_exists=True)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the performance indexes created by this revision (reverse order).

    Uses ``if_exists=True`` for symmetry with ``upgrade()``'s
    ``if_not_exists=True``: a partially-applied upgrade (or an index dropped
    manually) must not make the downgrade fail.
    """
    op.drop_index(
        "idx_key_access_log_tenant_time", table_name="key_access_log", if_exists=True
    )
    op.drop_index("idx_devices_tenant_status", table_name="devices", if_exists=True)
    op.drop_index("idx_devices_tenant_hostname", table_name="devices", if_exists=True)
||||||
@@ -363,7 +363,7 @@ _FLEET_SUMMARY_SQL = """
|
|||||||
d.tenant_id, t.name AS tenant_name
|
d.tenant_id, t.name AS tenant_name
|
||||||
FROM devices d
|
FROM devices d
|
||||||
JOIN tenants t ON d.tenant_id = t.id
|
JOIN tenants t ON d.tenant_id = t.id
|
||||||
ORDER BY t.name, d.hostname
|
ORDER BY d.hostname
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ services:
|
|||||||
CREDENTIAL_ENCRYPTION_KEY: ${CREDENTIAL_ENCRYPTION_KEY:?Set CREDENTIAL_ENCRYPTION_KEY in .env}
|
CREDENTIAL_ENCRYPTION_KEY: ${CREDENTIAL_ENCRYPTION_KEY:?Set CREDENTIAL_ENCRYPTION_KEY in .env}
|
||||||
OPENBAO_ADDR: http://openbao:8200
|
OPENBAO_ADDR: http://openbao:8200
|
||||||
OPENBAO_TOKEN: dev-openbao-token
|
OPENBAO_TOKEN: dev-openbao-token
|
||||||
POLL_INTERVAL_SECONDS: 60
|
POLL_INTERVAL_SECONDS: 120
|
||||||
WIREGUARD_GATEWAY: wireguard
|
WIREGUARD_GATEWAY: wireguard
|
||||||
TUNNEL_PORT_MIN: 49000
|
TUNNEL_PORT_MIN: 49000
|
||||||
TUNNEL_PORT_MAX: 49004
|
TUNNEL_PORT_MAX: 49004
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ services:
|
|||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
memory: 128M
|
memory: 256M
|
||||||
networks:
|
networks:
|
||||||
- tod
|
- tod
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user