Files
the-other-dude/backend/alembic/versions/036_performance_indexes.py
Jason Staack 1042319a08 perf: fix API CPU saturation at 400+ devices
Root cause: stale NATS JetStream consumers accumulated across API
restarts, causing 13+ consumers to fight over messages in a single
Python async event loop (100% CPU).

Fixes:
- Add performance indexes on devices(tenant_id, hostname),
  devices(tenant_id, status), key_access_log(tenant_id, created_at)
  — drops devices seq_scans from 402k to 6 per interval
- Remove redundant ORDER BY t.name from fleet summary SQL
  (tenant name sort is client-side, was forcing a cross-table sort)
- Bump NATS memory limit from 128MB to 256MB (was at 118/128)
- Increase dev poll interval from 60s to 120s for 400+ device fleet

The stream purge + restart brought API CPU from 100% to 0.3%.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 18:06:40 -05:00

52 lines
1.4 KiB
Python

"""Add performance indexes for fleet dashboard and link discovery queries.
Revision ID: 036
Revises: 035
Create Date: 2026-03-19
At ~400 devices, sequential scans on the devices table accounted for 58M
row reads. These indexes cover the hot query paths: fleet summary
(tenant_id + hostname sort), dashboard status counts (tenant_id + status),
and key_access_log time-range queries.
"""
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers.
revision = "036"
down_revision = "035"  # parent revision this migration applies on top of
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Create the three composite indexes introduced by this revision.

    Each index is requested with ``if_not_exists=True``. The indexes are
    created in the order listed below.
    """
    index_specs = (
        # Fleet summary query: SELECT ... FROM devices JOIN tenants ORDER BY hostname
        ("idx_devices_tenant_hostname", "devices", ["tenant_id", "hostname"]),
        # Dashboard status count queries
        ("idx_devices_tenant_status", "devices", ["tenant_id", "status"]),
        # key_access_log: growing unbounded, queried by tenant + time range.
        # Textual column expressions are used so the DESC ordering on
        # created_at becomes part of the index definition.
        (
            "idx_key_access_log_tenant_time",
            "key_access_log",
            [sa.text("tenant_id"), sa.text("created_at DESC")],
        ),
    )

    for index_name, table_name, columns in index_specs:
        op.create_index(index_name, table_name, columns, if_not_exists=True)
def downgrade() -> None:
    """Drop the indexes created by ``upgrade()``, in reverse creation order.

    ``if_exists=True`` mirrors the ``if_not_exists=True`` used when the
    indexes are created, so the downgrade does not fail when an index is
    already absent (for example after a manual drop).
    """
    op.drop_index(
        "idx_key_access_log_tenant_time",
        table_name="key_access_log",
        if_exists=True,
    )
    op.drop_index(
        "idx_devices_tenant_status",
        table_name="devices",
        if_exists=True,
    )
    op.drop_index(
        "idx_devices_tenant_hostname",
        table_name="devices",
        if_exists=True,
    )