Files
the-other-dude/poller/internal/bus/publisher.go
Jason Staack e1d81b40ac fix: cap NATS JetStream streams to prevent OOM crash
WIRELESS_REGISTRATIONS stream had a 256MB MaxBytes cap in a 256MB
container — guaranteed to crash under load. ALERT_EVENTS and
OPERATION_EVENTS had no byte limit at all.

- Reduce WIRELESS_REGISTRATIONS MaxBytes from 256MB to 128MB
- Add 16MB MaxBytes cap to ALERT_EVENTS and OPERATION_EVENTS
- Bump NATS container memory limit from 256MB to 384MB
- Add restart: unless-stopped to NATS in base compose
- Bump version to 9.8.2

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-23 07:52:07 -05:00

552 lines
19 KiB
Go

// Package bus provides NATS JetStream publishing for device events.
package bus
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"time"
"github.com/nats-io/nats.go"
"github.com/nats-io/nats.go/jetstream"
"github.com/staack/the-other-dude/poller/internal/device"
)
// DeviceStatusEvent is the JSON payload published to NATS JetStream each time
// a device is polled. Subscribing to "device.status.>" delivers every such
// event.
//
// DeviceID, TenantID, Status and LastSeen are always serialized; all other
// fields are dropped from the JSON when they hold their zero value.
type DeviceStatusEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`

	// Status is "online" or "offline".
	Status string `json:"status"`

	// SoftwareVersion is parsed from sysDescr for SNMP devices.
	SoftwareVersion string `json:"software_version,omitempty"`

	// Optional device details.
	RouterOSVersion string `json:"routeros_version,omitempty"`
	MajorVersion    int    `json:"major_version,omitempty"`
	BoardName       string `json:"board_name,omitempty"`
	Architecture    string `json:"architecture,omitempty"`
	Uptime          string `json:"uptime,omitempty"`
	CPULoad         string `json:"cpu_load,omitempty"`
	FreeMemory      string `json:"free_memory,omitempty"`
	TotalMemory     string `json:"total_memory,omitempty"`
	SerialNumber    string `json:"serial_number,omitempty"`
	FirmwareVersion string `json:"firmware_version,omitempty"`

	// LastSeen is an RFC3339 timestamp.
	LastSeen string `json:"last_seen"`
}
// DeviceMetricsEvent carries the metric data gathered from a RouterOS device
// during one poll cycle and is the JSON payload published to NATS JetStream.
//
// The subject is "device.metrics.{type}.{device_id}", with type being one of
// "health", "interfaces", or "wireless". Only the payload field that matches
// the type is expected to be set; the unset ones are omitted from the JSON.
type DeviceMetricsEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`

	// CollectedAt is an RFC3339 timestamp.
	CollectedAt string `json:"collected_at"`

	// Type is "health", "interfaces", or "wireless".
	Type string `json:"type"`

	// Per-type payloads; a nil or empty one is left out of the JSON.
	Health     *device.HealthMetrics   `json:"health,omitempty"`
	Interfaces []device.InterfaceStats `json:"interfaces,omitempty"`
	Wireless   []device.WirelessStats  `json:"wireless,omitempty"`
}
// ConfigChangedEvent is the JSON payload published when a device's
// configuration is found to have changed out-of-band. It carries the previous
// and the new timestamp as strings; all four fields are always serialized.
type ConfigChangedEvent struct {
	DeviceID     string `json:"device_id"`
	TenantID     string `json:"tenant_id"`
	OldTimestamp string `json:"old_timestamp"`
	NewTimestamp string `json:"new_timestamp"`
}
// PushRollbackEvent is the JSON payload that triggers an automatic rollback
// for a template push. It identifies the device, its tenant, the push
// operation, and the commit SHA recorded before the push.
type PushRollbackEvent struct {
	DeviceID         string `json:"device_id"`
	TenantID         string `json:"tenant_id"`
	PushOperationID  string `json:"push_operation_id"`
	PrePushCommitSHA string `json:"pre_push_commit_sha"`
}
// ConfigSnapshotEvent is the JSON payload published to NATS JetStream after a
// config backup has been collected from a device. The backend listens on
// "config.snapshot.>" to store the snapshot and compute diffs.
//
// RouterOSVersion is the only optional field; it is omitted when empty.
type ConfigSnapshotEvent struct {
	DeviceID        string `json:"device_id"`
	TenantID        string `json:"tenant_id"`
	RouterOSVersion string `json:"routeros_version,omitempty"`

	// CollectedAt is an RFC3339 timestamp.
	CollectedAt string `json:"collected_at"`

	SHA256Hash           string `json:"sha256_hash"`
	ConfigText           string `json:"config_text"`
	NormalizationVersion int    `json:"normalization_version"`
}
// PushAlertEvent is the JSON payload that raises an alert for an editor push,
// which is what enables one-click rollback. All fields are always serialized.
type PushAlertEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`
	PushType string `json:"push_type"`
}
// Publisher bundles a NATS connection with the JetStream handle created on
// top of it and is the receiver for all device event publishing methods.
type Publisher struct {
	// nc is the raw NATS connection; it is exposed through Conn and drained
	// by Close.
	nc *nats.Conn

	// js is the JetStream handle every Publish* method sends through.
	js jetstream.JetStream
}
// NewPublisher dials NATS at natsURL, opens a JetStream context on the
// connection, and makes sure the two streams the poller publishes to exist
// before returning a ready Publisher.
//
// DEVICE_EVENTS (24h max age, 64MB cap) is bound to an explicit subject list
// rather than "device.>" so that it does not capture device.cmd.*, which
// CmdResponder uses for core NATS request-reply. WIRELESS_REGISTRATIONS
// (30-day max age, 128MB cap) carries "wireless.registrations.>" for
// per-client wireless data. Both streams discard their oldest messages once
// full.
//
// The connection is configured for unlimited reconnect attempts spaced
// 2 seconds apart so the poller survives transient NATS restarts. If any
// setup step after connecting fails, the connection is closed and a wrapped
// error is returned.
func NewPublisher(natsURL string) (*Publisher, error) {
	const (
		reconnectWait      = 2 * time.Second
		streamSetupTimeout = 10 * time.Second
	)

	connOpts := []nats.Option{
		nats.MaxReconnects(-1),
		nats.ReconnectWait(reconnectWait),
		nats.DisconnectErrHandler(func(_ *nats.Conn, cause error) {
			slog.Warn("NATS disconnected", "error", cause)
		}),
		nats.ReconnectHandler(func(conn *nats.Conn) {
			slog.Info("NATS reconnected", "url", conn.ConnectedUrl())
		}),
	}

	nc, err := nats.Connect(natsURL, connOpts...)
	if err != nil {
		return nil, fmt.Errorf("connecting to NATS at %s: %w", natsURL, err)
	}

	js, err := jetstream.New(nc)
	if err != nil {
		nc.Close()
		return nil, fmt.Errorf("creating JetStream context: %w", err)
	}

	// ensure creates or updates a single stream under its own deadline.
	// CreateOrUpdateStream is idempotent, so this is safe on every start.
	ensure := func(cfg jetstream.StreamConfig) error {
		ctx, cancel := context.WithTimeout(context.Background(), streamSetupTimeout)
		defer cancel()
		_, ensureErr := js.CreateOrUpdateStream(ctx, cfg)
		return ensureErr
	}

	// Subjects are explicit (not "device.>") to avoid capturing device.cmd.*
	// which is used by CmdResponder for core NATS request-reply.
	deviceSubjects := []string{
		"device.status.>",
		"device.metrics.>",
		"device.interfaces.>",
		"device.firmware.>",
		"device.credential_changed.>",
		"config.changed.>",
		"config.snapshot.>",
		"config.push.rollback.>",
		"config.push.alert.>",
		"audit.session.end.>",
	}

	// 64MB cap: the oldest messages are discarded when the stream is full.
	deviceEvents := jetstream.StreamConfig{
		Name:     "DEVICE_EVENTS",
		Subjects: deviceSubjects,
		MaxAge:   24 * time.Hour,
		MaxBytes: 64 * 1024 * 1024,
		Discard:  jetstream.DiscardOld,
	}
	if setupErr := ensure(deviceEvents); setupErr != nil {
		nc.Close()
		return nil, fmt.Errorf("ensuring DEVICE_EVENTS stream: %w", setupErr)
	}
	slog.Info("NATS JetStream DEVICE_EVENTS stream ready")

	// Kept as a separate stream with 30-day retention (vs 24h for
	// DEVICE_EVENTS) to support historical client analytics and hypertable
	// ingestion. Capped at 128MB.
	wirelessRegistrations := jetstream.StreamConfig{
		Name:     "WIRELESS_REGISTRATIONS",
		Subjects: []string{"wireless.registrations.>"},
		MaxAge:   30 * 24 * time.Hour,
		MaxBytes: 128 * 1024 * 1024,
		Discard:  jetstream.DiscardOld,
	}
	if setupErr := ensure(wirelessRegistrations); setupErr != nil {
		nc.Close()
		return nil, fmt.Errorf("ensuring WIRELESS_REGISTRATIONS stream: %w", setupErr)
	}
	slog.Info("NATS JetStream WIRELESS_REGISTRATIONS stream ready")

	return &Publisher{nc: nc, js: js}, nil
}
// PublishStatus JSON-encodes event and sends it through JetStream on the
// subject "device.status.{DeviceID}". Consumers can follow a single device or
// use "device.status.>" to receive every status event.
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry is written and nil is returned.
func (p *Publisher) PublishStatus(ctx context.Context, event DeviceStatusEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling event: %w", err)
	}

	subj := fmt.Sprintf("device.status.%s", event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published device status event",
		"device_id", event.DeviceID,
		"status", event.Status,
		"subject", subj,
	)
	return nil
}
// PublishMetrics JSON-encodes event and sends it through JetStream on the
// subject "device.metrics.{type}.{device_id}", built from event.Type and
// event.DeviceID. Consumers can take all metrics via "device.metrics.>" or
// narrow the subscription by type.
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry is written and nil is returned.
func (p *Publisher) PublishMetrics(ctx context.Context, event DeviceMetricsEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling metrics event: %w", err)
	}

	subj := fmt.Sprintf("device.metrics.%s.%s", event.Type, event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published device metrics event",
		"device_id", event.DeviceID,
		"type", event.Type,
		"subject", subj,
	)
	return nil
}
// DeviceInterfaceEvent is the JSON payload published to the DEVICE_EVENTS
// NATS stream once interface identity data (name, MAC, type, running) has been
// collected from a device. The link discovery system relies on the MAC
// addresses to work out which managed device owns each end of a wireless link.
type DeviceInterfaceEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`

	// CollectedAt is an RFC3339 timestamp.
	CollectedAt string `json:"collected_at"`

	// Interfaces is always serialized (the tag carries no omitempty).
	Interfaces []device.InterfaceInfo `json:"interfaces"`
}
// PublishDeviceInterfaces JSON-encodes event and sends it through JetStream
// on the subject "device.interfaces.{device_id}" for use by link discovery.
// Consumers can subscribe to all interface data or filter by device.
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry (including the number of interfaces) is written and nil is
// returned.
func (p *Publisher) PublishDeviceInterfaces(ctx context.Context, event DeviceInterfaceEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling device interface event: %w", err)
	}

	subj := fmt.Sprintf("device.interfaces.%s", event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published device interface event",
		"device_id", event.DeviceID,
		"interfaces", len(event.Interfaces),
		"subject", subj,
	)
	return nil
}
// PublishWirelessRegistrations JSON-encodes event, which holds per-client
// wireless registration data and RF monitor stats, and sends it through
// JetStream on the subject "wireless.registrations.{device_id}". That subject
// belongs to the WIRELESS_REGISTRATIONS stream; consumers can subscribe to all
// wireless data or filter by device.
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry with the registration and RF stat counts is written and nil is
// returned.
func (p *Publisher) PublishWirelessRegistrations(ctx context.Context, event WirelessRegistrationEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling wireless registration event: %w", err)
	}

	subj := fmt.Sprintf("wireless.registrations.%s", event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published wireless registration event",
		"device_id", event.DeviceID,
		"registrations", len(event.Registrations),
		"rf_stats", len(event.RFStats),
		"subject", subj,
	)
	return nil
}
// DeviceFirmwareEvent is the JSON payload published to NATS JetStream when
// the poller checks a device's firmware update status (rate-limited to once
// per day per device).
//
// LatestVersion and Channel are omitted from the JSON when empty; every other
// field is always serialized.
type DeviceFirmwareEvent struct {
	DeviceID         string `json:"device_id"`
	TenantID         string `json:"tenant_id"`
	InstalledVersion string `json:"installed_version"`
	LatestVersion    string `json:"latest_version,omitempty"`
	Channel          string `json:"channel,omitempty"`
	Status           string `json:"status"`
	Architecture     string `json:"architecture"`
}
// PublishFirmware JSON-encodes event and sends it through JetStream on the
// subject "device.firmware.{DeviceID}", where the Python firmware subscriber
// picks it up to update the firmware_versions table.
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry with the installed and latest versions is written and nil is
// returned.
func (p *Publisher) PublishFirmware(ctx context.Context, event DeviceFirmwareEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling firmware event: %w", err)
	}

	subj := fmt.Sprintf("device.firmware.%s", event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published device firmware event",
		"device_id", event.DeviceID,
		"installed", event.InstalledVersion,
		"latest", event.LatestVersion,
		"subject", subj,
	)
	return nil
}
// PublishConfigChanged publishes a config change event for a device.
//
// The event is JSON-encoded and sent to "config.changed.{TenantID}.{DeviceID}"
// so the Python backend can trigger event-driven backups when out-of-band
// changes are detected.
//
// It returns a wrapped error if marshalling or publishing fails. The publish
// error names the subject, matching the other Publish* methods in this file,
// so a failure can be traced to the affected tenant and device. On success a
// debug log entry is written and nil is returned.
func (p *Publisher) PublishConfigChanged(ctx context.Context, event ConfigChangedEvent) error {
	data, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshal config changed event: %w", err)
	}

	subject := fmt.Sprintf("config.changed.%s.%s", event.TenantID, event.DeviceID)
	if _, err := p.js.Publish(ctx, subject, data); err != nil {
		// Include the subject: it is the only place the tenant/device pair
		// appears in the returned error.
		return fmt.Errorf("publishing to %s: %w", subject, err)
	}

	slog.Debug("published config changed event",
		"device_id", event.DeviceID,
		"tenant_id", event.TenantID,
		"old_timestamp", event.OldTimestamp,
		"new_timestamp", event.NewTimestamp,
		"subject", subject,
	)
	return nil
}
// PublishConfigSnapshot JSON-encodes event and sends it through JetStream on
// the subject "config.snapshot.create.{DeviceID}", where the Python backend
// stores the snapshot and computes diffs against the previous one.
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry with the snapshot hash is written and nil is returned.
func (p *Publisher) PublishConfigSnapshot(ctx context.Context, event ConfigSnapshotEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling config snapshot event: %w", err)
	}

	subj := fmt.Sprintf("config.snapshot.create.%s", event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published config snapshot event",
		"device_id", event.DeviceID,
		"tenant_id", event.TenantID,
		"sha256_hash", event.SHA256Hash,
		"subject", subj,
	)
	return nil
}
// PublishPushRollback announces that a device went offline after a template
// or restore config push, which triggers automatic rollback. The event is
// JSON-encoded and sent through JetStream on the subject
// "config.push.rollback.{TenantID}.{DeviceID}".
//
// A marshalling or publish failure is returned wrapped; on success an
// info-level log entry is written and nil is returned.
func (p *Publisher) PublishPushRollback(ctx context.Context, event PushRollbackEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshal push rollback event: %w", err)
	}

	subj := fmt.Sprintf("config.push.rollback.%s.%s", event.TenantID, event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Info("published push rollback event",
		"device_id", event.DeviceID,
		"tenant_id", event.TenantID,
		"push_operation_id", event.PushOperationID,
		"subject", subj,
	)
	return nil
}
// PublishPushAlert announces that a device went offline after an editor
// config push, which enables one-click rollback in the UI. The event is
// JSON-encoded and sent through JetStream on the subject
// "config.push.alert.{TenantID}.{DeviceID}".
//
// A marshalling or publish failure is returned wrapped; on success an
// info-level log entry is written and nil is returned.
func (p *Publisher) PublishPushAlert(ctx context.Context, event PushAlertEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshal push alert event: %w", err)
	}

	subj := fmt.Sprintf("config.push.alert.%s.%s", event.TenantID, event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Info("published push alert event",
		"device_id", event.DeviceID,
		"tenant_id", event.TenantID,
		"push_type", event.PushType,
		"subject", subj,
	)
	return nil
}
// WirelessRegistrationEvent is the JSON payload published to the
// WIRELESS_REGISTRATIONS NATS stream once per-client wireless registration
// data and RF monitor stats have been collected from a device. It is kept
// apart from DEVICE_EVENTS so retention (30 days vs 24 hours) and consumer
// scaling can be managed independently.
type WirelessRegistrationEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`

	// CollectedAt is an RFC3339 timestamp.
	CollectedAt string `json:"collected_at"`

	// Registrations is always serialized; RFStats is omitted when empty.
	Registrations []device.RegistrationEntry `json:"registrations"`
	RFStats       []device.RFMonitorStats    `json:"rf_stats,omitempty"`
}
// SessionEndEvent is the JSON payload published to NATS JetStream when an SSH
// relay session ends. The backend listens on audit.session.end.> and records
// an audit log entry that includes the session duration.
type SessionEndEvent struct {
	SessionID string `json:"session_id"`
	UserID    string `json:"user_id"`
	TenantID  string `json:"tenant_id"`
	DeviceID  string `json:"device_id"`

	// StartTime and EndTime are RFC3339 timestamps.
	StartTime string `json:"start_time"`
	EndTime   string `json:"end_time"`

	SourceIP string `json:"source_ip"`

	// Reason is "normal", "idle_timeout", or "shutdown".
	Reason string `json:"reason"`
}
// PublishSessionEnd JSON-encodes an SSH session end event and sends it
// through JetStream on the subject "audit.session.end.{SessionID}".
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry is written and nil is returned.
func (p *Publisher) PublishSessionEnd(ctx context.Context, event SessionEndEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling session end event: %w", err)
	}

	subj := fmt.Sprintf("audit.session.end.%s", event.SessionID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published session end event",
		"session_id", event.SessionID,
		"device_id", event.DeviceID,
		"subject", subj,
	)
	return nil
}
// SNMPMetricsEvent is the JSON payload published to NATS JetStream when
// custom SNMP metrics have been collected from a device. The backend listens
// on "device.metrics.snmp_custom.>" and ingests the entries into the
// snmp_metrics hypertable.
type SNMPMetricsEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`

	// CollectedAt is an RFC3339 timestamp.
	CollectedAt string `json:"collected_at"`

	// Type is always "snmp_custom".
	Type string `json:"type"`

	// Metrics is always serialized (the tag carries no omitempty).
	Metrics []SNMPMetricEntry `json:"metrics"`
}
// SNMPMetricEntry describes one metric inside an SNMPMetricsEvent. The
// numeric and text values are mutually exclusive, and IndexValue is set for
// table metrics.
type SNMPMetricEntry struct {
	MetricName  string `json:"metric_name"`
	MetricGroup string `json:"metric_group"`
	OID         string `json:"oid"`

	// Optional values; a nil pointer is left out of the JSON.
	ValueNum   *float64 `json:"value_numeric,omitempty"`
	ValueText  *string  `json:"value_text,omitempty"`
	IndexValue *string  `json:"index_value,omitempty"`
}
// PublishSNMPMetrics JSON-encodes event and sends it through JetStream on the
// subject "device.metrics.snmp_custom.{device_id}", where the Python
// metrics_subscriber ingests it into the snmp_metrics hypertable.
//
// A marshalling or publish failure is returned wrapped; on success a debug
// log entry with the metric count is written and nil is returned.
func (p *Publisher) PublishSNMPMetrics(ctx context.Context, event SNMPMetricsEvent) error {
	payload, err := json.Marshal(event)
	if err != nil {
		return fmt.Errorf("marshalling SNMP metrics event: %w", err)
	}

	subj := fmt.Sprintf("device.metrics.snmp_custom.%s", event.DeviceID)
	if _, pubErr := p.js.Publish(ctx, subj, payload); pubErr != nil {
		return fmt.Errorf("publishing to %s: %w", subj, pubErr)
	}

	slog.Debug("published SNMP metrics event",
		"device_id", event.DeviceID,
		"metrics_count", len(event.Metrics),
		"subject", subj,
	)
	return nil
}
// Conn exposes the underlying NATS connection so that other components
// (for example CmdResponder, for its request-reply subscriptions) can share
// it. The returned pointer is the same one Close drains.
func (p *Publisher) Conn() *nats.Conn {
	return p.nc
}
// Close drains the underlying NATS connection so pending messages are flushed
// before it closes. It is a no-op when the Publisher holds no connection, and
// a drain failure is logged as a warning rather than returned.
func (p *Publisher) Close() {
	if p.nc == nil {
		return
	}
	if drainErr := p.nc.Drain(); drainErr != nil {
		slog.Warn("error draining NATS connection", "error", drainErr)
	}
}