feat: The Other Dude v9.0.1 — full-featured email system
ci: add GitHub Pages deployment workflow for docs site Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
182
poller/internal/bus/cmd_cert_deploy.go
Normal file
182
poller/internal/bus/cmd_cert_deploy.go
Normal file
@@ -0,0 +1,182 @@
|
||||
// Package bus provides a NATS request-reply handler for certificate deployment.
|
||||
//
|
||||
// cmd_cert_deploy.go handles cert.deploy.{device_id} subjects. The Python backend
|
||||
// sends signed certificate PEM data via NATS, and this handler:
|
||||
// 1. Looks up the device and decrypts credentials
|
||||
// 2. Establishes SSH/SFTP + RouterOS API connections
|
||||
// 3. Calls device.DeployCert for the full deployment flow
|
||||
// 4. Returns the result via NATS reply
|
||||
package bus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
// CertDeployResponder handles NATS request-reply for certificate deployment.
// It is created with NewCertDeployResponder, begins serving requests after
// Start, and is shut down with Stop.
type CertDeployResponder struct {
	nc              *nats.Conn             // connection the queue subscription is created on
	store           *store.DeviceStore     // used to look up the target device by ID
	credentialCache *vault.CredentialCache // supplies the device username/password
	sub             *nats.Subscription     // set by Start; nil before that
}
|
||||
|
||||
// NewCertDeployResponder creates a certificate deployment responder using the
|
||||
// given NATS connection, device store, and credential cache.
|
||||
func NewCertDeployResponder(nc *nats.Conn, store *store.DeviceStore, credentialCache *vault.CredentialCache) *CertDeployResponder {
|
||||
return &CertDeployResponder{nc: nc, store: store, credentialCache: credentialCache}
|
||||
}
|
||||
|
||||
// Start subscribes to "cert.deploy.*" with a queue group for load balancing
|
||||
// across multiple poller instances.
|
||||
func (r *CertDeployResponder) Start() error {
|
||||
sub, err := r.nc.QueueSubscribe("cert.deploy.*", "cert-deploy-workers", r.handleRequest)
|
||||
if err != nil {
|
||||
return fmt.Errorf("subscribing to cert.deploy.*: %w", err)
|
||||
}
|
||||
r.sub = sub
|
||||
slog.Info("cert deploy responder subscribed", "subject", "cert.deploy.*", "queue", "cert-deploy-workers")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop unsubscribes from NATS.
|
||||
func (r *CertDeployResponder) Stop() {
|
||||
if r.sub != nil {
|
||||
if err := r.sub.Unsubscribe(); err != nil {
|
||||
slog.Warn("error unsubscribing cert deploy responder", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleRequest processes a single certificate deployment request.
|
||||
func (r *CertDeployResponder) handleRequest(msg *nats.Msg) {
|
||||
// Extract device ID from subject: cert.deploy.{device_id}
|
||||
parts := strings.Split(msg.Subject, ".")
|
||||
if len(parts) < 3 {
|
||||
r.respondError(msg, "invalid subject format")
|
||||
return
|
||||
}
|
||||
deviceID := parts[2]
|
||||
|
||||
// Parse cert deploy request
|
||||
var req device.CertDeployRequest
|
||||
if err := json.Unmarshal(msg.Data, &req); err != nil {
|
||||
r.respondError(msg, fmt.Sprintf("invalid request JSON: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
slog.Info("cert deploy request received",
|
||||
"device_id", deviceID,
|
||||
"cert_name", req.CertName,
|
||||
"ssh_port", req.SSHPort,
|
||||
)
|
||||
|
||||
// Default SSH port if not specified
|
||||
if req.SSHPort == 0 {
|
||||
req.SSHPort = 22
|
||||
}
|
||||
|
||||
// Look up device from DB
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
dev, err := r.store.GetDevice(ctx, deviceID)
|
||||
if err != nil {
|
||||
slog.Warn("device lookup failed for cert deploy", "device_id", deviceID, "error", err)
|
||||
r.respondError(msg, fmt.Sprintf("device not found: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
// Decrypt device credentials via credential cache (Transit preferred, legacy fallback)
|
||||
username, password, err := r.credentialCache.GetCredentials(
|
||||
dev.ID,
|
||||
dev.TenantID,
|
||||
dev.EncryptedCredentialsTransit,
|
||||
dev.EncryptedCredentials,
|
||||
)
|
||||
if err != nil {
|
||||
r.respondError(msg, fmt.Sprintf("credential decryption failed: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
// Create SSH client for SFTP upload
|
||||
sshClient, err := device.NewSSHClient(dev.IPAddress, req.SSHPort, username, password, 30*time.Second)
|
||||
if err != nil {
|
||||
slog.Warn("SSH connection failed for cert deploy",
|
||||
"device_id", deviceID,
|
||||
"ip", dev.IPAddress,
|
||||
"ssh_port", req.SSHPort,
|
||||
"error", err,
|
||||
)
|
||||
r.respondError(msg, fmt.Sprintf("SSH connection failed: %s", err))
|
||||
return
|
||||
}
|
||||
defer sshClient.Close()
|
||||
|
||||
// Create RouterOS API client for certificate import commands.
|
||||
// Uses the existing ConnectDevice which tries TLS then falls back to plain.
|
||||
// Pass nil for caCertPEM -- we're deploying the cert, so the device doesn't
|
||||
// have a portal-signed cert yet. Plan 03 wires per-device CA cert loading.
|
||||
apiClient, err := device.ConnectDevice(
|
||||
dev.IPAddress,
|
||||
dev.APISSLPort,
|
||||
dev.APIPort,
|
||||
username,
|
||||
password,
|
||||
10*time.Second,
|
||||
nil, // caCertPEM: device has no portal cert yet during deployment
|
||||
dev.TLSMode,
|
||||
)
|
||||
if err != nil {
|
||||
slog.Warn("API connection failed for cert deploy",
|
||||
"device_id", deviceID,
|
||||
"ip", dev.IPAddress,
|
||||
"error", err,
|
||||
)
|
||||
r.respondError(msg, fmt.Sprintf("device API connection failed: %s", err))
|
||||
return
|
||||
}
|
||||
defer device.CloseDevice(apiClient)
|
||||
|
||||
// Execute the full deployment flow
|
||||
resp := device.DeployCert(sshClient, apiClient, req)
|
||||
|
||||
slog.Info("cert deploy completed",
|
||||
"device_id", deviceID,
|
||||
"success", resp.Success,
|
||||
"cert_name_on_device", resp.CertNameOnDevice,
|
||||
)
|
||||
|
||||
// Respond with result
|
||||
data, err := json.Marshal(resp)
|
||||
if err != nil {
|
||||
r.respondError(msg, fmt.Sprintf("failed to marshal response: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
if err := msg.Respond(data); err != nil {
|
||||
slog.Error("failed to respond to cert deploy request", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// respondError sends an error response to a NATS cert deploy request.
|
||||
func (r *CertDeployResponder) respondError(msg *nats.Msg, errMsg string) {
|
||||
resp := device.CertDeployResponse{
|
||||
Success: false,
|
||||
Error: errMsg,
|
||||
}
|
||||
data, _ := json.Marshal(resp)
|
||||
if err := msg.Respond(data); err != nil {
|
||||
slog.Error("failed to respond with cert deploy error", "error", err)
|
||||
}
|
||||
}
|
||||
166
poller/internal/bus/cmd_responder.go
Normal file
166
poller/internal/bus/cmd_responder.go
Normal file
@@ -0,0 +1,166 @@
|
||||
// Package bus provides NATS messaging for the poller service.
|
||||
//
|
||||
// cmd_responder.go implements a NATS request-reply handler for interactive
|
||||
// RouterOS device commands. The Python backend sends command requests to
|
||||
// "device.cmd.{device_id}" and receives structured responses.
|
||||
|
||||
package bus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
// CmdResponder handles NATS request-reply for device commands.
// It is created with NewCmdResponder, begins serving requests after Start,
// and is shut down with Stop.
type CmdResponder struct {
	nc              *nats.Conn             // connection the queue subscription is created on
	store           *store.DeviceStore     // used to look up the target device by ID
	credentialCache *vault.CredentialCache // supplies the device username/password
	sub             *nats.Subscription     // set by Start; nil before that
}
|
||||
|
||||
// NewCmdResponder creates a command responder using the given NATS connection,
|
||||
// device store, and credential cache.
|
||||
func NewCmdResponder(nc *nats.Conn, store *store.DeviceStore, credentialCache *vault.CredentialCache) *CmdResponder {
|
||||
return &CmdResponder{nc: nc, store: store, credentialCache: credentialCache}
|
||||
}
|
||||
|
||||
// Start subscribes to "device.cmd.*" with a queue group for load balancing
|
||||
// across multiple poller instances.
|
||||
func (r *CmdResponder) Start() error {
|
||||
sub, err := r.nc.QueueSubscribe("device.cmd.*", "cmd-workers", r.handleRequest)
|
||||
if err != nil {
|
||||
return fmt.Errorf("subscribing to device.cmd.*: %w", err)
|
||||
}
|
||||
r.sub = sub
|
||||
slog.Info("command responder subscribed", "subject", "device.cmd.*", "queue", "cmd-workers")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop unsubscribes from NATS.
|
||||
func (r *CmdResponder) Stop() {
|
||||
if r.sub != nil {
|
||||
if err := r.sub.Unsubscribe(); err != nil {
|
||||
slog.Warn("error unsubscribing command responder", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleRequest processes a single device command request.
|
||||
func (r *CmdResponder) handleRequest(msg *nats.Msg) {
|
||||
// Extract device ID from subject: device.cmd.{device_id}
|
||||
parts := strings.Split(msg.Subject, ".")
|
||||
if len(parts) < 3 {
|
||||
r.respondError(msg, "invalid subject format")
|
||||
return
|
||||
}
|
||||
deviceID := parts[2]
|
||||
|
||||
// Parse command request
|
||||
var req device.CommandRequest
|
||||
if err := json.Unmarshal(msg.Data, &req); err != nil {
|
||||
r.respondError(msg, fmt.Sprintf("invalid request JSON: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
slog.Debug("command request received",
|
||||
"device_id", deviceID,
|
||||
"command", req.Command,
|
||||
"args_count", len(req.Args),
|
||||
)
|
||||
|
||||
// Look up device from DB
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
dev, err := r.store.GetDevice(ctx, deviceID)
|
||||
if err != nil {
|
||||
slog.Warn("device lookup failed for command", "device_id", deviceID, "error", err)
|
||||
r.respondError(msg, fmt.Sprintf("device not found: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
// Decrypt credentials via credential cache (Transit preferred, legacy fallback)
|
||||
username, password, err := r.credentialCache.GetCredentials(
|
||||
dev.ID,
|
||||
dev.TenantID,
|
||||
dev.EncryptedCredentialsTransit,
|
||||
dev.EncryptedCredentials,
|
||||
)
|
||||
if err != nil {
|
||||
r.respondError(msg, fmt.Sprintf("credential decryption failed: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
// Prepare CA cert PEM for TLS verification (only populated for portal_ca devices).
|
||||
var caCertPEM []byte
|
||||
if dev.CACertPEM != nil {
|
||||
caCertPEM = []byte(*dev.CACertPEM)
|
||||
}
|
||||
|
||||
// Connect to device with 10-second timeout
|
||||
client, err := device.ConnectDevice(
|
||||
dev.IPAddress,
|
||||
dev.APISSLPort,
|
||||
dev.APIPort,
|
||||
username,
|
||||
password,
|
||||
10*time.Second,
|
||||
caCertPEM,
|
||||
dev.TLSMode,
|
||||
)
|
||||
if err != nil {
|
||||
slog.Info("device connection failed for command",
|
||||
"device_id", deviceID,
|
||||
"ip", dev.IPAddress,
|
||||
"error", err,
|
||||
)
|
||||
r.respondError(msg, fmt.Sprintf("device connection failed: %s", err))
|
||||
return
|
||||
}
|
||||
defer device.CloseDevice(client)
|
||||
|
||||
// Execute the command
|
||||
resp := device.ExecuteCommand(client, req.Command, req.Args)
|
||||
|
||||
slog.Debug("command executed",
|
||||
"device_id", deviceID,
|
||||
"command", req.Command,
|
||||
"success", resp.Success,
|
||||
"result_count", len(resp.Data),
|
||||
)
|
||||
|
||||
// Respond
|
||||
data, err := json.Marshal(resp)
|
||||
if err != nil {
|
||||
r.respondError(msg, fmt.Sprintf("failed to marshal response: %s", err))
|
||||
return
|
||||
}
|
||||
|
||||
if err := msg.Respond(data); err != nil {
|
||||
slog.Error("failed to respond to command request", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// respondError sends an error response to a NATS request.
|
||||
func (r *CmdResponder) respondError(msg *nats.Msg, errMsg string) {
|
||||
resp := device.CommandResponse{
|
||||
Success: false,
|
||||
Data: nil,
|
||||
Error: errMsg,
|
||||
}
|
||||
data, _ := json.Marshal(resp)
|
||||
if err := msg.Respond(data); err != nil {
|
||||
slog.Error("failed to respond with error", "error", err)
|
||||
}
|
||||
}
|
||||
75
poller/internal/bus/credential_subscriber.go
Normal file
75
poller/internal/bus/credential_subscriber.go
Normal file
@@ -0,0 +1,75 @@
|
||||
// Package bus provides NATS messaging for the poller service.
|
||||
//
|
||||
// credential_subscriber.go subscribes to device.credential_changed.> events
|
||||
// and invalidates the credential cache so the poller uses fresh credentials
|
||||
// on the next poll cycle instead of waiting for the 5-minute cache TTL.
|
||||
package bus
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
// CredentialSubscriber listens for credential change events and invalidates
// the affected device's entry in the credential cache, so that new
// credentials are picked up shortly after a change rather than after the
// cache's 5-minute TTL.
type CredentialSubscriber struct {
	nc              *nats.Conn             // connection the subscription is created on
	credentialCache *vault.CredentialCache // cache whose entries are invalidated
	sub             *nats.Subscription     // set by Start; nil before that
}
|
||||
|
||||
// NewCredentialSubscriber creates a subscriber that invalidates cached
|
||||
// credentials when the backend publishes credential_changed events.
|
||||
func NewCredentialSubscriber(nc *nats.Conn, credentialCache *vault.CredentialCache) *CredentialSubscriber {
|
||||
return &CredentialSubscriber{nc: nc, credentialCache: credentialCache}
|
||||
}
|
||||
|
||||
// Start subscribes to "device.credential_changed.>" with a queue group
|
||||
// so only one poller instance processes each event.
|
||||
func (s *CredentialSubscriber) Start() error {
|
||||
sub, err := s.nc.QueueSubscribe("device.credential_changed.>", "credential-invalidators", s.handleEvent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.sub = sub
|
||||
slog.Info("credential subscriber started", "subject", "device.credential_changed.>", "queue", "credential-invalidators")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop unsubscribes from NATS.
|
||||
func (s *CredentialSubscriber) Stop() {
|
||||
if s.sub != nil {
|
||||
if err := s.sub.Unsubscribe(); err != nil {
|
||||
slog.Warn("error unsubscribing credential subscriber", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleEvent processes a credential_changed event by invalidating the
|
||||
// device's entry in the credential cache.
|
||||
func (s *CredentialSubscriber) handleEvent(msg *nats.Msg) {
|
||||
var event struct {
|
||||
DeviceID string `json:"device_id"`
|
||||
TenantID string `json:"tenant_id"`
|
||||
}
|
||||
if err := json.Unmarshal(msg.Data, &event); err != nil {
|
||||
slog.Warn("failed to unmarshal credential_changed event", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
if event.DeviceID == "" {
|
||||
slog.Warn("credential_changed event missing device_id")
|
||||
return
|
||||
}
|
||||
|
||||
s.credentialCache.Invalidate(event.DeviceID)
|
||||
slog.Info("credential cache invalidated",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
)
|
||||
}
|
||||
322
poller/internal/bus/publisher.go
Normal file
322
poller/internal/bus/publisher.go
Normal file
@@ -0,0 +1,322 @@
|
||||
// Package bus provides NATS JetStream publishing for device events.
|
||||
package bus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
)
|
||||
|
||||
// DeviceStatusEvent is the payload published to NATS JetStream when a device
// is polled. PublishStatus sends it on "device.status.{DeviceID}"; consumers
// subscribe to "device.status.>" to receive all events.
//
// DeviceID, TenantID, Status, and LastSeen are always present in the JSON;
// every other field is omitted when it holds its zero value.
type DeviceStatusEvent struct {
	DeviceID        string `json:"device_id"`
	TenantID        string `json:"tenant_id"`
	Status          string `json:"status"` // "online" or "offline"
	RouterOSVersion string `json:"routeros_version,omitempty"`
	MajorVersion    int    `json:"major_version,omitempty"`
	BoardName       string `json:"board_name,omitempty"`
	Architecture    string `json:"architecture,omitempty"`
	Uptime          string `json:"uptime,omitempty"`
	CPULoad         string `json:"cpu_load,omitempty"`
	FreeMemory      string `json:"free_memory,omitempty"`
	TotalMemory     string `json:"total_memory,omitempty"`
	SerialNumber    string `json:"serial_number,omitempty"`
	FirmwareVersion string `json:"firmware_version,omitempty"`
	LastSeen        string `json:"last_seen"` // RFC3339
}
|
||||
|
||||
// DeviceMetricsEvent is the payload published to NATS JetStream for metric
// data collected from a RouterOS device on each poll cycle.
//
// PublishMetrics sends it on "device.metrics.{Type}.{DeviceID}", where Type is
// one of "health", "interfaces", or "wireless". Only the field matching the
// type is expected to be populated; unset metric fields are omitted from the
// JSON payload.
type DeviceMetricsEvent struct {
	DeviceID    string                  `json:"device_id"`
	TenantID    string                  `json:"tenant_id"`
	CollectedAt string                  `json:"collected_at"` // RFC3339
	Type        string                  `json:"type"`         // "health", "interfaces", "wireless"
	Health      *device.HealthMetrics   `json:"health,omitempty"`
	Interfaces  []device.InterfaceStats `json:"interfaces,omitempty"`
	Wireless    []device.WirelessStats  `json:"wireless,omitempty"`
}
|
||||
|
||||
// ConfigChangedEvent is published when a device's config changes out-of-band.
// PublishConfigChanged sends it on "config.changed.{TenantID}.{DeviceID}".
type ConfigChangedEvent struct {
	DeviceID     string `json:"device_id"`
	TenantID     string `json:"tenant_id"`
	OldTimestamp string `json:"old_timestamp"` // timestamp before the change (format not shown here)
	NewTimestamp string `json:"new_timestamp"` // timestamp after the change (format not shown here)
}
|
||||
|
||||
// PushRollbackEvent triggers automatic rollback for template pushes.
// PublishPushRollback sends it on
// "config.push.rollback.{TenantID}.{DeviceID}".
type PushRollbackEvent struct {
	DeviceID         string `json:"device_id"`
	TenantID         string `json:"tenant_id"`
	PushOperationID  string `json:"push_operation_id"`   // identifies the push being rolled back
	PrePushCommitSHA string `json:"pre_push_commit_sha"` // presumably the config commit to restore — confirm with the consumer
}
|
||||
|
||||
// PushAlertEvent triggers an alert for editor pushes (one-click rollback).
// PublishPushAlert sends it on "config.push.alert.{TenantID}.{DeviceID}".
type PushAlertEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`
	PushType string `json:"push_type"` // kind of push that raised the alert; valid values are not shown here
}
|
||||
|
||||
// Publisher wraps a NATS JetStream connection for publishing device events.
// It is created by NewPublisher and released with Close.
type Publisher struct {
	nc *nats.Conn          // underlying connection; exposed through Conn and drained by Close
	js jetstream.JetStream // JetStream context used by the Publish* methods
}
|
||||
|
||||
// NewPublisher connects to NATS and ensures the DEVICE_EVENTS stream exists.
|
||||
//
|
||||
// The DEVICE_EVENTS stream covers device.status.>, device.metrics.>, and
|
||||
// device.firmware.> subjects. These are explicit to avoid capturing
|
||||
// device.cmd.* (used by CmdResponder for request-reply). This allows
|
||||
// the Python API to subscribe to either family via durable consumers.
|
||||
//
|
||||
// The connection uses unlimited reconnects with a 2-second wait between attempts
|
||||
// so the poller survives transient NATS restarts gracefully.
|
||||
func NewPublisher(natsURL string) (*Publisher, error) {
|
||||
nc, err := nats.Connect(natsURL,
|
||||
nats.MaxReconnects(-1),
|
||||
nats.ReconnectWait(2*time.Second),
|
||||
nats.DisconnectErrHandler(func(nc *nats.Conn, err error) {
|
||||
slog.Warn("NATS disconnected", "error", err)
|
||||
}),
|
||||
nats.ReconnectHandler(func(nc *nats.Conn) {
|
||||
slog.Info("NATS reconnected", "url", nc.ConnectedUrl())
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("connecting to NATS at %s: %w", natsURL, err)
|
||||
}
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return nil, fmt.Errorf("creating JetStream context: %w", err)
|
||||
}
|
||||
|
||||
// Ensure the DEVICE_EVENTS stream exists. CreateOrUpdateStream is idempotent.
|
||||
// Subjects are explicit (not "device.>") to avoid capturing device.cmd.*
|
||||
// which is used by CmdResponder for core NATS request-reply.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
_, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{
|
||||
Name: "DEVICE_EVENTS",
|
||||
Subjects: []string{
|
||||
"device.status.>",
|
||||
"device.metrics.>",
|
||||
"device.firmware.>",
|
||||
"device.credential_changed.>",
|
||||
"config.changed.>",
|
||||
"config.push.rollback.>",
|
||||
"config.push.alert.>",
|
||||
},
|
||||
MaxAge: 24 * time.Hour,
|
||||
})
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return nil, fmt.Errorf("ensuring DEVICE_EVENTS stream: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("NATS JetStream DEVICE_EVENTS stream ready")
|
||||
|
||||
return &Publisher{nc: nc, js: js}, nil
|
||||
}
|
||||
|
||||
// PublishStatus publishes a device status event to NATS JetStream.
|
||||
//
|
||||
// Events are published to "device.status.{DeviceID}" so consumers can subscribe
|
||||
// to individual devices or all events via "device.status.>".
|
||||
func (p *Publisher) PublishStatus(ctx context.Context, event DeviceStatusEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshalling event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("device.status.%s", event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Debug("published device status event",
|
||||
"device_id", event.DeviceID,
|
||||
"status", event.Status,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishMetrics publishes a device metrics event to NATS JetStream.
|
||||
//
|
||||
// Events are published to "device.metrics.{type}.{device_id}" so consumers can
|
||||
// subscribe to all metrics via "device.metrics.>" or filter by type.
|
||||
func (p *Publisher) PublishMetrics(ctx context.Context, event DeviceMetricsEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshalling metrics event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("device.metrics.%s.%s", event.Type, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Debug("published device metrics event",
|
||||
"device_id", event.DeviceID,
|
||||
"type", event.Type,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeviceFirmwareEvent is the payload published to NATS JetStream when the
// poller checks a device's firmware update status (rate-limited to once per
// day per device). PublishFirmware sends it on "device.firmware.{DeviceID}".
//
// LatestVersion and Channel are omitted from the JSON when empty.
type DeviceFirmwareEvent struct {
	DeviceID         string `json:"device_id"`
	TenantID         string `json:"tenant_id"`
	InstalledVersion string `json:"installed_version"`
	LatestVersion    string `json:"latest_version,omitempty"`
	Channel          string `json:"channel,omitempty"`
	Status           string `json:"status"` // firmware update status; valid values are not shown here
	Architecture     string `json:"architecture"`
}
|
||||
|
||||
// PublishFirmware publishes a device firmware status event to NATS JetStream.
|
||||
//
|
||||
// Events are published to "device.firmware.{DeviceID}" so the Python firmware
|
||||
// subscriber can process them and update the firmware_versions table.
|
||||
func (p *Publisher) PublishFirmware(ctx context.Context, event DeviceFirmwareEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshalling firmware event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("device.firmware.%s", event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Debug("published device firmware event",
|
||||
"device_id", event.DeviceID,
|
||||
"installed", event.InstalledVersion,
|
||||
"latest", event.LatestVersion,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishConfigChanged publishes a config change event for a device.
|
||||
//
|
||||
// Events are published to "config.changed.{TenantID}.{DeviceID}" so the Python
|
||||
// backend can trigger event-driven backups when out-of-band changes are detected.
|
||||
func (p *Publisher) PublishConfigChanged(ctx context.Context, event ConfigChangedEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal config changed event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("config.changed.%s.%s", event.TenantID, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publish config changed: %w", err)
|
||||
}
|
||||
|
||||
slog.Debug("published config changed event",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
"old_timestamp", event.OldTimestamp,
|
||||
"new_timestamp", event.NewTimestamp,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishPushRollback publishes a push rollback event when a device goes offline
|
||||
// after a template or restore config push, triggering automatic rollback.
|
||||
func (p *Publisher) PublishPushRollback(ctx context.Context, event PushRollbackEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal push rollback event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("config.push.rollback.%s.%s", event.TenantID, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Info("published push rollback event",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
"push_operation_id", event.PushOperationID,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishPushAlert publishes a push alert event when a device goes offline
|
||||
// after an editor config push, enabling one-click rollback in the UI.
|
||||
func (p *Publisher) PublishPushAlert(ctx context.Context, event PushAlertEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal push alert event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("config.push.alert.%s.%s", event.TenantID, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Info("published push alert event",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
"push_type", event.PushType,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Conn returns the raw NATS connection held by the Publisher, for use by
// other components (e.g., CmdResponder for request-reply subscriptions).
// The same connection is drained by Close.
func (p *Publisher) Conn() *nats.Conn {
	return p.nc
}
|
||||
|
||||
// Close drains the NATS connection, flushing pending messages before closing.
|
||||
func (p *Publisher) Close() {
|
||||
if p.nc != nil {
|
||||
if err := p.nc.Drain(); err != nil {
|
||||
slog.Warn("error draining NATS connection", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
232
poller/internal/bus/publisher_integration_test.go
Normal file
232
poller/internal/bus/publisher_integration_test.go
Normal file
@@ -0,0 +1,232 @@
|
||||
package bus_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/bus"
|
||||
"github.com/mikrotik-portal/poller/internal/testutil"
|
||||
)
|
||||
|
||||
func TestPublisher_PublishStatus_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
// Create a direct NATS consumer to receive messages.
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Create a consumer on the DEVICE_EVENTS stream.
|
||||
cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
|
||||
FilterSubject: "device.status.>",
|
||||
AckPolicy: jetstream.AckNonePolicy,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Publish a status event.
|
||||
event := bus.DeviceStatusEvent{
|
||||
DeviceID: "dev-abc-123",
|
||||
TenantID: "tenant-xyz",
|
||||
Status: "online",
|
||||
LastSeen: time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
err = pub.PublishStatus(ctx, event)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Consume the message with timeout.
|
||||
msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
|
||||
require.NoError(t, err)
|
||||
|
||||
var received *jetstream.Msg
|
||||
for msg := range msgBatch.Messages() {
|
||||
received = &msg
|
||||
break
|
||||
}
|
||||
|
||||
require.NotNil(t, received, "should receive a message within 5 seconds")
|
||||
|
||||
var got bus.DeviceStatusEvent
|
||||
err = json.Unmarshal((*received).Data(), &got)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, event.DeviceID, got.DeviceID)
|
||||
assert.Equal(t, event.TenantID, got.TenantID)
|
||||
assert.Equal(t, event.Status, got.Status)
|
||||
}
|
||||
|
||||
func TestPublisher_PublishMetrics_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Create a consumer filtering on metrics subjects.
|
||||
cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
|
||||
FilterSubject: "device.metrics.>",
|
||||
AckPolicy: jetstream.AckNonePolicy,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Publish a metrics event.
|
||||
event := bus.DeviceMetricsEvent{
|
||||
DeviceID: "dev-metrics-456",
|
||||
TenantID: "tenant-xyz",
|
||||
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
||||
Type: "health",
|
||||
}
|
||||
err = pub.PublishMetrics(ctx, event)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Consume the message.
|
||||
msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
|
||||
require.NoError(t, err)
|
||||
|
||||
var received *jetstream.Msg
|
||||
for msg := range msgBatch.Messages() {
|
||||
received = &msg
|
||||
break
|
||||
}
|
||||
|
||||
require.NotNil(t, received, "should receive metrics message within 5 seconds")
|
||||
|
||||
// Verify the subject includes the type and device_id.
|
||||
assert.Equal(t, "device.metrics.health.dev-metrics-456", (*received).Subject())
|
||||
|
||||
var got bus.DeviceMetricsEvent
|
||||
err = json.Unmarshal((*received).Data(), &got)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, event.DeviceID, got.DeviceID)
|
||||
assert.Equal(t, event.TenantID, got.TenantID)
|
||||
assert.Equal(t, event.Type, got.Type)
|
||||
}
|
||||
|
||||
func TestPublisher_PublishFirmware_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
|
||||
FilterSubject: "device.firmware.>",
|
||||
AckPolicy: jetstream.AckNonePolicy,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
event := bus.DeviceFirmwareEvent{
|
||||
DeviceID: "dev-fw-789",
|
||||
TenantID: "tenant-xyz",
|
||||
InstalledVersion: "7.15",
|
||||
LatestVersion: "7.16",
|
||||
Channel: "stable",
|
||||
Status: "update_available",
|
||||
Architecture: "arm64",
|
||||
}
|
||||
err = pub.PublishFirmware(ctx, event)
|
||||
require.NoError(t, err)
|
||||
|
||||
msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
|
||||
require.NoError(t, err)
|
||||
|
||||
var received *jetstream.Msg
|
||||
for msg := range msgBatch.Messages() {
|
||||
received = &msg
|
||||
break
|
||||
}
|
||||
|
||||
require.NotNil(t, received, "should receive firmware message within 5 seconds")
|
||||
assert.Equal(t, "device.firmware.dev-fw-789", (*received).Subject())
|
||||
|
||||
var got bus.DeviceFirmwareEvent
|
||||
err = json.Unmarshal((*received).Data(), &got)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, event.DeviceID, got.DeviceID)
|
||||
assert.Equal(t, event.InstalledVersion, got.InstalledVersion)
|
||||
assert.Equal(t, event.LatestVersion, got.LatestVersion)
|
||||
assert.Equal(t, event.Status, got.Status)
|
||||
}
|
||||
|
||||
func TestPublisher_NewPublisher_StreamCreation_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
// Verify the DEVICE_EVENTS stream was created with correct config.
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
stream, err := js.Stream(ctx, "DEVICE_EVENTS")
|
||||
require.NoError(t, err, "DEVICE_EVENTS stream should exist")
|
||||
|
||||
info, err := stream.Info(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "DEVICE_EVENTS", info.Config.Name)
|
||||
assert.Contains(t, info.Config.Subjects, "device.status.>",
|
||||
"stream should cover device.status.> subjects")
|
||||
assert.Contains(t, info.Config.Subjects, "device.metrics.>",
|
||||
"stream should cover device.metrics.> subjects")
|
||||
assert.Contains(t, info.Config.Subjects, "device.firmware.>",
|
||||
"stream should cover device.firmware.> subjects")
|
||||
}
|
||||
Reference in New Issue
Block a user