feat: The Other Dude v9.0.1 — full-featured email system

ci: add GitHub Pages deployment workflow for docs site

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jason Staack
2026-03-08 17:46:37 -05:00
commit b840047e19
511 changed files with 106948 additions and 0 deletions

View File

@@ -0,0 +1,195 @@
package poller_test
import (
"context"
"encoding/json"
"testing"
"time"
"github.com/bsm/redislock"
"github.com/nats-io/nats.go"
"github.com/nats-io/nats.go/jetstream"
goredis "github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/testutil"
)
// TestPollPublishConsumeCycle_Integration verifies the complete pipeline:
//
//  1. DeviceStore reads devices from real PostgreSQL
//  2. Publisher sends status events through real NATS JetStream
//  3. A NATS consumer receives the events with correct data
//  4. Redis distributed lock can be obtained and released
//
// The actual PollDevice function requires a real RouterOS device, so we test
// the integration seams individually and verify they compose correctly.
func TestPollPublishConsumeCycle_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}
	ctx := context.Background()
	tenantID := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
	dummyCreds := []byte("dummy-encrypted-credentials")

	// --- Phase 1: PostgreSQL + DeviceStore ---
	connStr, pgCleanup := testutil.SetupPostgres(t)
	defer pgCleanup()
	v7 := "7.16"
	major7 := 7
	deviceID := testutil.InsertTestDevice(t, connStr, store.Device{
		TenantID:             tenantID,
		IPAddress:            "10.0.0.1",
		APIPort:              8728,
		APISSLPort:           8729,
		EncryptedCredentials: dummyCreds,
		RouterOSVersion:      &v7,
		MajorVersion:         &major7,
	})
	ds, err := store.NewDeviceStore(ctx, connStr)
	require.NoError(t, err)
	defer ds.Close()
	devices, err := ds.FetchDevices(ctx)
	require.NoError(t, err)
	require.Len(t, devices, 1)
	assert.Equal(t, deviceID, devices[0].ID)
	assert.Equal(t, tenantID, devices[0].TenantID)

	// --- Phase 2: NATS + Publisher ---
	natsURL, natsCleanup := testutil.SetupNATS(t)
	defer natsCleanup()
	pub, err := bus.NewPublisher(natsURL)
	require.NoError(t, err)
	defer pub.Close()
	// Create a consumer to verify events.
	nc, err := nats.Connect(natsURL)
	require.NoError(t, err)
	defer nc.Close()
	js, err := jetstream.New(nc)
	require.NoError(t, err)
	cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
		FilterSubject: "device.status.>",
		AckPolicy:     jetstream.AckNonePolicy,
	})
	require.NoError(t, err)
	// Simulate what PollDevice does after connecting to a device:
	// publish a status event with data from the fetched device.
	dev := devices[0]
	statusEvent := bus.DeviceStatusEvent{
		DeviceID: dev.ID,
		TenantID: dev.TenantID,
		Status:   "online",
		LastSeen: time.Now().UTC().Format(time.RFC3339),
	}
	err = pub.PublishStatus(ctx, statusEvent)
	require.NoError(t, err)
	// Verify consumer receives the event.
	msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
	require.NoError(t, err)
	// jetstream.Msg is an interface — hold it directly instead of the
	// original *jetstream.Msg, which took the address of the loop variable
	// (pointer-to-interface anti-pattern) and forced (*received).Data().
	// A nil interface value still fails require.NotNil as intended.
	var received jetstream.Msg
	for msg := range msgBatch.Messages() {
		received = msg
		break
	}
	require.NotNil(t, received, "consumer should receive the status event")
	var got bus.DeviceStatusEvent
	err = json.Unmarshal(received.Data(), &got)
	require.NoError(t, err)
	assert.Equal(t, dev.ID, got.DeviceID)
	assert.Equal(t, dev.TenantID, got.TenantID)
	assert.Equal(t, "online", got.Status)

	// --- Phase 3: Redis distributed lock ---
	redisAddr, redisCleanup := testutil.SetupRedis(t)
	defer redisCleanup()
	rdb := goredis.NewClient(&goredis.Options{Addr: redisAddr})
	defer rdb.Close()
	locker := redislock.New(rdb)
	lockKey := "poll:device:" + dev.ID
	lock, err := locker.Obtain(ctx, lockKey, 10*time.Second, nil)
	require.NoError(t, err, "should obtain Redis distributed lock")
	// A second attempt should fail (lock held).
	_, err = locker.Obtain(ctx, lockKey, 10*time.Second, nil)
	assert.ErrorIs(t, err, redislock.ErrNotObtained, "second lock attempt should fail")
	// Release and re-obtain.
	err = lock.Release(ctx)
	require.NoError(t, err, "should release lock")
	lock2, err := locker.Obtain(ctx, lockKey, 10*time.Second, nil)
	require.NoError(t, err, "should re-obtain lock after release")
	// Best-effort: the test is done; a failed release only delays TTL expiry.
	_ = lock2.Release(ctx)
}
// TestSchedulerReconcile_WithRealDB_Integration verifies that the Scheduler's
// reconciliation loop correctly starts and stops device polling goroutines
// when backed by a real PostgreSQL database.
//
// We test this by running the Scheduler for a brief period and verifying it
// fetches devices and starts goroutines. Since PollDevice requires real
// RouterOS hardware, the goroutines will fail on the poll cycle (no device to
// connect to), but the scheduler's reconciliation logic is the integration
// point we are testing here.
func TestSchedulerReconcile_WithRealDB_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}
	ctx := context.Background()
	tenant := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
	creds := []byte("dummy-encrypted-credentials")
	connStr, cleanupPG := testutil.SetupPostgres(t)
	defer cleanupPG()
	// Seed the database with two devices via a small local helper.
	insert := func(ip string) string {
		return testutil.InsertTestDevice(t, connStr, store.Device{
			TenantID:             tenant,
			IPAddress:            ip,
			APIPort:              8728,
			APISSLPort:           8729,
			EncryptedCredentials: creds,
		})
	}
	firstID := insert("10.0.0.1")
	secondID := insert("10.0.0.2")
	deviceStore, err := store.NewDeviceStore(ctx, connStr)
	require.NoError(t, err)
	defer deviceStore.Close()
	// Verify DeviceStore returns both devices (integration seam check).
	fetched, err := deviceStore.FetchDevices(ctx)
	require.NoError(t, err)
	require.Len(t, fetched, 2)
	seen := make(map[string]bool, len(fetched))
	for _, d := range fetched {
		seen[d.ID] = true
	}
	assert.True(t, seen[firstID], "device 1 should be fetched from real DB")
	assert.True(t, seen[secondID], "device 2 should be fetched from real DB")
}

View File

@@ -0,0 +1,14 @@
package poller
import (
"context"
"github.com/mikrotik-portal/poller/internal/store"
)
// DeviceFetcher is the subset of store.DeviceStore that the Scheduler needs.
// Defined here (consumer-side) following Go interface best practices.
// The concrete *store.DeviceStore automatically satisfies this interface.
// Keeping it to a single method also makes it trivial to mock in unit tests.
type DeviceFetcher interface {
	// FetchDevices returns the devices that should currently be polled.
	FetchDevices(ctx context.Context) ([]store.Device, error)
}

View File

@@ -0,0 +1,264 @@
package poller
import (
"context"
"log/slog"
"sync"
"time"
"github.com/bsm/redislock"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
)
// deviceState tracks per-device circuit breaker and lifecycle state.
//
// After creation by reconcileDevices, the failure/backoff fields are mutated
// only by the device's own polling goroutine (runDeviceLoop), so they carry
// no lock of their own.
type deviceState struct {
	// cancel stops this device's polling goroutine; called by the scheduler
	// when the device disappears from the DB or on shutdown.
	cancel context.CancelFunc
	// consecutiveFailures counts poll cycles that failed in a row; reset to
	// zero on the first successful poll.
	consecutiveFailures int
	// backoffUntil is the instant until which poll ticks are skipped while
	// the circuit breaker is open; the zero value means no backoff.
	backoffUntil time.Time
}
// Scheduler manages the lifecycle of per-device polling goroutines.
//
// It periodically re-queries the database to discover new devices (starting goroutines)
// and detect removed devices (stopping goroutines). Each device has exactly one
// polling goroutine running at a time.
//
// Circuit breaker: after consecutive connection failures, a device enters exponential
// backoff. The device loop skips poll ticks during backoff. On successful poll, the
// circuit breaker resets and the device resumes normal polling.
type Scheduler struct {
	store           DeviceFetcher          // device inventory source (DB-backed, or a mock in tests)
	locker          *redislock.Client      // distributed lock to prevent duplicate polls across pods
	publisher       *bus.Publisher         // NATS publisher for status/metric events
	credentialCache *vault.CredentialCache // decrypts per-device credentials

	pollInterval  time.Duration // tick period of each device's poll loop
	connTimeout   time.Duration // connection timeout used by each poll
	cmdTimeout    time.Duration // per-command timeout for RouterOS API calls
	refreshPeriod time.Duration // how often Run re-reconciles against the DB

	// Circuit breaker configuration.
	maxFailures int           // consecutive failures before backoff starts
	baseBackoff time.Duration // first backoff step (doubles per further failure)
	maxBackoff  time.Duration // cap on any single backoff period

	// activeDevices maps device ID to per-device state.
	// mu guards activeDevices; it is taken by reconcileDevices and by the
	// shutdown path in Run.
	mu            sync.Mutex
	activeDevices map[string]*deviceState
}
// NewScheduler creates a Scheduler with the provided dependencies.
// The returned Scheduler does nothing until Run is called.
func NewScheduler(
	store DeviceFetcher,
	locker *redislock.Client,
	publisher *bus.Publisher,
	credentialCache *vault.CredentialCache,
	pollInterval time.Duration,
	connTimeout time.Duration,
	cmdTimeout time.Duration,
	refreshPeriod time.Duration,
	maxFailures int,
	baseBackoff time.Duration,
	maxBackoff time.Duration,
) *Scheduler {
	return &Scheduler{
		// Collaborators.
		store:           store,
		locker:          locker,
		publisher:       publisher,
		credentialCache: credentialCache,
		// Empty active set — populated by the first reconcile pass.
		activeDevices: make(map[string]*deviceState),
		// Timing knobs.
		pollInterval:  pollInterval,
		connTimeout:   connTimeout,
		cmdTimeout:    cmdTimeout,
		refreshPeriod: refreshPeriod,
		// Circuit breaker knobs.
		maxFailures: maxFailures,
		baseBackoff: baseBackoff,
		maxBackoff:  maxBackoff,
	}
}
// Run is the main scheduler loop. It:
//  1. Fetches devices from the database.
//  2. Starts goroutines for newly-discovered devices.
//  3. Stops goroutines for devices no longer in the database.
//  4. Sleeps for refreshPeriod, then repeats.
//  5. Cancels all goroutines when ctx is cancelled (graceful shutdown).
//
// Run blocks until ctx is cancelled, then waits for all goroutines to finish.
// It always returns nil; transient reconciliation errors are logged, not fatal.
func (s *Scheduler) Run(ctx context.Context) error {
	var wg sync.WaitGroup
	defer func() {
		// On shutdown, cancel all active device goroutines and wait for them.
		// wg.Wait is deliberately outside the lock so the wait cannot
		// deadlock against anything that might someday take s.mu.
		s.mu.Lock()
		for id, ds := range s.activeDevices {
			slog.Info("stopping device goroutine", "device_id", id)
			ds.cancel()
		}
		s.mu.Unlock()
		wg.Wait()
		slog.Info("scheduler shutdown complete")
	}()
	for {
		if err := s.reconcileDevices(ctx, &wg); err != nil {
			slog.Error("device reconciliation failed", "error", err)
			// Continue — a transient DB error should not crash the scheduler.
		}
		// Sleep-or-exit. Checking ctx here (after reconcile) guarantees at
		// least one reconciliation pass before shutdown can win the select.
		select {
		case <-ctx.Done():
			slog.Info("scheduler context cancelled — shutting down")
			return nil
		case <-time.After(s.refreshPeriod):
			// Next reconciliation cycle.
		}
	}
}
// reconcileDevices fetches the current device list from the DB and brings the
// set of running poll goroutines in line with it: devices present in the DB
// but not yet running are started; running devices no longer in the DB are
// cancelled and dropped. On a fetch error the active set is left untouched.
func (s *Scheduler) reconcileDevices(ctx context.Context, wg *sync.WaitGroup) error {
	devices, err := s.store.FetchDevices(ctx)
	if err != nil {
		return err
	}
	// Membership set of device IDs that should be running.
	wanted := make(map[string]struct{}, len(devices))
	for _, d := range devices {
		wanted[d.ID] = struct{}{}
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	// Start goroutines for devices we are not yet polling.
	for _, d := range devices {
		if _, running := s.activeDevices[d.ID]; running {
			continue
		}
		devCtx, stop := context.WithCancel(ctx)
		state := &deviceState{cancel: stop}
		s.activeDevices[d.ID] = state
		dev := d // goroutine gets its own copy (pre-Go-1.22 loop semantics)
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.runDeviceLoop(devCtx, dev, state)
		}()
		slog.Info("started polling goroutine", "device_id", dev.ID, "ip", dev.IPAddress)
	}
	// Cancel goroutines whose devices have disappeared from the database.
	for id, state := range s.activeDevices {
		if _, stillWanted := wanted[id]; stillWanted {
			continue
		}
		slog.Info("stopping goroutine for removed device", "device_id", id)
		state.cancel()
		delete(s.activeDevices, id)
	}
	// Update Prometheus gauge with current active device count.
	observability.DevicesActive.Set(float64(len(s.activeDevices)))
	slog.Debug("device reconciliation complete",
		"total_devices", len(devices),
		"active_goroutines", len(s.activeDevices),
	)
	return nil
}
// runDeviceLoop is the per-device polling loop. It ticks at pollInterval and
// calls PollDevice synchronously on each tick (not in a sub-goroutine, to avoid
// unbounded goroutine growth if polls are slow).
//
// Circuit breaker: when consecutive failures exceed maxFailures, the device enters
// exponential backoff. Poll ticks during backoff are skipped. On success, the
// circuit breaker resets.
//
// ds is mutated only by this goroutine after creation (reconcileDevices only
// sets cancel), so its fields are accessed here without locking.
func (s *Scheduler) runDeviceLoop(ctx context.Context, dev store.Device, ds *deviceState) {
	// lockTTL gives the poll cycle time to complete: interval + connection timeout + 15s margin.
	lockTTL := s.pollInterval + s.connTimeout + 15*time.Second
	ticker := time.NewTicker(s.pollInterval)
	defer ticker.Stop()
	slog.Debug("device poll loop started", "device_id", dev.ID, "poll_interval", s.pollInterval)
	for {
		select {
		case <-ctx.Done():
			slog.Debug("device poll loop stopping", "device_id", dev.ID)
			return
		case <-ticker.C:
			// Circuit breaker: skip poll if device is in backoff period.
			if time.Now().Before(ds.backoffUntil) {
				slog.Debug("circuit breaker: skipping poll (in backoff)",
					"device_id", dev.ID,
					"backoff_until", ds.backoffUntil.Format(time.RFC3339),
					"consecutive_failures", ds.consecutiveFailures,
				)
				observability.CircuitBreakerSkips.Inc()
				continue
			}
			err := PollDevice(ctx, dev, s.locker, s.publisher, s.credentialCache, s.connTimeout, s.cmdTimeout, lockTTL)
			if err != nil {
				ds.consecutiveFailures++
				// (Re)compute backoff on every failure at/above the threshold,
				// so the window keeps extending while the device stays down.
				if ds.consecutiveFailures >= s.maxFailures {
					backoff := calculateBackoff(ds.consecutiveFailures, s.baseBackoff, s.maxBackoff)
					ds.backoffUntil = time.Now().Add(backoff)
					slog.Warn("circuit breaker: device entering backoff",
						"device_id", dev.ID,
						"ip", dev.IPAddress,
						"consecutive_failures", ds.consecutiveFailures,
						"backoff_duration", backoff,
						"backoff_until", ds.backoffUntil.Format(time.RFC3339),
					)
				}
				// Only log as error if it's not a device-offline situation.
				// (PollDevice returns the sentinel unwrapped, so != is safe.)
				if err != ErrDeviceOffline {
					slog.Error("poll cycle failed",
						"device_id", dev.ID,
						"ip", dev.IPAddress,
						"error", err,
					)
				}
			} else {
				// Success — reset circuit breaker if it was tripped.
				if ds.consecutiveFailures > 0 {
					slog.Info("circuit breaker: device recovered",
						"device_id", dev.ID,
						"ip", dev.IPAddress,
						"previous_failures", ds.consecutiveFailures,
					)
					observability.CircuitBreakerResets.Inc()
					ds.consecutiveFailures = 0
					ds.backoffUntil = time.Time{}
				}
			}
		}
	}
}
// calculateBackoff computes the exponential backoff duration for the given
// number of consecutive failures: base * 2^(failures-1), capped at maxBackoff.
func calculateBackoff(failures int, baseBackoff, maxBackoff time.Duration) time.Duration {
if failures <= 1 {
return baseBackoff
}
backoff := baseBackoff * time.Duration(1<<uint(failures-1))
if backoff > maxBackoff || backoff < 0 { // negative check guards against overflow
return maxBackoff
}
return backoff
}

View File

@@ -0,0 +1,184 @@
package poller
import (
"context"
"fmt"
"sync"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
)
// mockDeviceFetcher implements DeviceFetcher for testing.
// It returns its canned devices/err pair on every call.
type mockDeviceFetcher struct {
	devices []store.Device // canned FetchDevices result
	err     error          // canned FetchDevices error
}

// FetchDevices returns the canned result; ctx is ignored.
func (m *mockDeviceFetcher) FetchDevices(ctx context.Context) ([]store.Device, error) {
	return m.devices, m.err
}
// newTestScheduler creates a Scheduler wired to the given fetcher.
// The Redis locker and NATS publisher stay nil because reconcileDevices never
// touches them, and the poll interval is pushed far into the future so device
// loops never actually tick during a unit test.
func newTestScheduler(fetcher DeviceFetcher) *Scheduler {
	// Minimal credential cache: no Transit client, a zeroed legacy key, no DB.
	cache := vault.NewCredentialCache(64, 5*time.Minute, nil, make([]byte, 32), nil)
	return &Scheduler{
		store:           fetcher,
		locker:          nil,
		publisher:       nil,
		credentialCache: cache,
		activeDevices:   make(map[string]*deviceState),
		pollInterval:    24 * time.Hour, // never fires during a test
		connTimeout:     time.Second,
		cmdTimeout:      time.Second,
		refreshPeriod:   time.Second,
		maxFailures:     5,
		baseBackoff:     30 * time.Second,
		maxBackoff:      15 * time.Minute,
	}
}
func TestReconcileDevices_StartsNewDevices(t *testing.T) {
devices := []store.Device{
{ID: "dev-1", TenantID: "t-1", IPAddress: "192.168.1.1", APISSLPort: 8729},
{ID: "dev-2", TenantID: "t-1", IPAddress: "192.168.1.2", APISSLPort: 8729},
}
fetcher := &mockDeviceFetcher{devices: devices}
sched := newTestScheduler(fetcher)
var wg sync.WaitGroup
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
err := sched.reconcileDevices(ctx, &wg)
require.NoError(t, err)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 2)
_, hasDev1 := sched.activeDevices["dev-1"]
_, hasDev2 := sched.activeDevices["dev-2"]
assert.True(t, hasDev1)
assert.True(t, hasDev2)
sched.mu.Unlock()
// Clean up: cancel context and wait for goroutines
cancel()
wg.Wait()
}
// TestReconcileDevices_StopsRemovedDevices checks that a device present in the
// active set but absent from the store is cancelled and removed by reconcile.
func TestReconcileDevices_StopsRemovedDevices(t *testing.T) {
	// Scheduler whose store now reports no devices.
	sched := newTestScheduler(&mockDeviceFetcher{devices: []store.Device{}})
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Manually add a device to activeDevices to simulate it was previously running.
	devCtx, devCancel := context.WithCancel(ctx)
	sched.activeDevices["dev-removed"] = &deviceState{cancel: devCancel}
	var wg sync.WaitGroup
	// FetchDevices returns empty -> dev-removed should be stopped.
	err := sched.reconcileDevices(ctx, &wg)
	require.NoError(t, err)
	sched.mu.Lock()
	assert.Len(t, sched.activeDevices, 0)
	sched.mu.Unlock()
	// The original version set a shared bool from a goroutine and read it
	// after a 10ms sleep — a data race under `go test -race` and flaky on a
	// slow machine. Waiting on the context's Done channel is race-free and
	// not timing-sensitive.
	select {
	case <-devCtx.Done():
		// cancelled, as expected
	case <-time.After(time.Second):
		t.Fatal("removed device's context was not cancelled")
	}
	cancel()
	wg.Wait()
}
func TestReconcileDevices_PreservesExistingDevices(t *testing.T) {
devices := []store.Device{
{ID: "dev-existing", TenantID: "t-1", IPAddress: "192.168.1.1", APISSLPort: 8729},
{ID: "dev-new", TenantID: "t-1", IPAddress: "192.168.1.2", APISSLPort: 8729},
}
fetcher := &mockDeviceFetcher{devices: devices}
sched := newTestScheduler(fetcher)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Pre-populate dev-existing as if it was already running
existingCtx, existingCancel := context.WithCancel(ctx)
_ = existingCtx
sched.activeDevices["dev-existing"] = &deviceState{cancel: existingCancel}
var wg sync.WaitGroup
err := sched.reconcileDevices(ctx, &wg)
require.NoError(t, err)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 2)
// dev-existing should still have its ORIGINAL cancel function (not replaced)
assert.Equal(t, fmt.Sprintf("%p", existingCancel), fmt.Sprintf("%p", sched.activeDevices["dev-existing"].cancel))
_, hasNew := sched.activeDevices["dev-new"]
assert.True(t, hasNew)
sched.mu.Unlock()
cancel()
wg.Wait()
}
func TestReconcileDevices_HandlesEmptyDatabase(t *testing.T) {
fetcher := &mockDeviceFetcher{devices: []store.Device{}}
sched := newTestScheduler(fetcher)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var wg sync.WaitGroup
err := sched.reconcileDevices(ctx, &wg)
require.NoError(t, err)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 0)
sched.mu.Unlock()
cancel()
wg.Wait()
}
func TestReconcileDevices_FetchError(t *testing.T) {
fetcher := &mockDeviceFetcher{err: fmt.Errorf("connection refused")}
sched := newTestScheduler(fetcher)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Pre-populate a device
devCancel := func() {}
sched.activeDevices["dev-1"] = &deviceState{cancel: devCancel}
var wg sync.WaitGroup
err := sched.reconcileDevices(ctx, &wg)
assert.Error(t, err)
assert.Contains(t, err.Error(), "connection refused")
// Active devices should be unchanged (no side effects on error)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 1)
sched.mu.Unlock()
cancel()
wg.Wait()
}

View File

@@ -0,0 +1,409 @@
// Package poller implements the polling logic for individual devices.
package poller
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"time"
"github.com/bsm/redislock"
"github.com/redis/go-redis/v9"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
)
// ErrDeviceOffline is returned by PollDevice when a device cannot be reached.
// The scheduler uses this to drive the circuit breaker — consecutive offline
// events trigger exponential backoff without logging as a hard error.
// PollDevice returns it unwrapped, so callers may compare directly.
var ErrDeviceOffline = errors.New("device offline")

// redisClientForFirmware is a module-level Redis client reference used
// for firmware check rate limiting. Set by the scheduler before starting polls.
//
// NOTE(review): package-level mutable state. PollDevice nil-checks it before
// each use, so leaving it unset merely disables the firmware-check,
// config-change-detection, and post-push-offline features. There is no
// synchronization around it — presumably SetRedisClient is called exactly
// once before any poll goroutine starts; confirm at the call site.
var redisClientForFirmware *redis.Client

// SetRedisClient sets the Redis client used for firmware rate limiting,
// config-change detection, and recent-push offline tracking.
func SetRedisClient(c *redis.Client) {
	redisClientForFirmware = c
}
// withTimeout runs fn in a goroutine and returns its result, or a timeout error
// if ctx expires first. This wraps RouterOS API calls that don't accept a context
// parameter, enforcing per-command timeouts to prevent indefinite blocking.
func withTimeout[T any](ctx context.Context, fn func() (T, error)) (T, error) {
type result struct {
val T
err error
}
ch := make(chan result, 1)
go func() {
v, e := fn()
ch <- result{v, e}
}()
select {
case r := <-ch:
return r.val, r.err
case <-ctx.Done():
var zero T
return zero, fmt.Errorf("command timed out: %w", ctx.Err())
}
}
// PollDevice performs a single poll cycle for one device:
//  1. Acquire distributed Redis lock to prevent duplicate polls across pods.
//  2. Decrypt device credentials.
//  3. Attempt TLS connection to the RouterOS binary API.
//  4. On failure: publish offline event, return ErrDeviceOffline.
//  5. On success: run /system/resource/print, publish online event with metadata.
//  6. Collect interface, health, and wireless metrics; publish as separate events.
//  7. Release lock and close connection via deferred calls.
//
// lockTTL should be longer than the expected poll duration to prevent the lock
// from expiring while the poll is still in progress.
//
// cmdTimeout is the per-command timeout for individual RouterOS API calls.
//
// Sentinel errors from redislock and go-redis are matched with errors.Is so
// the comparisons keep working even if a library version wraps them.
func PollDevice(
	ctx context.Context,
	dev store.Device,
	locker *redislock.Client,
	pub *bus.Publisher,
	credentialCache *vault.CredentialCache,
	connTimeout time.Duration,
	cmdTimeout time.Duration,
	lockTTL time.Duration,
) error {
	startTime := time.Now()
	pollStatus := "success"
	lockKey := fmt.Sprintf("poll:device:%s", dev.ID)
	// Acquire per-device lock. If another pod already holds the lock, skip this cycle.
	lock, err := locker.Obtain(ctx, lockKey, lockTTL, nil)
	if errors.Is(err, redislock.ErrNotObtained) {
		slog.Debug("skipping poll — lock held by another pod", "device_id", dev.ID)
		observability.PollTotal.WithLabelValues("skipped").Inc()
		observability.RedisLockTotal.WithLabelValues("not_obtained").Inc()
		return nil
	}
	if err != nil {
		observability.RedisLockTotal.WithLabelValues("error").Inc()
		return fmt.Errorf("obtaining Redis lock for device %s: %w", dev.ID, err)
	}
	observability.RedisLockTotal.WithLabelValues("obtained").Inc()
	defer func() {
		if releaseErr := lock.Release(ctx); releaseErr != nil && !errors.Is(releaseErr, redislock.ErrLockNotHeld) {
			slog.Warn("failed to release Redis lock", "device_id", dev.ID, "error", releaseErr)
		}
	}()
	// Deferred metric recording — captures poll duration and status at exit.
	defer func() {
		observability.PollDuration.Observe(time.Since(startTime).Seconds())
		observability.PollTotal.WithLabelValues(pollStatus).Inc()
	}()
	// Decrypt device credentials via credential cache (Transit preferred, legacy fallback).
	username, password, err := credentialCache.GetCredentials(
		dev.ID,
		dev.TenantID,
		dev.EncryptedCredentialsTransit,
		dev.EncryptedCredentials,
	)
	if err != nil {
		pollStatus = "error"
		return fmt.Errorf("decrypting credentials for device %s: %w", dev.ID, err)
	}
	// Prepare CA cert PEM for TLS verification (only populated for portal_ca devices).
	var caCertPEM []byte
	if dev.CACertPEM != nil {
		caCertPEM = []byte(*dev.CACertPEM)
	}
	// Attempt connection. On failure, publish offline event and return ErrDeviceOffline.
	client, err := device.ConnectDevice(dev.IPAddress, dev.APISSLPort, dev.APIPort, username, password, connTimeout, caCertPEM, dev.TLSMode)
	if err != nil {
		slog.Info("device offline", "device_id", dev.ID, "ip", dev.IPAddress, "error", err)
		observability.DeviceConnectionErrors.Inc()
		offlineEvent := bus.DeviceStatusEvent{
			DeviceID: dev.ID,
			TenantID: dev.TenantID,
			Status:   "offline",
			LastSeen: time.Now().UTC().Format(time.RFC3339),
		}
		if pubErr := pub.PublishStatus(ctx, offlineEvent); pubErr != nil {
			slog.Warn("failed to publish offline event", "device_id", dev.ID, "error", pubErr)
			observability.NATSPublishTotal.WithLabelValues("status", "error").Inc()
		} else {
			observability.NATSPublishTotal.WithLabelValues("status", "success").Inc()
		}
		// Check for recent config push — trigger rollback or alert if device
		// went offline shortly after a push (Redis key set by push_tracker).
		if redisClientForFirmware != nil {
			pushKey := fmt.Sprintf("push:recent:%s", dev.ID)
			pushData, pushErr := redisClientForFirmware.Get(ctx, pushKey).Result()
			if pushErr == nil && pushData != "" {
				var pushInfo struct {
					DeviceID         string `json:"device_id"`
					TenantID         string `json:"tenant_id"`
					PushType         string `json:"push_type"`
					PushOperationID  string `json:"push_operation_id"`
					PrePushCommitSHA string `json:"pre_push_commit_sha"`
				}
				if unmarshalErr := json.Unmarshal([]byte(pushData), &pushInfo); unmarshalErr == nil {
					slog.Warn("device went offline after recent config push",
						"device_id", dev.ID,
						"push_type", pushInfo.PushType,
					)
					if pushInfo.PushType == "template" || pushInfo.PushType == "restore" {
						// Auto-rollback for template/restore pushes
						if rollbackErr := pub.PublishPushRollback(ctx, bus.PushRollbackEvent{
							DeviceID:         pushInfo.DeviceID,
							TenantID:         pushInfo.TenantID,
							PushOperationID:  pushInfo.PushOperationID,
							PrePushCommitSHA: pushInfo.PrePushCommitSHA,
						}); rollbackErr != nil {
							slog.Error("failed to publish push rollback event", "device_id", dev.ID, "error", rollbackErr)
						}
					} else {
						// Alert only for editor pushes (one-click rollback in UI)
						if alertErr := pub.PublishPushAlert(ctx, bus.PushAlertEvent{
							DeviceID: pushInfo.DeviceID,
							TenantID: pushInfo.TenantID,
							PushType: pushInfo.PushType,
						}); alertErr != nil {
							slog.Error("failed to publish push alert event", "device_id", dev.ID, "error", alertErr)
						}
					}
				}
			}
		}
		return ErrDeviceOffline
	}
	defer device.CloseDevice(client)
	// Query device resources (version, uptime, CPU, memory) with per-command timeout.
	cmdCtx, cmdCancel := context.WithTimeout(ctx, cmdTimeout)
	info, err := withTimeout[device.DeviceInfo](cmdCtx, func() (device.DeviceInfo, error) {
		return device.DetectVersion(client)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to detect version", "device_id", dev.ID, "error", err)
		// Still publish an online event even if version detection fails
		// (info holds its zero value in that case).
	}
	onlineEvent := bus.DeviceStatusEvent{
		DeviceID:        dev.ID,
		TenantID:        dev.TenantID,
		Status:          "online",
		RouterOSVersion: info.Version,
		MajorVersion:    info.MajorVersion,
		BoardName:       info.BoardName,
		Architecture:    info.Architecture,
		Uptime:          info.Uptime,
		CPULoad:         info.CPULoad,
		FreeMemory:      info.FreeMemory,
		TotalMemory:     info.TotalMemory,
		SerialNumber:    info.SerialNumber,
		FirmwareVersion: info.FirmwareVersion,
		LastSeen:        time.Now().UTC().Format(time.RFC3339),
	}
	if pubErr := pub.PublishStatus(ctx, onlineEvent); pubErr != nil {
		observability.NATSPublishTotal.WithLabelValues("status", "error").Inc()
		pollStatus = "error"
		return fmt.Errorf("publishing online event for device %s: %w", dev.ID, pubErr)
	}
	observability.NATSPublishTotal.WithLabelValues("status", "success").Inc()
	// =========================================================================
	// CONFIG CHANGE DETECTION
	// Compare last-config-change from /system/resource/print against the
	// previous value stored in Redis. If it changed (and we have a previous
	// value — skip first poll), publish a ConfigChangedEvent so the backend
	// can trigger an event-driven backup.
	// =========================================================================
	if info.LastConfigChange != "" && redisClientForFirmware != nil {
		redisKey := fmt.Sprintf("device:%s:last_config_change", dev.ID)
		prev, redisErr := redisClientForFirmware.Get(ctx, redisKey).Result()
		if redisErr != nil && !errors.Is(redisErr, redis.Nil) {
			slog.Warn("Redis GET last_config_change error", "device_id", dev.ID, "error", redisErr)
		}
		if prev != info.LastConfigChange {
			if prev != "" { // Skip first poll — no previous value to compare
				slog.Info("config change detected on device",
					"device_id", dev.ID,
					"old_timestamp", prev,
					"new_timestamp", info.LastConfigChange,
				)
				if pubErr := pub.PublishConfigChanged(ctx, bus.ConfigChangedEvent{
					DeviceID:     dev.ID,
					TenantID:     dev.TenantID,
					OldTimestamp: prev,
					NewTimestamp: info.LastConfigChange,
				}); pubErr != nil {
					slog.Warn("failed to publish config.changed", "device_id", dev.ID, "error", pubErr)
					observability.NATSPublishTotal.WithLabelValues("config_changed", "error").Inc()
				} else {
					observability.NATSPublishTotal.WithLabelValues("config_changed", "success").Inc()
				}
			}
			// Update Redis with current value (24h TTL)
			redisClientForFirmware.Set(ctx, redisKey, info.LastConfigChange, 24*time.Hour)
		}
	}
	slog.Info("device polled successfully",
		"device_id", dev.ID,
		"ip", dev.IPAddress,
		"status", "online",
		"version", info.Version,
	)
	// =========================================================================
	// METRICS COLLECTION
	// Errors are non-fatal — a metric collection failure should not fail the
	// poll cycle. Publish failures are also non-fatal for the same reason.
	// Each collection call is wrapped with a per-command timeout.
	// =========================================================================
	collectedAt := time.Now().UTC().Format(time.RFC3339)
	// Interface traffic counters.
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	interfaces, err := withTimeout[[]device.InterfaceStats](cmdCtx, func() ([]device.InterfaceStats, error) {
		return device.CollectInterfaces(client)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect interface metrics", "device_id", dev.ID, "error", err)
	}
	if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
		DeviceID:    dev.ID,
		TenantID:    dev.TenantID,
		CollectedAt: collectedAt,
		Type:        "interfaces",
		Interfaces:  interfaces,
	}); pubErr != nil {
		slog.Warn("failed to publish interface metrics", "device_id", dev.ID, "error", pubErr)
		observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
	} else {
		observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
	}
	// System health (CPU, memory, disk, temperature).
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	health, err := withTimeout[device.HealthMetrics](cmdCtx, func() (device.HealthMetrics, error) {
		return device.CollectHealth(client, info)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect health metrics", "device_id", dev.ID, "error", err)
	}
	if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
		DeviceID:    dev.ID,
		TenantID:    dev.TenantID,
		CollectedAt: collectedAt,
		Type:        "health",
		Health:      &health,
	}); pubErr != nil {
		slog.Warn("failed to publish health metrics", "device_id", dev.ID, "error", pubErr)
		observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
	} else {
		observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
	}
	// Wireless client stats (only publish if the device has wireless interfaces).
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	wireless, err := withTimeout[[]device.WirelessStats](cmdCtx, func() ([]device.WirelessStats, error) {
		return device.CollectWireless(client, info.MajorVersion)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect wireless metrics", "device_id", dev.ID, "error", err)
	}
	if len(wireless) > 0 {
		if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
			DeviceID:    dev.ID,
			TenantID:    dev.TenantID,
			CollectedAt: collectedAt,
			Type:        "wireless",
			Wireless:    wireless,
		}); pubErr != nil {
			slog.Warn("failed to publish wireless metrics", "device_id", dev.ID, "error", pubErr)
			observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
		} else {
			observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
		}
	}
	// =========================================================================
	// FIRMWARE CHECK (rate-limited to once per day per device)
	// Checks if a firmware update is available and publishes the result.
	// Uses a Redis key with 24h TTL to ensure we don't hammer devices every 60s.
	// =========================================================================
	if redisClientForFirmware != nil {
		fwCacheKey := fmt.Sprintf("firmware:checked:%s", dev.ID)
		// Best-effort: on a Redis error we simply re-check firmware this cycle.
		exists, _ := redisClientForFirmware.Exists(ctx, fwCacheKey).Result()
		if exists == 0 {
			cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
			fwInfo, fwErr := withTimeout[device.FirmwareInfo](cmdCtx, func() (device.FirmwareInfo, error) {
				return device.CheckFirmwareUpdate(client)
			})
			cmdCancel()
			if fwErr != nil {
				slog.Warn("firmware check failed", "device_id", dev.ID, "error", fwErr)
				// Set cooldown on failure too, but shorter (6h) so we retry sooner than success (24h).
				// Prevents hammering devices that can't reach MikroTik update servers every poll cycle.
				fwFailKey := fmt.Sprintf("firmware:check-failed:%s", dev.ID)
				redisClientForFirmware.Set(ctx, fwFailKey, "1", 6*time.Hour)
				// Also set the main checked key to prevent the success path from re-checking.
				redisClientForFirmware.Set(ctx, fwCacheKey, "1", 6*time.Hour)
			} else {
				fwEvent := bus.DeviceFirmwareEvent{
					DeviceID:         dev.ID,
					TenantID:         dev.TenantID,
					InstalledVersion: fwInfo.InstalledVersion,
					LatestVersion:    fwInfo.LatestVersion,
					Channel:          fwInfo.Channel,
					Status:           fwInfo.Status,
					Architecture:     fwInfo.Architecture,
				}
				if pubErr := pub.PublishFirmware(ctx, fwEvent); pubErr != nil {
					slog.Warn("failed to publish firmware event", "device_id", dev.ID, "error", pubErr)
					observability.NATSPublishTotal.WithLabelValues("firmware", "error").Inc()
				} else {
					observability.NATSPublishTotal.WithLabelValues("firmware", "success").Inc()
					// Set Redis key with 24h TTL — firmware checked for today.
					// If the check succeeded but status is "check-failed",
					// use shorter cooldown since the device couldn't reach update servers.
					if fwInfo.Status == "check-failed" {
						redisClientForFirmware.Set(ctx, fwCacheKey, "1", 6*time.Hour)
					} else {
						redisClientForFirmware.Set(ctx, fwCacheKey, "1", 24*time.Hour)
					}
					slog.Info("firmware check published",
						"device_id", dev.ID,
						"installed", fwInfo.InstalledVersion,
						"latest", fwInfo.LatestVersion,
						"channel", fwInfo.Channel,
					)
				}
			}
		}
	}
	return nil
}