feat: The Other Dude v9.0.1 — full-featured email system

ci: add GitHub Pages deployment workflow for docs site

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jason Staack
2026-03-08 17:46:37 -05:00
commit b840047e19
511 changed files with 106948 additions and 0 deletions

View File

@@ -0,0 +1,195 @@
package poller_test
import (
"context"
"encoding/json"
"testing"
"time"
"github.com/bsm/redislock"
"github.com/nats-io/nats.go"
"github.com/nats-io/nats.go/jetstream"
goredis "github.com/redis/go-redis/v9"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/testutil"
)
// TestPollPublishConsumeCycle_Integration verifies the complete pipeline:
//
//  1. DeviceStore reads devices from real PostgreSQL
//  2. Publisher sends status events through real NATS JetStream
//  3. A NATS consumer receives the events with correct data
//  4. Redis distributed lock can be obtained and released
//
// The actual PollDevice function requires a real RouterOS device, so we test
// the integration seams individually and verify they compose correctly.
func TestPollPublishConsumeCycle_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}
	ctx := context.Background()
	tenantID := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
	dummyCreds := []byte("dummy-encrypted-credentials")

	// --- Phase 1: PostgreSQL + DeviceStore ---
	connStr, pgCleanup := testutil.SetupPostgres(t)
	defer pgCleanup()
	v7 := "7.16"
	major7 := 7
	deviceID := testutil.InsertTestDevice(t, connStr, store.Device{
		TenantID:             tenantID,
		IPAddress:            "10.0.0.1",
		APIPort:              8728,
		APISSLPort:           8729,
		EncryptedCredentials: dummyCreds,
		RouterOSVersion:      &v7,
		MajorVersion:         &major7,
	})
	ds, err := store.NewDeviceStore(ctx, connStr)
	require.NoError(t, err)
	defer ds.Close()
	devices, err := ds.FetchDevices(ctx)
	require.NoError(t, err)
	require.Len(t, devices, 1)
	assert.Equal(t, deviceID, devices[0].ID)
	assert.Equal(t, tenantID, devices[0].TenantID)

	// --- Phase 2: NATS + Publisher ---
	natsURL, natsCleanup := testutil.SetupNATS(t)
	defer natsCleanup()
	pub, err := bus.NewPublisher(natsURL)
	require.NoError(t, err)
	defer pub.Close()
	// Create a consumer to verify events.
	nc, err := nats.Connect(natsURL)
	require.NoError(t, err)
	defer nc.Close()
	js, err := jetstream.New(nc)
	require.NoError(t, err)
	cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
		FilterSubject: "device.status.>",
		AckPolicy:     jetstream.AckNonePolicy,
	})
	require.NoError(t, err)
	// Simulate what PollDevice does after connecting to a device:
	// publish a status event with data from the fetched device.
	dev := devices[0]
	statusEvent := bus.DeviceStatusEvent{
		DeviceID: dev.ID,
		TenantID: dev.TenantID,
		Status:   "online",
		LastSeen: time.Now().UTC().Format(time.RFC3339),
	}
	err = pub.PublishStatus(ctx, statusEvent)
	require.NoError(t, err)
	// Verify consumer receives the event.
	msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
	require.NoError(t, err)
	// jetstream.Msg is an interface — hold it directly instead of the
	// original *jetstream.Msg, which took the address of the loop variable
	// (pointer-to-interface anti-pattern) and forced (*received).Data().
	// A nil interface value still fails require.NotNil as intended.
	var received jetstream.Msg
	for msg := range msgBatch.Messages() {
		received = msg
		break
	}
	require.NotNil(t, received, "consumer should receive the status event")
	var got bus.DeviceStatusEvent
	err = json.Unmarshal(received.Data(), &got)
	require.NoError(t, err)
	assert.Equal(t, dev.ID, got.DeviceID)
	assert.Equal(t, dev.TenantID, got.TenantID)
	assert.Equal(t, "online", got.Status)

	// --- Phase 3: Redis distributed lock ---
	redisAddr, redisCleanup := testutil.SetupRedis(t)
	defer redisCleanup()
	rdb := goredis.NewClient(&goredis.Options{Addr: redisAddr})
	defer rdb.Close()
	locker := redislock.New(rdb)
	lockKey := "poll:device:" + dev.ID
	lock, err := locker.Obtain(ctx, lockKey, 10*time.Second, nil)
	require.NoError(t, err, "should obtain Redis distributed lock")
	// A second attempt should fail (lock held).
	_, err = locker.Obtain(ctx, lockKey, 10*time.Second, nil)
	assert.ErrorIs(t, err, redislock.ErrNotObtained, "second lock attempt should fail")
	// Release and re-obtain.
	err = lock.Release(ctx)
	require.NoError(t, err, "should release lock")
	lock2, err := locker.Obtain(ctx, lockKey, 10*time.Second, nil)
	require.NoError(t, err, "should re-obtain lock after release")
	// Best-effort: the test is done; a failed release only delays TTL expiry.
	_ = lock2.Release(ctx)
}
// TestSchedulerReconcile_WithRealDB_Integration verifies that the Scheduler's
// reconciliation loop correctly starts and stops device polling goroutines
// when backed by a real PostgreSQL database.
//
// We test this by running the Scheduler for a brief period and verifying it
// fetches devices and starts goroutines. Since PollDevice requires real
// RouterOS hardware, the goroutines will fail on the poll cycle (no device to
// connect to), but the scheduler's reconciliation logic is the integration
// point we are testing here.
func TestSchedulerReconcile_WithRealDB_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}
	ctx := context.Background()
	tenant := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
	creds := []byte("dummy-encrypted-credentials")
	connStr, cleanupPG := testutil.SetupPostgres(t)
	defer cleanupPG()
	// Seed the database with two devices via a small local helper.
	insert := func(ip string) string {
		return testutil.InsertTestDevice(t, connStr, store.Device{
			TenantID:             tenant,
			IPAddress:            ip,
			APIPort:              8728,
			APISSLPort:           8729,
			EncryptedCredentials: creds,
		})
	}
	firstID := insert("10.0.0.1")
	secondID := insert("10.0.0.2")
	deviceStore, err := store.NewDeviceStore(ctx, connStr)
	require.NoError(t, err)
	defer deviceStore.Close()
	// Verify DeviceStore returns both devices (integration seam check).
	fetched, err := deviceStore.FetchDevices(ctx)
	require.NoError(t, err)
	require.Len(t, fetched, 2)
	seen := make(map[string]bool, len(fetched))
	for _, d := range fetched {
		seen[d.ID] = true
	}
	assert.True(t, seen[firstID], "device 1 should be fetched from real DB")
	assert.True(t, seen[secondID], "device 2 should be fetched from real DB")
}

View File

@@ -0,0 +1,14 @@
package poller
import (
"context"
"github.com/mikrotik-portal/poller/internal/store"
)
// DeviceFetcher is the subset of store.DeviceStore that the Scheduler needs.
// Defined here (consumer-side) following Go interface best practices.
// The concrete *store.DeviceStore automatically satisfies this interface.
// Keeping it to a single method also makes it trivial to mock in unit tests.
type DeviceFetcher interface {
	// FetchDevices returns the devices that should currently be polled.
	FetchDevices(ctx context.Context) ([]store.Device, error)
}

View File

@@ -0,0 +1,264 @@
package poller
import (
"context"
"log/slog"
"sync"
"time"
"github.com/bsm/redislock"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
)
// deviceState tracks per-device circuit breaker and lifecycle state.
//
// After creation by reconcileDevices, the failure/backoff fields are mutated
// only by the device's own polling goroutine (runDeviceLoop), so they carry
// no lock of their own.
type deviceState struct {
	// cancel stops this device's polling goroutine; called by the scheduler
	// when the device disappears from the DB or on shutdown.
	cancel context.CancelFunc
	// consecutiveFailures counts poll cycles that failed in a row; reset to
	// zero on the first successful poll.
	consecutiveFailures int
	// backoffUntil is the instant until which poll ticks are skipped while
	// the circuit breaker is open; the zero value means no backoff.
	backoffUntil time.Time
}
// Scheduler manages the lifecycle of per-device polling goroutines.
//
// It periodically re-queries the database to discover new devices (starting goroutines)
// and detect removed devices (stopping goroutines). Each device has exactly one
// polling goroutine running at a time.
//
// Circuit breaker: after consecutive connection failures, a device enters exponential
// backoff. The device loop skips poll ticks during backoff. On successful poll, the
// circuit breaker resets and the device resumes normal polling.
type Scheduler struct {
	store           DeviceFetcher          // device inventory source (DB-backed, or a mock in tests)
	locker          *redislock.Client      // distributed lock to prevent duplicate polls across pods
	publisher       *bus.Publisher         // NATS publisher for status/metric events
	credentialCache *vault.CredentialCache // decrypts per-device credentials

	pollInterval  time.Duration // tick period of each device's poll loop
	connTimeout   time.Duration // connection timeout used by each poll
	cmdTimeout    time.Duration // per-command timeout for RouterOS API calls
	refreshPeriod time.Duration // how often Run re-reconciles against the DB

	// Circuit breaker configuration.
	maxFailures int           // consecutive failures before backoff starts
	baseBackoff time.Duration // first backoff step (doubles per further failure)
	maxBackoff  time.Duration // cap on any single backoff period

	// activeDevices maps device ID to per-device state.
	// mu guards activeDevices; it is taken by reconcileDevices and by the
	// shutdown path in Run.
	mu            sync.Mutex
	activeDevices map[string]*deviceState
}
// NewScheduler creates a Scheduler with the provided dependencies.
// The returned Scheduler does nothing until Run is called.
func NewScheduler(
	store DeviceFetcher,
	locker *redislock.Client,
	publisher *bus.Publisher,
	credentialCache *vault.CredentialCache,
	pollInterval time.Duration,
	connTimeout time.Duration,
	cmdTimeout time.Duration,
	refreshPeriod time.Duration,
	maxFailures int,
	baseBackoff time.Duration,
	maxBackoff time.Duration,
) *Scheduler {
	return &Scheduler{
		// Collaborators.
		store:           store,
		locker:          locker,
		publisher:       publisher,
		credentialCache: credentialCache,
		// Empty active set — populated by the first reconcile pass.
		activeDevices: make(map[string]*deviceState),
		// Timing knobs.
		pollInterval:  pollInterval,
		connTimeout:   connTimeout,
		cmdTimeout:    cmdTimeout,
		refreshPeriod: refreshPeriod,
		// Circuit breaker knobs.
		maxFailures: maxFailures,
		baseBackoff: baseBackoff,
		maxBackoff:  maxBackoff,
	}
}
// Run is the main scheduler loop. It:
//  1. Fetches devices from the database.
//  2. Starts goroutines for newly-discovered devices.
//  3. Stops goroutines for devices no longer in the database.
//  4. Sleeps for refreshPeriod, then repeats.
//  5. Cancels all goroutines when ctx is cancelled (graceful shutdown).
//
// Run blocks until ctx is cancelled, then waits for all goroutines to finish.
// It always returns nil; transient reconciliation errors are logged, not fatal.
func (s *Scheduler) Run(ctx context.Context) error {
	var wg sync.WaitGroup
	defer func() {
		// On shutdown, cancel all active device goroutines and wait for them.
		// wg.Wait is deliberately outside the lock so the wait cannot
		// deadlock against anything that might someday take s.mu.
		s.mu.Lock()
		for id, ds := range s.activeDevices {
			slog.Info("stopping device goroutine", "device_id", id)
			ds.cancel()
		}
		s.mu.Unlock()
		wg.Wait()
		slog.Info("scheduler shutdown complete")
	}()
	for {
		if err := s.reconcileDevices(ctx, &wg); err != nil {
			slog.Error("device reconciliation failed", "error", err)
			// Continue — a transient DB error should not crash the scheduler.
		}
		// Sleep-or-exit. Checking ctx here (after reconcile) guarantees at
		// least one reconciliation pass before shutdown can win the select.
		select {
		case <-ctx.Done():
			slog.Info("scheduler context cancelled — shutting down")
			return nil
		case <-time.After(s.refreshPeriod):
			// Next reconciliation cycle.
		}
	}
}
// reconcileDevices fetches the current device list from the DB and brings the
// set of running poll goroutines in line with it: devices present in the DB
// but not yet running are started; running devices no longer in the DB are
// cancelled and dropped. On a fetch error the active set is left untouched.
func (s *Scheduler) reconcileDevices(ctx context.Context, wg *sync.WaitGroup) error {
	devices, err := s.store.FetchDevices(ctx)
	if err != nil {
		return err
	}
	// Membership set of device IDs that should be running.
	wanted := make(map[string]struct{}, len(devices))
	for _, d := range devices {
		wanted[d.ID] = struct{}{}
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	// Start goroutines for devices we are not yet polling.
	for _, d := range devices {
		if _, running := s.activeDevices[d.ID]; running {
			continue
		}
		devCtx, stop := context.WithCancel(ctx)
		state := &deviceState{cancel: stop}
		s.activeDevices[d.ID] = state
		dev := d // goroutine gets its own copy (pre-Go-1.22 loop semantics)
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.runDeviceLoop(devCtx, dev, state)
		}()
		slog.Info("started polling goroutine", "device_id", dev.ID, "ip", dev.IPAddress)
	}
	// Cancel goroutines whose devices have disappeared from the database.
	for id, state := range s.activeDevices {
		if _, stillWanted := wanted[id]; stillWanted {
			continue
		}
		slog.Info("stopping goroutine for removed device", "device_id", id)
		state.cancel()
		delete(s.activeDevices, id)
	}
	// Update Prometheus gauge with current active device count.
	observability.DevicesActive.Set(float64(len(s.activeDevices)))
	slog.Debug("device reconciliation complete",
		"total_devices", len(devices),
		"active_goroutines", len(s.activeDevices),
	)
	return nil
}
// runDeviceLoop is the per-device polling loop. It ticks at pollInterval and
// calls PollDevice synchronously on each tick (not in a sub-goroutine, to avoid
// unbounded goroutine growth if polls are slow).
//
// Circuit breaker: when consecutive failures exceed maxFailures, the device enters
// exponential backoff. Poll ticks during backoff are skipped. On success, the
// circuit breaker resets.
//
// ds is mutated only by this goroutine after creation (reconcileDevices only
// sets cancel), so its fields are accessed here without locking.
func (s *Scheduler) runDeviceLoop(ctx context.Context, dev store.Device, ds *deviceState) {
	// lockTTL gives the poll cycle time to complete: interval + connection timeout + 15s margin.
	lockTTL := s.pollInterval + s.connTimeout + 15*time.Second
	ticker := time.NewTicker(s.pollInterval)
	defer ticker.Stop()
	slog.Debug("device poll loop started", "device_id", dev.ID, "poll_interval", s.pollInterval)
	for {
		select {
		case <-ctx.Done():
			slog.Debug("device poll loop stopping", "device_id", dev.ID)
			return
		case <-ticker.C:
			// Circuit breaker: skip poll if device is in backoff period.
			if time.Now().Before(ds.backoffUntil) {
				slog.Debug("circuit breaker: skipping poll (in backoff)",
					"device_id", dev.ID,
					"backoff_until", ds.backoffUntil.Format(time.RFC3339),
					"consecutive_failures", ds.consecutiveFailures,
				)
				observability.CircuitBreakerSkips.Inc()
				continue
			}
			err := PollDevice(ctx, dev, s.locker, s.publisher, s.credentialCache, s.connTimeout, s.cmdTimeout, lockTTL)
			if err != nil {
				ds.consecutiveFailures++
				// (Re)compute backoff on every failure at/above the threshold,
				// so the window keeps extending while the device stays down.
				if ds.consecutiveFailures >= s.maxFailures {
					backoff := calculateBackoff(ds.consecutiveFailures, s.baseBackoff, s.maxBackoff)
					ds.backoffUntil = time.Now().Add(backoff)
					slog.Warn("circuit breaker: device entering backoff",
						"device_id", dev.ID,
						"ip", dev.IPAddress,
						"consecutive_failures", ds.consecutiveFailures,
						"backoff_duration", backoff,
						"backoff_until", ds.backoffUntil.Format(time.RFC3339),
					)
				}
				// Only log as error if it's not a device-offline situation.
				// (PollDevice returns the sentinel unwrapped, so != is safe.)
				if err != ErrDeviceOffline {
					slog.Error("poll cycle failed",
						"device_id", dev.ID,
						"ip", dev.IPAddress,
						"error", err,
					)
				}
			} else {
				// Success — reset circuit breaker if it was tripped.
				if ds.consecutiveFailures > 0 {
					slog.Info("circuit breaker: device recovered",
						"device_id", dev.ID,
						"ip", dev.IPAddress,
						"previous_failures", ds.consecutiveFailures,
					)
					observability.CircuitBreakerResets.Inc()
					ds.consecutiveFailures = 0
					ds.backoffUntil = time.Time{}
				}
			}
		}
	}
}
// calculateBackoff computes the exponential backoff duration for the given
// number of consecutive failures: base * 2^(failures-1), capped at maxBackoff.
func calculateBackoff(failures int, baseBackoff, maxBackoff time.Duration) time.Duration {
if failures <= 1 {
return baseBackoff
}
backoff := baseBackoff * time.Duration(1<<uint(failures-1))
if backoff > maxBackoff || backoff < 0 { // negative check guards against overflow
return maxBackoff
}
return backoff
}

View File

@@ -0,0 +1,184 @@
package poller
import (
"context"
"fmt"
"sync"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
)
// mockDeviceFetcher implements DeviceFetcher for testing.
// It returns its canned devices/err pair on every call.
type mockDeviceFetcher struct {
	devices []store.Device // canned FetchDevices result
	err     error          // canned FetchDevices error
}

// FetchDevices returns the canned result; ctx is ignored.
func (m *mockDeviceFetcher) FetchDevices(ctx context.Context) ([]store.Device, error) {
	return m.devices, m.err
}
// newTestScheduler creates a Scheduler wired to the given fetcher.
// The Redis locker and NATS publisher stay nil because reconcileDevices never
// touches them, and the poll interval is pushed far into the future so device
// loops never actually tick during a unit test.
func newTestScheduler(fetcher DeviceFetcher) *Scheduler {
	// Minimal credential cache: no Transit client, a zeroed legacy key, no DB.
	cache := vault.NewCredentialCache(64, 5*time.Minute, nil, make([]byte, 32), nil)
	return &Scheduler{
		store:           fetcher,
		locker:          nil,
		publisher:       nil,
		credentialCache: cache,
		activeDevices:   make(map[string]*deviceState),
		pollInterval:    24 * time.Hour, // never fires during a test
		connTimeout:     time.Second,
		cmdTimeout:      time.Second,
		refreshPeriod:   time.Second,
		maxFailures:     5,
		baseBackoff:     30 * time.Second,
		maxBackoff:      15 * time.Minute,
	}
}
func TestReconcileDevices_StartsNewDevices(t *testing.T) {
devices := []store.Device{
{ID: "dev-1", TenantID: "t-1", IPAddress: "192.168.1.1", APISSLPort: 8729},
{ID: "dev-2", TenantID: "t-1", IPAddress: "192.168.1.2", APISSLPort: 8729},
}
fetcher := &mockDeviceFetcher{devices: devices}
sched := newTestScheduler(fetcher)
var wg sync.WaitGroup
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
err := sched.reconcileDevices(ctx, &wg)
require.NoError(t, err)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 2)
_, hasDev1 := sched.activeDevices["dev-1"]
_, hasDev2 := sched.activeDevices["dev-2"]
assert.True(t, hasDev1)
assert.True(t, hasDev2)
sched.mu.Unlock()
// Clean up: cancel context and wait for goroutines
cancel()
wg.Wait()
}
// TestReconcileDevices_StopsRemovedDevices checks that a device present in the
// active set but absent from the store is cancelled and removed by reconcile.
func TestReconcileDevices_StopsRemovedDevices(t *testing.T) {
	// Scheduler whose store now reports no devices.
	sched := newTestScheduler(&mockDeviceFetcher{devices: []store.Device{}})
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Manually add a device to activeDevices to simulate it was previously running.
	devCtx, devCancel := context.WithCancel(ctx)
	sched.activeDevices["dev-removed"] = &deviceState{cancel: devCancel}
	var wg sync.WaitGroup
	// FetchDevices returns empty -> dev-removed should be stopped.
	err := sched.reconcileDevices(ctx, &wg)
	require.NoError(t, err)
	sched.mu.Lock()
	assert.Len(t, sched.activeDevices, 0)
	sched.mu.Unlock()
	// The original version set a shared bool from a goroutine and read it
	// after a 10ms sleep — a data race under `go test -race` and flaky on a
	// slow machine. Waiting on the context's Done channel is race-free and
	// not timing-sensitive.
	select {
	case <-devCtx.Done():
		// cancelled, as expected
	case <-time.After(time.Second):
		t.Fatal("removed device's context was not cancelled")
	}
	cancel()
	wg.Wait()
}
func TestReconcileDevices_PreservesExistingDevices(t *testing.T) {
devices := []store.Device{
{ID: "dev-existing", TenantID: "t-1", IPAddress: "192.168.1.1", APISSLPort: 8729},
{ID: "dev-new", TenantID: "t-1", IPAddress: "192.168.1.2", APISSLPort: 8729},
}
fetcher := &mockDeviceFetcher{devices: devices}
sched := newTestScheduler(fetcher)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Pre-populate dev-existing as if it was already running
existingCtx, existingCancel := context.WithCancel(ctx)
_ = existingCtx
sched.activeDevices["dev-existing"] = &deviceState{cancel: existingCancel}
var wg sync.WaitGroup
err := sched.reconcileDevices(ctx, &wg)
require.NoError(t, err)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 2)
// dev-existing should still have its ORIGINAL cancel function (not replaced)
assert.Equal(t, fmt.Sprintf("%p", existingCancel), fmt.Sprintf("%p", sched.activeDevices["dev-existing"].cancel))
_, hasNew := sched.activeDevices["dev-new"]
assert.True(t, hasNew)
sched.mu.Unlock()
cancel()
wg.Wait()
}
func TestReconcileDevices_HandlesEmptyDatabase(t *testing.T) {
fetcher := &mockDeviceFetcher{devices: []store.Device{}}
sched := newTestScheduler(fetcher)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var wg sync.WaitGroup
err := sched.reconcileDevices(ctx, &wg)
require.NoError(t, err)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 0)
sched.mu.Unlock()
cancel()
wg.Wait()
}
func TestReconcileDevices_FetchError(t *testing.T) {
fetcher := &mockDeviceFetcher{err: fmt.Errorf("connection refused")}
sched := newTestScheduler(fetcher)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Pre-populate a device
devCancel := func() {}
sched.activeDevices["dev-1"] = &deviceState{cancel: devCancel}
var wg sync.WaitGroup
err := sched.reconcileDevices(ctx, &wg)
assert.Error(t, err)
assert.Contains(t, err.Error(), "connection refused")
// Active devices should be unchanged (no side effects on error)
sched.mu.Lock()
assert.Len(t, sched.activeDevices, 1)
sched.mu.Unlock()
cancel()
wg.Wait()
}

View File

@@ -0,0 +1,409 @@
// Package poller implements the polling logic for individual devices.
package poller
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"time"
"github.com/bsm/redislock"
"github.com/redis/go-redis/v9"
"github.com/mikrotik-portal/poller/internal/bus"
"github.com/mikrotik-portal/poller/internal/device"
"github.com/mikrotik-portal/poller/internal/observability"
"github.com/mikrotik-portal/poller/internal/store"
"github.com/mikrotik-portal/poller/internal/vault"
)
// ErrDeviceOffline is returned by PollDevice when a device cannot be reached.
// The scheduler uses this to drive the circuit breaker — consecutive offline
// events trigger exponential backoff without logging as a hard error.
// PollDevice returns it unwrapped, so callers may compare directly.
var ErrDeviceOffline = errors.New("device offline")

// redisClientForFirmware is a module-level Redis client reference used
// for firmware check rate limiting. Set by the scheduler before starting polls.
//
// NOTE(review): package-level mutable state. PollDevice nil-checks it before
// each use, so leaving it unset merely disables the firmware-check,
// config-change-detection, and post-push-offline features. There is no
// synchronization around it — presumably SetRedisClient is called exactly
// once before any poll goroutine starts; confirm at the call site.
var redisClientForFirmware *redis.Client

// SetRedisClient sets the Redis client used for firmware rate limiting,
// config-change detection, and recent-push offline tracking.
func SetRedisClient(c *redis.Client) {
	redisClientForFirmware = c
}
// withTimeout runs fn in a goroutine and returns its result, or a timeout error
// if ctx expires first. This wraps RouterOS API calls that don't accept a context
// parameter, enforcing per-command timeouts to prevent indefinite blocking.
func withTimeout[T any](ctx context.Context, fn func() (T, error)) (T, error) {
type result struct {
val T
err error
}
ch := make(chan result, 1)
go func() {
v, e := fn()
ch <- result{v, e}
}()
select {
case r := <-ch:
return r.val, r.err
case <-ctx.Done():
var zero T
return zero, fmt.Errorf("command timed out: %w", ctx.Err())
}
}
// PollDevice performs a single poll cycle for one device:
//  1. Acquire distributed Redis lock to prevent duplicate polls across pods.
//  2. Decrypt device credentials.
//  3. Attempt TLS connection to the RouterOS binary API.
//  4. On failure: publish offline event, return ErrDeviceOffline.
//  5. On success: run /system/resource/print, publish online event with metadata.
//  6. Collect interface, health, and wireless metrics; publish as separate events.
//  7. Release lock and close connection via deferred calls.
//
// lockTTL should be longer than the expected poll duration to prevent the lock
// from expiring while the poll is still in progress.
//
// cmdTimeout is the per-command timeout for individual RouterOS API calls.
//
// Sentinel errors from redislock and go-redis are matched with errors.Is so
// the comparisons keep working even if a library version wraps them.
func PollDevice(
	ctx context.Context,
	dev store.Device,
	locker *redislock.Client,
	pub *bus.Publisher,
	credentialCache *vault.CredentialCache,
	connTimeout time.Duration,
	cmdTimeout time.Duration,
	lockTTL time.Duration,
) error {
	startTime := time.Now()
	pollStatus := "success"
	lockKey := fmt.Sprintf("poll:device:%s", dev.ID)
	// Acquire per-device lock. If another pod already holds the lock, skip this cycle.
	lock, err := locker.Obtain(ctx, lockKey, lockTTL, nil)
	if errors.Is(err, redislock.ErrNotObtained) {
		slog.Debug("skipping poll — lock held by another pod", "device_id", dev.ID)
		observability.PollTotal.WithLabelValues("skipped").Inc()
		observability.RedisLockTotal.WithLabelValues("not_obtained").Inc()
		return nil
	}
	if err != nil {
		observability.RedisLockTotal.WithLabelValues("error").Inc()
		return fmt.Errorf("obtaining Redis lock for device %s: %w", dev.ID, err)
	}
	observability.RedisLockTotal.WithLabelValues("obtained").Inc()
	defer func() {
		if releaseErr := lock.Release(ctx); releaseErr != nil && !errors.Is(releaseErr, redislock.ErrLockNotHeld) {
			slog.Warn("failed to release Redis lock", "device_id", dev.ID, "error", releaseErr)
		}
	}()
	// Deferred metric recording — captures poll duration and status at exit.
	defer func() {
		observability.PollDuration.Observe(time.Since(startTime).Seconds())
		observability.PollTotal.WithLabelValues(pollStatus).Inc()
	}()
	// Decrypt device credentials via credential cache (Transit preferred, legacy fallback).
	username, password, err := credentialCache.GetCredentials(
		dev.ID,
		dev.TenantID,
		dev.EncryptedCredentialsTransit,
		dev.EncryptedCredentials,
	)
	if err != nil {
		pollStatus = "error"
		return fmt.Errorf("decrypting credentials for device %s: %w", dev.ID, err)
	}
	// Prepare CA cert PEM for TLS verification (only populated for portal_ca devices).
	var caCertPEM []byte
	if dev.CACertPEM != nil {
		caCertPEM = []byte(*dev.CACertPEM)
	}
	// Attempt connection. On failure, publish offline event and return ErrDeviceOffline.
	client, err := device.ConnectDevice(dev.IPAddress, dev.APISSLPort, dev.APIPort, username, password, connTimeout, caCertPEM, dev.TLSMode)
	if err != nil {
		slog.Info("device offline", "device_id", dev.ID, "ip", dev.IPAddress, "error", err)
		observability.DeviceConnectionErrors.Inc()
		offlineEvent := bus.DeviceStatusEvent{
			DeviceID: dev.ID,
			TenantID: dev.TenantID,
			Status:   "offline",
			LastSeen: time.Now().UTC().Format(time.RFC3339),
		}
		if pubErr := pub.PublishStatus(ctx, offlineEvent); pubErr != nil {
			slog.Warn("failed to publish offline event", "device_id", dev.ID, "error", pubErr)
			observability.NATSPublishTotal.WithLabelValues("status", "error").Inc()
		} else {
			observability.NATSPublishTotal.WithLabelValues("status", "success").Inc()
		}
		// Check for recent config push — trigger rollback or alert if device
		// went offline shortly after a push (Redis key set by push_tracker).
		if redisClientForFirmware != nil {
			pushKey := fmt.Sprintf("push:recent:%s", dev.ID)
			pushData, pushErr := redisClientForFirmware.Get(ctx, pushKey).Result()
			if pushErr == nil && pushData != "" {
				var pushInfo struct {
					DeviceID         string `json:"device_id"`
					TenantID         string `json:"tenant_id"`
					PushType         string `json:"push_type"`
					PushOperationID  string `json:"push_operation_id"`
					PrePushCommitSHA string `json:"pre_push_commit_sha"`
				}
				if unmarshalErr := json.Unmarshal([]byte(pushData), &pushInfo); unmarshalErr == nil {
					slog.Warn("device went offline after recent config push",
						"device_id", dev.ID,
						"push_type", pushInfo.PushType,
					)
					if pushInfo.PushType == "template" || pushInfo.PushType == "restore" {
						// Auto-rollback for template/restore pushes
						if rollbackErr := pub.PublishPushRollback(ctx, bus.PushRollbackEvent{
							DeviceID:         pushInfo.DeviceID,
							TenantID:         pushInfo.TenantID,
							PushOperationID:  pushInfo.PushOperationID,
							PrePushCommitSHA: pushInfo.PrePushCommitSHA,
						}); rollbackErr != nil {
							slog.Error("failed to publish push rollback event", "device_id", dev.ID, "error", rollbackErr)
						}
					} else {
						// Alert only for editor pushes (one-click rollback in UI)
						if alertErr := pub.PublishPushAlert(ctx, bus.PushAlertEvent{
							DeviceID: pushInfo.DeviceID,
							TenantID: pushInfo.TenantID,
							PushType: pushInfo.PushType,
						}); alertErr != nil {
							slog.Error("failed to publish push alert event", "device_id", dev.ID, "error", alertErr)
						}
					}
				}
			}
		}
		return ErrDeviceOffline
	}
	defer device.CloseDevice(client)
	// Query device resources (version, uptime, CPU, memory) with per-command timeout.
	cmdCtx, cmdCancel := context.WithTimeout(ctx, cmdTimeout)
	info, err := withTimeout[device.DeviceInfo](cmdCtx, func() (device.DeviceInfo, error) {
		return device.DetectVersion(client)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to detect version", "device_id", dev.ID, "error", err)
		// Still publish an online event even if version detection fails
		// (info holds its zero value in that case).
	}
	onlineEvent := bus.DeviceStatusEvent{
		DeviceID:        dev.ID,
		TenantID:        dev.TenantID,
		Status:          "online",
		RouterOSVersion: info.Version,
		MajorVersion:    info.MajorVersion,
		BoardName:       info.BoardName,
		Architecture:    info.Architecture,
		Uptime:          info.Uptime,
		CPULoad:         info.CPULoad,
		FreeMemory:      info.FreeMemory,
		TotalMemory:     info.TotalMemory,
		SerialNumber:    info.SerialNumber,
		FirmwareVersion: info.FirmwareVersion,
		LastSeen:        time.Now().UTC().Format(time.RFC3339),
	}
	if pubErr := pub.PublishStatus(ctx, onlineEvent); pubErr != nil {
		observability.NATSPublishTotal.WithLabelValues("status", "error").Inc()
		pollStatus = "error"
		return fmt.Errorf("publishing online event for device %s: %w", dev.ID, pubErr)
	}
	observability.NATSPublishTotal.WithLabelValues("status", "success").Inc()
	// =========================================================================
	// CONFIG CHANGE DETECTION
	// Compare last-config-change from /system/resource/print against the
	// previous value stored in Redis. If it changed (and we have a previous
	// value — skip first poll), publish a ConfigChangedEvent so the backend
	// can trigger an event-driven backup.
	// =========================================================================
	if info.LastConfigChange != "" && redisClientForFirmware != nil {
		redisKey := fmt.Sprintf("device:%s:last_config_change", dev.ID)
		prev, redisErr := redisClientForFirmware.Get(ctx, redisKey).Result()
		if redisErr != nil && !errors.Is(redisErr, redis.Nil) {
			slog.Warn("Redis GET last_config_change error", "device_id", dev.ID, "error", redisErr)
		}
		if prev != info.LastConfigChange {
			if prev != "" { // Skip first poll — no previous value to compare
				slog.Info("config change detected on device",
					"device_id", dev.ID,
					"old_timestamp", prev,
					"new_timestamp", info.LastConfigChange,
				)
				if pubErr := pub.PublishConfigChanged(ctx, bus.ConfigChangedEvent{
					DeviceID:     dev.ID,
					TenantID:     dev.TenantID,
					OldTimestamp: prev,
					NewTimestamp: info.LastConfigChange,
				}); pubErr != nil {
					slog.Warn("failed to publish config.changed", "device_id", dev.ID, "error", pubErr)
					observability.NATSPublishTotal.WithLabelValues("config_changed", "error").Inc()
				} else {
					observability.NATSPublishTotal.WithLabelValues("config_changed", "success").Inc()
				}
			}
			// Update Redis with current value (24h TTL)
			redisClientForFirmware.Set(ctx, redisKey, info.LastConfigChange, 24*time.Hour)
		}
	}
	slog.Info("device polled successfully",
		"device_id", dev.ID,
		"ip", dev.IPAddress,
		"status", "online",
		"version", info.Version,
	)
	// =========================================================================
	// METRICS COLLECTION
	// Errors are non-fatal — a metric collection failure should not fail the
	// poll cycle. Publish failures are also non-fatal for the same reason.
	// Each collection call is wrapped with a per-command timeout.
	// =========================================================================
	collectedAt := time.Now().UTC().Format(time.RFC3339)
	// Interface traffic counters.
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	interfaces, err := withTimeout[[]device.InterfaceStats](cmdCtx, func() ([]device.InterfaceStats, error) {
		return device.CollectInterfaces(client)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect interface metrics", "device_id", dev.ID, "error", err)
	}
	if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
		DeviceID:    dev.ID,
		TenantID:    dev.TenantID,
		CollectedAt: collectedAt,
		Type:        "interfaces",
		Interfaces:  interfaces,
	}); pubErr != nil {
		slog.Warn("failed to publish interface metrics", "device_id", dev.ID, "error", pubErr)
		observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
	} else {
		observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
	}
	// System health (CPU, memory, disk, temperature).
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	health, err := withTimeout[device.HealthMetrics](cmdCtx, func() (device.HealthMetrics, error) {
		return device.CollectHealth(client, info)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect health metrics", "device_id", dev.ID, "error", err)
	}
	if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
		DeviceID:    dev.ID,
		TenantID:    dev.TenantID,
		CollectedAt: collectedAt,
		Type:        "health",
		Health:      &health,
	}); pubErr != nil {
		slog.Warn("failed to publish health metrics", "device_id", dev.ID, "error", pubErr)
		observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
	} else {
		observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
	}
	// Wireless client stats (only publish if the device has wireless interfaces).
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	wireless, err := withTimeout[[]device.WirelessStats](cmdCtx, func() ([]device.WirelessStats, error) {
		return device.CollectWireless(client, info.MajorVersion)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect wireless metrics", "device_id", dev.ID, "error", err)
	}
	if len(wireless) > 0 {
		if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
			DeviceID:    dev.ID,
			TenantID:    dev.TenantID,
			CollectedAt: collectedAt,
			Type:        "wireless",
			Wireless:    wireless,
		}); pubErr != nil {
			slog.Warn("failed to publish wireless metrics", "device_id", dev.ID, "error", pubErr)
			observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
		} else {
			observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
		}
	}
	// =========================================================================
	// FIRMWARE CHECK (rate-limited to once per day per device)
	// Checks if a firmware update is available and publishes the result.
	// Uses a Redis key with 24h TTL to ensure we don't hammer devices every 60s.
	// =========================================================================
	if redisClientForFirmware != nil {
		fwCacheKey := fmt.Sprintf("firmware:checked:%s", dev.ID)
		// Best-effort: on a Redis error we simply re-check firmware this cycle.
		exists, _ := redisClientForFirmware.Exists(ctx, fwCacheKey).Result()
		if exists == 0 {
			cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
			fwInfo, fwErr := withTimeout[device.FirmwareInfo](cmdCtx, func() (device.FirmwareInfo, error) {
				return device.CheckFirmwareUpdate(client)
			})
			cmdCancel()
			if fwErr != nil {
				slog.Warn("firmware check failed", "device_id", dev.ID, "error", fwErr)
				// Set cooldown on failure too, but shorter (6h) so we retry sooner than success (24h).
				// Prevents hammering devices that can't reach MikroTik update servers every poll cycle.
				fwFailKey := fmt.Sprintf("firmware:check-failed:%s", dev.ID)
				redisClientForFirmware.Set(ctx, fwFailKey, "1", 6*time.Hour)
				// Also set the main checked key to prevent the success path from re-checking.
				redisClientForFirmware.Set(ctx, fwCacheKey, "1", 6*time.Hour)
			} else {
				fwEvent := bus.DeviceFirmwareEvent{
					DeviceID:         dev.ID,
					TenantID:         dev.TenantID,
					InstalledVersion: fwInfo.InstalledVersion,
					LatestVersion:    fwInfo.LatestVersion,
					Channel:          fwInfo.Channel,
					Status:           fwInfo.Status,
					Architecture:     fwInfo.Architecture,
				}
				if pubErr := pub.PublishFirmware(ctx, fwEvent); pubErr != nil {
					slog.Warn("failed to publish firmware event", "device_id", dev.ID, "error", pubErr)
					observability.NATSPublishTotal.WithLabelValues("firmware", "error").Inc()
				} else {
					observability.NATSPublishTotal.WithLabelValues("firmware", "success").Inc()
					// Set Redis key with 24h TTL — firmware checked for today.
					// If the check succeeded but status is "check-failed",
					// use shorter cooldown since the device couldn't reach update servers.
					if fwInfo.Status == "check-failed" {
						redisClientForFirmware.Set(ctx, fwCacheKey, "1", 6*time.Hour)
					} else {
						redisClientForFirmware.Set(ctx, fwCacheKey, "1", 24*time.Hour)
					}
					slog.Info("firmware check published",
						"device_id", dev.ID,
						"installed", fwInfo.InstalledVersion,
						"latest", fwInfo.LatestVersion,
						"channel", fwInfo.Channel,
					)
				}
			}
		}
	}
	return nil
}