feat: The Other Dude v9.0.1 — full-featured email system
ci: add GitHub Pages deployment workflow for docs site Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
9
poller/.dockerignore
Normal file
9
poller/.dockerignore
Normal file
@@ -0,0 +1,9 @@
|
||||
# Git
|
||||
.git
|
||||
.gitignore
|
||||
|
||||
# Go build artifacts
|
||||
vendor/
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
7
poller/.gitignore
vendored
Normal file
7
poller/.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
# Go build output
|
||||
/poller
|
||||
/cmd/poller/poller
|
||||
|
||||
# Test binaries
|
||||
*.test
|
||||
*.out
|
||||
17
poller/Dockerfile
Normal file
17
poller/Dockerfile
Normal file
@@ -0,0 +1,17 @@
|
||||
# ---- Build stage -----------------------------------------------------------
FROM golang:1.24-alpine AS builder

WORKDIR /build

# Copy the module files first so the dependency download is cached as its own
# layer and only re-runs when go.mod/go.sum change.
COPY go.mod go.sum ./

RUN go mod download

COPY . .

# GOMAXPROCS=1 limits the Go compiler to one OS thread during the Docker build.
# Without this, go build spawns workers proportional to GOMAXPROCS (defaults to
# the host CPU count), which combined with the parallel Node and Python builds
# can saturate all cores and spike RAM on a 2-core / 2-4 GB server.
# CGO_ENABLED=0 produces a static binary so it runs on plain Alpine below.
RUN CGO_ENABLED=0 GOOS=linux GOMAXPROCS=1 go build -o /poller ./cmd/poller

# ---- Runtime stage ---------------------------------------------------------
FROM alpine:3.21

# ca-certificates: outbound TLS (RouterOS API over 8729, OpenBao, NATS).
# iproute2: provides the `ip route` command used by docker-entrypoint.sh.
RUN apk add --no-cache ca-certificates iproute2

COPY --from=builder /poller /usr/local/bin/poller

COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh

RUN chmod +x /usr/local/bin/docker-entrypoint.sh

ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
|
||||
231
poller/cmd/poller/main.go
Normal file
231
poller/cmd/poller/main.go
Normal file
@@ -0,0 +1,231 @@
|
||||
// Command poller is the MikroTik device polling microservice.
|
||||
//
|
||||
// It connects to RouterOS devices via the binary API (port 8729 TLS), detects
|
||||
// their online/offline status and version, and publishes events to NATS JetStream.
|
||||
// It uses Redis distributed locking to prevent duplicate polls when running as
|
||||
// multiple replicas.
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/bsm/redislock"
|
||||
"github.com/redis/go-redis/v9"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/bus"
|
||||
"github.com/mikrotik-portal/poller/internal/config"
|
||||
"github.com/mikrotik-portal/poller/internal/observability"
|
||||
"github.com/mikrotik-portal/poller/internal/poller"
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// -----------------------------------------------------------------------
|
||||
// Structured logging setup (log/slog, JSON for production)
|
||||
// -----------------------------------------------------------------------
|
||||
slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo, // overridden below once config is loaded
|
||||
}).WithAttrs([]slog.Attr{
|
||||
slog.String("service", "poller"),
|
||||
})))
|
||||
|
||||
slog.Info("mikrotik poller starting")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Load configuration from environment
|
||||
// -----------------------------------------------------------------------
|
||||
cfg, err := config.Load()
|
||||
if err != nil {
|
||||
slog.Error("failed to load configuration", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Apply configured log level.
|
||||
var logLevel slog.Level
|
||||
switch cfg.LogLevel {
|
||||
case "debug":
|
||||
logLevel = slog.LevelDebug
|
||||
case "warn":
|
||||
logLevel = slog.LevelWarn
|
||||
case "error":
|
||||
logLevel = slog.LevelError
|
||||
default:
|
||||
logLevel = slog.LevelInfo
|
||||
}
|
||||
hostname, _ := os.Hostname()
|
||||
slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||
Level: logLevel,
|
||||
}).WithAttrs([]slog.Attr{
|
||||
slog.String("service", "poller"),
|
||||
slog.String("instance", hostname),
|
||||
})))
|
||||
|
||||
slog.Info("configuration loaded",
|
||||
"poll_interval_s", cfg.PollIntervalSeconds,
|
||||
"device_refresh_s", cfg.DeviceRefreshSeconds,
|
||||
"connection_timeout_s", cfg.ConnectionTimeoutSeconds,
|
||||
"log_level", cfg.LogLevel,
|
||||
)
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Context with graceful shutdown on SIGINT/SIGTERM
|
||||
// -----------------------------------------------------------------------
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
sigCh := make(chan os.Signal, 1)
|
||||
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||||
go func() {
|
||||
sig := <-sigCh
|
||||
slog.Info("received shutdown signal", "signal", sig.String())
|
||||
cancel()
|
||||
}()
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Initialize PostgreSQL device store
|
||||
// -----------------------------------------------------------------------
|
||||
deviceStore, err := store.NewDeviceStore(ctx, cfg.DatabaseURL)
|
||||
if err != nil {
|
||||
slog.Error("failed to connect to database", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer deviceStore.Close()
|
||||
|
||||
slog.Info("connected to PostgreSQL")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Initialize Redis client and distributed locker
|
||||
// -----------------------------------------------------------------------
|
||||
redisOpts, err := redis.ParseURL(cfg.RedisURL)
|
||||
if err != nil {
|
||||
slog.Error("invalid REDIS_URL", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
redisClient := redis.NewClient(redisOpts)
|
||||
defer redisClient.Close()
|
||||
|
||||
// Verify Redis connectivity.
|
||||
if err := redisClient.Ping(ctx).Err(); err != nil {
|
||||
slog.Error("failed to connect to Redis", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
slog.Info("connected to Redis")
|
||||
|
||||
locker := redislock.New(redisClient)
|
||||
|
||||
// Make Redis client available to the poller for firmware check rate limiting.
|
||||
poller.SetRedisClient(redisClient)
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Initialize credential cache (OpenBao Transit + legacy fallback)
|
||||
// -----------------------------------------------------------------------
|
||||
var transitClient *vault.TransitClient
|
||||
if cfg.OpenBaoAddr != "" {
|
||||
transitClient = vault.NewTransitClient(cfg.OpenBaoAddr, cfg.OpenBaoToken)
|
||||
slog.Info("OpenBao Transit client initialized", "addr", cfg.OpenBaoAddr)
|
||||
}
|
||||
|
||||
credentialCache := vault.NewCredentialCache(
|
||||
1024, // max 1024 cached credentials
|
||||
5*time.Minute, // 5-minute TTL
|
||||
transitClient, // nil if OpenBao not configured
|
||||
cfg.CredentialEncryptionKey, // nil if legacy key not set
|
||||
deviceStore.Pool(), // for key_access_log inserts
|
||||
)
|
||||
slog.Info("credential cache initialized", "max_size", 1024, "ttl", "5m")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Initialize NATS JetStream publisher
|
||||
// -----------------------------------------------------------------------
|
||||
publisher, err := bus.NewPublisher(cfg.NatsURL)
|
||||
if err != nil {
|
||||
slog.Error("failed to connect to NATS", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer publisher.Close()
|
||||
|
||||
slog.Info("connected to NATS JetStream")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Initialize NATS command responder for interactive device commands
|
||||
// -----------------------------------------------------------------------
|
||||
cmdResponder := bus.NewCmdResponder(publisher.Conn(), deviceStore, credentialCache)
|
||||
if err := cmdResponder.Start(); err != nil {
|
||||
slog.Error("failed to start command responder", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer cmdResponder.Stop()
|
||||
slog.Info("NATS command responder started (device.cmd.*)")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Initialize NATS cert deploy responder for certificate deployment
|
||||
// -----------------------------------------------------------------------
|
||||
certDeployResponder := bus.NewCertDeployResponder(publisher.Conn(), deviceStore, credentialCache)
|
||||
if err := certDeployResponder.Start(); err != nil {
|
||||
slog.Error("failed to start cert deploy responder", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer certDeployResponder.Stop()
|
||||
slog.Info("NATS cert deploy responder started (cert.deploy.*)")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Initialize NATS credential change subscriber for cache invalidation
|
||||
// -----------------------------------------------------------------------
|
||||
credentialSub := bus.NewCredentialSubscriber(publisher.Conn(), credentialCache)
|
||||
if err := credentialSub.Start(); err != nil {
|
||||
slog.Error("failed to start credential subscriber", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer credentialSub.Stop()
|
||||
slog.Info("NATS credential subscriber started (device.credential_changed.>)")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Start observability HTTP server (Prometheus metrics + health endpoint)
|
||||
// -----------------------------------------------------------------------
|
||||
observability.StartServer(ctx, ":9091")
|
||||
slog.Info("observability server started", "addr", ":9091")
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Start the device scheduler
|
||||
// -----------------------------------------------------------------------
|
||||
pollInterval := time.Duration(cfg.PollIntervalSeconds) * time.Second
|
||||
connTimeout := time.Duration(cfg.ConnectionTimeoutSeconds) * time.Second
|
||||
cmdTimeout := time.Duration(cfg.CommandTimeoutSeconds) * time.Second
|
||||
refreshPeriod := time.Duration(cfg.DeviceRefreshSeconds) * time.Second
|
||||
baseBackoff := time.Duration(cfg.CircuitBreakerBaseBackoffSeconds) * time.Second
|
||||
maxBackoff := time.Duration(cfg.CircuitBreakerMaxBackoffSeconds) * time.Second
|
||||
|
||||
scheduler := poller.NewScheduler(
|
||||
deviceStore,
|
||||
locker,
|
||||
publisher,
|
||||
credentialCache,
|
||||
pollInterval,
|
||||
connTimeout,
|
||||
cmdTimeout,
|
||||
refreshPeriod,
|
||||
cfg.CircuitBreakerMaxFailures,
|
||||
baseBackoff,
|
||||
maxBackoff,
|
||||
)
|
||||
|
||||
slog.Info("starting device scheduler",
|
||||
"poll_interval", pollInterval,
|
||||
"refresh_period", refreshPeriod,
|
||||
"conn_timeout", connTimeout,
|
||||
)
|
||||
|
||||
if err := scheduler.Run(ctx); err != nil {
|
||||
slog.Error("scheduler exited with error", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
slog.Info("poller shutdown complete")
|
||||
}
|
||||
15
poller/docker-entrypoint.sh
Executable file
15
poller/docker-entrypoint.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/sh
# Container entrypoint: optionally install a route to the VPN subnet through
# the wireguard container, then drop privileges and start the poller.

# WIREGUARD_GATEWAY may be an IP address or a hostname (resolved via Docker DNS).
if [ -n "$WIREGUARD_GATEWAY" ]; then
    # Try hosts/DNS resolution first; fall back to using the value verbatim.
    GW_IP=$(getent hosts "$WIREGUARD_GATEWAY" 2>/dev/null | awk '{print $1}')
    [ -z "$GW_IP" ] && GW_IP="$WIREGUARD_GATEWAY"
    # Best effort: the route may already exist, so swallow any error.
    ip route add 10.10.0.0/16 via "$GW_IP" 2>/dev/null || true
    echo "VPN route: 10.10.0.0/16 via $GW_IP ($WIREGUARD_GATEWAY)"
fi

# Drop to the unprivileged 'nobody' user and replace this shell with the poller.
exec su -s /bin/sh nobody -c "/usr/local/bin/poller"
|
||||
92
poller/go.mod
Normal file
92
poller/go.mod
Normal file
@@ -0,0 +1,92 @@
|
||||
module github.com/mikrotik-portal/poller
|
||||
|
||||
go 1.24.0
|
||||
|
||||
require (
|
||||
github.com/bsm/redislock v0.9.4
|
||||
github.com/go-routeros/routeros/v3 v3.0.0
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7
|
||||
github.com/jackc/pgx/v5 v5.7.4
|
||||
github.com/nats-io/nats.go v1.38.0
|
||||
github.com/pkg/sftp v1.13.10
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/redis/go-redis/v9 v9.7.3
|
||||
github.com/stretchr/testify v1.11.1
|
||||
github.com/testcontainers/testcontainers-go v0.40.0
|
||||
github.com/testcontainers/testcontainers-go/modules/nats v0.40.0
|
||||
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0
|
||||
github.com/testcontainers/testcontainers-go/modules/redis v0.40.0
|
||||
golang.org/x/crypto v0.48.0
|
||||
)
|
||||
|
||||
require (
|
||||
dario.cat/mergo v1.0.2 // indirect
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
|
||||
github.com/Microsoft/go-winio v0.6.2 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/containerd/errdefs v1.0.0 // indirect
|
||||
github.com/containerd/errdefs/pkg v0.3.0 // indirect
|
||||
github.com/containerd/log v0.1.0 // indirect
|
||||
github.com/containerd/platforms v0.2.1 // indirect
|
||||
github.com/cpuguy83/dockercfg v0.3.2 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/distribution/reference v0.6.0 // indirect
|
||||
github.com/docker/docker v28.5.1+incompatible // indirect
|
||||
github.com/docker/go-connections v0.6.0 // indirect
|
||||
github.com/docker/go-units v0.5.0 // indirect
|
||||
github.com/ebitengine/purego v0.8.4 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/go-logr/logr v1.4.3 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
github.com/go-ole/go-ole v1.2.6 // indirect
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
|
||||
github.com/jackc/pgpassfile v1.0.0 // indirect
|
||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
|
||||
github.com/jackc/puddle/v2 v2.2.2 // indirect
|
||||
github.com/klauspost/compress v1.18.0 // indirect
|
||||
github.com/kr/fs v0.1.0 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
|
||||
github.com/magiconair/properties v1.8.10 // indirect
|
||||
github.com/mdelapenya/tlscert v0.2.0 // indirect
|
||||
github.com/moby/docker-image-spec v1.3.1 // indirect
|
||||
github.com/moby/go-archive v0.1.0 // indirect
|
||||
github.com/moby/patternmatcher v0.6.0 // indirect
|
||||
github.com/moby/sys/sequential v0.6.0 // indirect
|
||||
github.com/moby/sys/user v0.4.0 // indirect
|
||||
github.com/moby/sys/userns v0.1.0 // indirect
|
||||
github.com/moby/term v0.5.0 // indirect
|
||||
github.com/morikuni/aec v1.0.0 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/nats-io/nkeys v0.4.9 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/opencontainers/go-digest v1.0.0 // indirect
|
||||
github.com/opencontainers/image-spec v1.1.1 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/common v0.66.1 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/shirou/gopsutil/v4 v4.25.6 // indirect
|
||||
github.com/sirupsen/logrus v1.9.3 // indirect
|
||||
github.com/tklauser/go-sysconf v0.3.12 // indirect
|
||||
github.com/tklauser/numcpus v0.6.1 // indirect
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
|
||||
go.opentelemetry.io/otel v1.39.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.39.0 // indirect
|
||||
go.opentelemetry.io/otel/sdk v1.39.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.39.0 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.2 // indirect
|
||||
golang.org/x/sync v0.19.0 // indirect
|
||||
golang.org/x/sys v0.41.0 // indirect
|
||||
golang.org/x/text v0.34.0 // indirect
|
||||
google.golang.org/grpc v1.79.1 // indirect
|
||||
google.golang.org/protobuf v1.36.11 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
227
poller/go.sum
Normal file
227
poller/go.sum
Normal file
@@ -0,0 +1,227 @@
|
||||
dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
|
||||
dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
|
||||
github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk=
|
||||
github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
|
||||
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
|
||||
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
|
||||
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
||||
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
|
||||
github.com/bsm/redislock v0.9.4 h1:X/Wse1DPpiQgHbVYRE9zv6m070UcKoOGekgvpNhiSvw=
|
||||
github.com/bsm/redislock v0.9.4/go.mod h1:Epf7AJLiSFwLCiZcfi6pWFO/8eAYrYpQXFxEDPoDeAk=
|
||||
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
|
||||
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
|
||||
github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
|
||||
github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
|
||||
github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
|
||||
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
|
||||
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
|
||||
github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpSBQv6A=
|
||||
github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw=
|
||||
github.com/cpuguy83/dockercfg v0.3.2 h1:DlJTyZGBDlXqUZ2Dk2Q3xHs/FtnooJJVaad2S9GKorA=
|
||||
github.com/cpuguy83/dockercfg v0.3.2/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc=
|
||||
github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY=
|
||||
github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
|
||||
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
|
||||
github.com/docker/docker v28.5.1+incompatible h1:Bm8DchhSD2J6PsFzxC35TZo4TLGR2PdW/E69rU45NhM=
|
||||
github.com/docker/docker v28.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94=
|
||||
github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE=
|
||||
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
|
||||
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/ebitengine/purego v0.8.4 h1:CF7LEKg5FFOsASUj0+QwaXf8Ht6TlFxg09+S9wz0omw=
|
||||
github.com/ebitengine/purego v0.8.4/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
|
||||
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
|
||||
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
|
||||
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
|
||||
github.com/go-routeros/routeros/v3 v3.0.0 h1:/V4Cgr+wmn3IyyYIXUX1KYK8pA1ADPiwLSlAi912j1M=
|
||||
github.com/go-routeros/routeros/v3 v3.0.0/go.mod h1:j4mq65czXfKtHsdLkgVv8w7sNzyhLZy1TKi2zQDMpiQ=
|
||||
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
|
||||
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
|
||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
|
||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
|
||||
github.com/jackc/pgx/v5 v5.7.4 h1:9wKznZrhWa2QiHL+NjTSPP6yjl3451BX3imWDnokYlg=
|
||||
github.com/jackc/pgx/v5 v5.7.4/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ=
|
||||
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
|
||||
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
|
||||
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
|
||||
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
|
||||
github.com/kr/fs v0.1.0 h1:Jskdu9ieNAYnjxsi0LbQp1ulIKZV1LAFgK1tWhpZgl8=
|
||||
github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
|
||||
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
|
||||
github.com/magiconair/properties v1.8.10 h1:s31yESBquKXCV9a/ScB3ESkOjUYYv+X0rg8SYxI99mE=
|
||||
github.com/magiconair/properties v1.8.10/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0=
|
||||
github.com/mdelapenya/tlscert v0.2.0 h1:7H81W6Z/4weDvZBNOfQte5GpIMo0lGYEeWbkGp5LJHI=
|
||||
github.com/mdelapenya/tlscert v0.2.0/go.mod h1:O4njj3ELLnJjGdkN7M/vIVCpZ+Cf0L6muqOG4tLSl8o=
|
||||
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
|
||||
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
|
||||
github.com/moby/go-archive v0.1.0 h1:Kk/5rdW/g+H8NHdJW2gsXyZ7UnzvJNOy6VKJqueWdcQ=
|
||||
github.com/moby/go-archive v0.1.0/go.mod h1:G9B+YoujNohJmrIYFBpSd54GTUB4lt9S+xVQvsJyFuo=
|
||||
github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk=
|
||||
github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc=
|
||||
github.com/moby/sys/atomicwriter v0.1.0 h1:kw5D/EqkBwsBFi0ss9v1VG3wIkVhzGvLklJ+w3A14Sw=
|
||||
github.com/moby/sys/atomicwriter v0.1.0/go.mod h1:Ul8oqv2ZMNHOceF643P6FKPXeCmYtlQMvpizfsSoaWs=
|
||||
github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU=
|
||||
github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko=
|
||||
github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs=
|
||||
github.com/moby/sys/user v0.4.0/go.mod h1:bG+tYYYJgaMtRKgEmuueC0hJEAZWwtIbZTB+85uoHjs=
|
||||
github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g=
|
||||
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
|
||||
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
|
||||
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
|
||||
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
|
||||
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/nats-io/nats.go v1.38.0 h1:A7P+g7Wjp4/NWqDOOP/K6hfhr54DvdDQUznt5JFg9XA=
|
||||
github.com/nats-io/nats.go v1.38.0/go.mod h1:IGUM++TwokGnXPs82/wCuiHS02/aKrdYUQkU8If6yjw=
|
||||
github.com/nats-io/nkeys v0.4.9 h1:qe9Faq2Gxwi6RZnZMXfmGMZkg3afLLOtrU+gDZJ35b0=
|
||||
github.com/nats-io/nkeys v0.4.9/go.mod h1:jcMqs+FLG+W5YO36OX6wFIFcmpdAns+w1Wm6D3I/evE=
|
||||
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
|
||||
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
|
||||
github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
|
||||
github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pkg/sftp v1.13.10 h1:+5FbKNTe5Z9aspU88DPIKJ9z2KZoaGCu6Sr6kKR/5mU=
|
||||
github.com/pkg/sftp v1.13.10/go.mod h1:bJ1a7uDhrX/4OII+agvy28lzRvQrmIQuaHrcI1HbeGA=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
|
||||
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
|
||||
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
|
||||
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
|
||||
github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM=
|
||||
github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA=
|
||||
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
|
||||
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
|
||||
github.com/shirou/gopsutil/v4 v4.25.6 h1:kLysI2JsKorfaFPcYmcJqbzROzsBWEOAtw6A7dIfqXs=
|
||||
github.com/shirou/gopsutil/v4 v4.25.6/go.mod h1:PfybzyydfZcN+JMMjkF6Zb8Mq1A/VcogFFg7hj50W9c=
|
||||
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
|
||||
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/testcontainers/testcontainers-go v0.40.0 h1:pSdJYLOVgLE8YdUY2FHQ1Fxu+aMnb6JfVz1mxk7OeMU=
|
||||
github.com/testcontainers/testcontainers-go v0.40.0/go.mod h1:FSXV5KQtX2HAMlm7U3APNyLkkap35zNLxukw9oBi/MY=
|
||||
github.com/testcontainers/testcontainers-go/modules/nats v0.40.0 h1:IfMgeVI7Mg7CIu0R9N0c85XYMjai7e4OCCmHvkmG6Hg=
|
||||
github.com/testcontainers/testcontainers-go/modules/nats v0.40.0/go.mod h1:HpKiTohLxK5QGdCkF0W57nEUDzOR5aZsazH1uo8nqso=
|
||||
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0 h1:s2bIayFXlbDFexo96y+htn7FzuhpXLYJNnIuglNKqOk=
|
||||
github.com/testcontainers/testcontainers-go/modules/postgres v0.40.0/go.mod h1:h+u/2KoREGTnTl9UwrQ/g+XhasAT8E6dClclAADeXoQ=
|
||||
github.com/testcontainers/testcontainers-go/modules/redis v0.40.0 h1:OG4qwcxp2O0re7V7M9lY9w0v6wWgWf7j7rtkpAnGMd0=
|
||||
github.com/testcontainers/testcontainers-go/modules/redis v0.40.0/go.mod h1:Bc+EDhKMo5zI5V5zdBkHiMVzeAXbtI4n5isS/nzf6zw=
|
||||
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=
|
||||
github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
|
||||
github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
|
||||
github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY=
|
||||
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
|
||||
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw=
|
||||
go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48=
|
||||
go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 h1:Mne5On7VWdx7omSrSSZvM4Kw7cS7NQkOOmLcgscI51U=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 h1:IeMeyr1aBvBiPVYihXIaeIZba6b8E1bYp7lbdxK8CQg=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0/go.mod h1:oVdCUtjq9MK9BlS7TtucsQwUcXcymNiEDjgDD2jMtZU=
|
||||
go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0=
|
||||
go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs=
|
||||
go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18=
|
||||
go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE=
|
||||
go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI=
|
||||
go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA=
|
||||
go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I=
|
||||
go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
|
||||
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
|
||||
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
|
||||
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
|
||||
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
|
||||
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k=
|
||||
golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg=
|
||||
golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM=
|
||||
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
|
||||
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
|
||||
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44=
|
||||
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
|
||||
google.golang.org/grpc v1.79.1 h1:zGhSi45ODB9/p3VAawt9a+O/MULLl9dpizzNNpq7flY=
|
||||
google.golang.org/grpc v1.79.1/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
|
||||
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q=
|
||||
gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA=
|
||||
182
poller/internal/bus/cmd_cert_deploy.go
Normal file
182
poller/internal/bus/cmd_cert_deploy.go
Normal file
@@ -0,0 +1,182 @@
|
||||
// Package bus provides a NATS request-reply handler for certificate deployment.
|
||||
//
|
||||
// cmd_cert_deploy.go handles cert.deploy.{device_id} subjects. The Python backend
|
||||
// sends signed certificate PEM data via NATS, and this handler:
|
||||
// 1. Looks up the device and decrypts credentials
|
||||
// 2. Establishes SSH/SFTP + RouterOS API connections
|
||||
// 3. Calls device.DeployCert for the full deployment flow
|
||||
// 4. Returns the result via NATS reply
|
||||
package bus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
// CertDeployResponder handles NATS request-reply for certificate deployment.
type CertDeployResponder struct {
	nc              *nats.Conn             // core NATS connection used for request-reply
	store           *store.DeviceStore     // device lookups (address, ports, encrypted credentials)
	credentialCache *vault.CredentialCache // decrypts and caches device credentials
	sub             *nats.Subscription     // active cert.deploy.* subscription; set by Start, released by Stop
}
|
||||
|
||||
// NewCertDeployResponder creates a certificate deployment responder using the
|
||||
// given NATS connection, device store, and credential cache.
|
||||
func NewCertDeployResponder(nc *nats.Conn, store *store.DeviceStore, credentialCache *vault.CredentialCache) *CertDeployResponder {
|
||||
return &CertDeployResponder{nc: nc, store: store, credentialCache: credentialCache}
|
||||
}
|
||||
|
||||
// Start subscribes to "cert.deploy.*" with a queue group for load balancing
|
||||
// across multiple poller instances.
|
||||
func (r *CertDeployResponder) Start() error {
|
||||
sub, err := r.nc.QueueSubscribe("cert.deploy.*", "cert-deploy-workers", r.handleRequest)
|
||||
if err != nil {
|
||||
return fmt.Errorf("subscribing to cert.deploy.*: %w", err)
|
||||
}
|
||||
r.sub = sub
|
||||
slog.Info("cert deploy responder subscribed", "subject", "cert.deploy.*", "queue", "cert-deploy-workers")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop unsubscribes from NATS.
|
||||
func (r *CertDeployResponder) Stop() {
|
||||
if r.sub != nil {
|
||||
if err := r.sub.Unsubscribe(); err != nil {
|
||||
slog.Warn("error unsubscribing cert deploy responder", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleRequest processes a single certificate deployment request.
//
// Flow: parse the device ID out of the subject and the request out of the
// payload, look up the device, decrypt its credentials, open SSH (for the
// SFTP upload) and RouterOS API connections, run device.DeployCert, and
// reply with the JSON-encoded result. Every failure path answers the
// requester via respondError so the backend never waits on a dropped request.
func (r *CertDeployResponder) handleRequest(msg *nats.Msg) {
	// Extract device ID from subject: cert.deploy.{device_id}
	parts := strings.Split(msg.Subject, ".")
	if len(parts) < 3 {
		r.respondError(msg, "invalid subject format")
		return
	}
	deviceID := parts[2]

	// Parse cert deploy request
	var req device.CertDeployRequest
	if err := json.Unmarshal(msg.Data, &req); err != nil {
		r.respondError(msg, fmt.Sprintf("invalid request JSON: %s", err))
		return
	}

	// Note: logged before the port default below, so ssh_port may read 0 here.
	slog.Info("cert deploy request received",
		"device_id", deviceID,
		"cert_name", req.CertName,
		"ssh_port", req.SSHPort,
	)

	// Default SSH port if not specified
	if req.SSHPort == 0 {
		req.SSHPort = 22
	}

	// Look up device from DB. The 5-second timeout bounds only this query;
	// the SSH and API dials below carry their own timeouts.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	dev, err := r.store.GetDevice(ctx, deviceID)
	if err != nil {
		slog.Warn("device lookup failed for cert deploy", "device_id", deviceID, "error", err)
		r.respondError(msg, fmt.Sprintf("device not found: %s", err))
		return
	}

	// Decrypt device credentials via credential cache (Transit preferred, legacy fallback)
	username, password, err := r.credentialCache.GetCredentials(
		dev.ID,
		dev.TenantID,
		dev.EncryptedCredentialsTransit,
		dev.EncryptedCredentials,
	)
	if err != nil {
		r.respondError(msg, fmt.Sprintf("credential decryption failed: %s", err))
		return
	}

	// Create SSH client for SFTP upload (30s timeout for the dial/handshake).
	sshClient, err := device.NewSSHClient(dev.IPAddress, req.SSHPort, username, password, 30*time.Second)
	if err != nil {
		slog.Warn("SSH connection failed for cert deploy",
			"device_id", deviceID,
			"ip", dev.IPAddress,
			"ssh_port", req.SSHPort,
			"error", err,
		)
		r.respondError(msg, fmt.Sprintf("SSH connection failed: %s", err))
		return
	}
	defer sshClient.Close()

	// Create RouterOS API client for certificate import commands.
	// Uses the existing ConnectDevice which tries TLS then falls back to plain.
	// Pass nil for caCertPEM -- we're deploying the cert, so the device doesn't
	// have a portal-signed cert yet. Plan 03 wires per-device CA cert loading.
	apiClient, err := device.ConnectDevice(
		dev.IPAddress,
		dev.APISSLPort,
		dev.APIPort,
		username,
		password,
		10*time.Second,
		nil, // caCertPEM: device has no portal cert yet during deployment
		dev.TLSMode,
	)
	if err != nil {
		slog.Warn("API connection failed for cert deploy",
			"device_id", deviceID,
			"ip", dev.IPAddress,
			"error", err,
		)
		r.respondError(msg, fmt.Sprintf("device API connection failed: %s", err))
		return
	}
	defer device.CloseDevice(apiClient)

	// Execute the full deployment flow using both connections.
	resp := device.DeployCert(sshClient, apiClient, req)

	slog.Info("cert deploy completed",
		"device_id", deviceID,
		"success", resp.Success,
		"cert_name_on_device", resp.CertNameOnDevice,
	)

	// Respond with result
	data, err := json.Marshal(resp)
	if err != nil {
		r.respondError(msg, fmt.Sprintf("failed to marshal response: %s", err))
		return
	}

	if err := msg.Respond(data); err != nil {
		slog.Error("failed to respond to cert deploy request", "error", err)
	}
}
|
||||
|
||||
// respondError sends an error response to a NATS cert deploy request.
|
||||
func (r *CertDeployResponder) respondError(msg *nats.Msg, errMsg string) {
|
||||
resp := device.CertDeployResponse{
|
||||
Success: false,
|
||||
Error: errMsg,
|
||||
}
|
||||
data, _ := json.Marshal(resp)
|
||||
if err := msg.Respond(data); err != nil {
|
||||
slog.Error("failed to respond with cert deploy error", "error", err)
|
||||
}
|
||||
}
|
||||
166
poller/internal/bus/cmd_responder.go
Normal file
166
poller/internal/bus/cmd_responder.go
Normal file
@@ -0,0 +1,166 @@
|
||||
// Package bus provides NATS messaging for the poller service.
|
||||
//
|
||||
// cmd_responder.go implements a NATS request-reply handler for interactive
|
||||
// RouterOS device commands. The Python backend sends command requests to
|
||||
// "device.cmd.{device_id}" and receives structured responses.
|
||||
|
||||
package bus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
// CmdResponder handles NATS request-reply for device commands.
type CmdResponder struct {
	nc              *nats.Conn             // core NATS connection used for request-reply
	store           *store.DeviceStore     // device lookups (address, ports, encrypted credentials)
	credentialCache *vault.CredentialCache // decrypts and caches device credentials
	sub             *nats.Subscription     // active device.cmd.* subscription; set by Start, released by Stop
}
|
||||
|
||||
// NewCmdResponder creates a command responder using the given NATS connection,
|
||||
// device store, and credential cache.
|
||||
func NewCmdResponder(nc *nats.Conn, store *store.DeviceStore, credentialCache *vault.CredentialCache) *CmdResponder {
|
||||
return &CmdResponder{nc: nc, store: store, credentialCache: credentialCache}
|
||||
}
|
||||
|
||||
// Start subscribes to "device.cmd.*" with a queue group for load balancing
|
||||
// across multiple poller instances.
|
||||
func (r *CmdResponder) Start() error {
|
||||
sub, err := r.nc.QueueSubscribe("device.cmd.*", "cmd-workers", r.handleRequest)
|
||||
if err != nil {
|
||||
return fmt.Errorf("subscribing to device.cmd.*: %w", err)
|
||||
}
|
||||
r.sub = sub
|
||||
slog.Info("command responder subscribed", "subject", "device.cmd.*", "queue", "cmd-workers")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop unsubscribes from NATS.
|
||||
func (r *CmdResponder) Stop() {
|
||||
if r.sub != nil {
|
||||
if err := r.sub.Unsubscribe(); err != nil {
|
||||
slog.Warn("error unsubscribing command responder", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleRequest processes a single device command request.
//
// Flow: parse the device ID out of the subject and the command out of the
// payload, look up the device, decrypt its credentials, connect via the
// RouterOS API, run device.ExecuteCommand, and reply with the JSON-encoded
// result. Every failure path answers the requester via respondError so the
// backend never waits on a dropped request.
func (r *CmdResponder) handleRequest(msg *nats.Msg) {
	// Extract device ID from subject: device.cmd.{device_id}
	parts := strings.Split(msg.Subject, ".")
	if len(parts) < 3 {
		r.respondError(msg, "invalid subject format")
		return
	}
	deviceID := parts[2]

	// Parse command request
	var req device.CommandRequest
	if err := json.Unmarshal(msg.Data, &req); err != nil {
		r.respondError(msg, fmt.Sprintf("invalid request JSON: %s", err))
		return
	}

	slog.Debug("command request received",
		"device_id", deviceID,
		"command", req.Command,
		"args_count", len(req.Args),
	)

	// Look up device from DB. The 5-second timeout bounds only this query;
	// the device dial below carries its own 10-second timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	dev, err := r.store.GetDevice(ctx, deviceID)
	if err != nil {
		slog.Warn("device lookup failed for command", "device_id", deviceID, "error", err)
		r.respondError(msg, fmt.Sprintf("device not found: %s", err))
		return
	}

	// Decrypt credentials via credential cache (Transit preferred, legacy fallback)
	username, password, err := r.credentialCache.GetCredentials(
		dev.ID,
		dev.TenantID,
		dev.EncryptedCredentialsTransit,
		dev.EncryptedCredentials,
	)
	if err != nil {
		r.respondError(msg, fmt.Sprintf("credential decryption failed: %s", err))
		return
	}

	// Prepare CA cert PEM for TLS verification (only populated for portal_ca devices).
	var caCertPEM []byte
	if dev.CACertPEM != nil {
		caCertPEM = []byte(*dev.CACertPEM)
	}

	// Connect to device with 10-second timeout
	client, err := device.ConnectDevice(
		dev.IPAddress,
		dev.APISSLPort,
		dev.APIPort,
		username,
		password,
		10*time.Second,
		caCertPEM,
		dev.TLSMode,
	)
	if err != nil {
		slog.Info("device connection failed for command",
			"device_id", deviceID,
			"ip", dev.IPAddress,
			"error", err,
		)
		r.respondError(msg, fmt.Sprintf("device connection failed: %s", err))
		return
	}
	defer device.CloseDevice(client)

	// Execute the command
	resp := device.ExecuteCommand(client, req.Command, req.Args)

	slog.Debug("command executed",
		"device_id", deviceID,
		"command", req.Command,
		"success", resp.Success,
		"result_count", len(resp.Data),
	)

	// Respond
	data, err := json.Marshal(resp)
	if err != nil {
		r.respondError(msg, fmt.Sprintf("failed to marshal response: %s", err))
		return
	}

	if err := msg.Respond(data); err != nil {
		slog.Error("failed to respond to command request", "error", err)
	}
}
|
||||
|
||||
// respondError sends an error response to a NATS request.
|
||||
func (r *CmdResponder) respondError(msg *nats.Msg, errMsg string) {
|
||||
resp := device.CommandResponse{
|
||||
Success: false,
|
||||
Data: nil,
|
||||
Error: errMsg,
|
||||
}
|
||||
data, _ := json.Marshal(resp)
|
||||
if err := msg.Respond(data); err != nil {
|
||||
slog.Error("failed to respond with error", "error", err)
|
||||
}
|
||||
}
|
||||
75
poller/internal/bus/credential_subscriber.go
Normal file
75
poller/internal/bus/credential_subscriber.go
Normal file
@@ -0,0 +1,75 @@
|
||||
// Package bus provides NATS messaging for the poller service.
|
||||
//
|
||||
// credential_subscriber.go subscribes to device.credential_changed.> events
|
||||
// and invalidates the credential cache so the poller uses fresh credentials
|
||||
// on the next poll cycle instead of waiting for the 5-minute cache TTL.
|
||||
package bus
|
||||
|
||||
import (
	"encoding/json"
	"fmt"
	"log/slog"

	"github.com/nats-io/nats.go"

	"github.com/mikrotik-portal/poller/internal/vault"
)
|
||||
|
||||
// CredentialSubscriber listens for credential change events and invalidates
// the credential cache. This ensures the poller picks up new credentials
// within seconds of a change rather than waiting for the 5-minute TTL.
type CredentialSubscriber struct {
	nc              *nats.Conn             // core NATS connection for the subscription
	credentialCache *vault.CredentialCache // cache whose entries are invalidated on change events
	sub             *nats.Subscription     // active subscription; set by Start, released by Stop
}
|
||||
|
||||
// NewCredentialSubscriber creates a subscriber that invalidates cached
|
||||
// credentials when the backend publishes credential_changed events.
|
||||
func NewCredentialSubscriber(nc *nats.Conn, credentialCache *vault.CredentialCache) *CredentialSubscriber {
|
||||
return &CredentialSubscriber{nc: nc, credentialCache: credentialCache}
|
||||
}
|
||||
|
||||
// Start subscribes to "device.credential_changed.>" with a queue group
|
||||
// so only one poller instance processes each event.
|
||||
func (s *CredentialSubscriber) Start() error {
|
||||
sub, err := s.nc.QueueSubscribe("device.credential_changed.>", "credential-invalidators", s.handleEvent)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.sub = sub
|
||||
slog.Info("credential subscriber started", "subject", "device.credential_changed.>", "queue", "credential-invalidators")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop unsubscribes from NATS.
|
||||
func (s *CredentialSubscriber) Stop() {
|
||||
if s.sub != nil {
|
||||
if err := s.sub.Unsubscribe(); err != nil {
|
||||
slog.Warn("error unsubscribing credential subscriber", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleEvent processes a credential_changed event by invalidating the
|
||||
// device's entry in the credential cache.
|
||||
func (s *CredentialSubscriber) handleEvent(msg *nats.Msg) {
|
||||
var event struct {
|
||||
DeviceID string `json:"device_id"`
|
||||
TenantID string `json:"tenant_id"`
|
||||
}
|
||||
if err := json.Unmarshal(msg.Data, &event); err != nil {
|
||||
slog.Warn("failed to unmarshal credential_changed event", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
if event.DeviceID == "" {
|
||||
slog.Warn("credential_changed event missing device_id")
|
||||
return
|
||||
}
|
||||
|
||||
s.credentialCache.Invalidate(event.DeviceID)
|
||||
slog.Info("credential cache invalidated",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
)
|
||||
}
|
||||
322
poller/internal/bus/publisher.go
Normal file
322
poller/internal/bus/publisher.go
Normal file
@@ -0,0 +1,322 @@
|
||||
// Package bus provides NATS JetStream publishing for device events.
|
||||
package bus
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
)
|
||||
|
||||
// DeviceStatusEvent is the payload published to NATS JetStream when a device
// is polled. Consumers subscribe to "device.status.>" to receive all events.
//
// Fields tagged omitempty are dropped from the JSON payload when zero-valued.
type DeviceStatusEvent struct {
	DeviceID        string `json:"device_id"`
	TenantID        string `json:"tenant_id"`
	Status          string `json:"status"` // "online" or "offline"
	RouterOSVersion string `json:"routeros_version,omitempty"`
	MajorVersion    int    `json:"major_version,omitempty"`
	BoardName       string `json:"board_name,omitempty"`
	Architecture    string `json:"architecture,omitempty"`
	Uptime          string `json:"uptime,omitempty"`
	CPULoad         string `json:"cpu_load,omitempty"`
	FreeMemory      string `json:"free_memory,omitempty"`
	TotalMemory     string `json:"total_memory,omitempty"`
	SerialNumber    string `json:"serial_number,omitempty"`
	FirmwareVersion string `json:"firmware_version,omitempty"`
	LastSeen        string `json:"last_seen"` // RFC3339
}
|
||||
|
||||
// DeviceMetricsEvent is the payload published to NATS JetStream for metric data
// collected from a RouterOS device on each poll cycle.
//
// Events are published to "device.metrics.{type}.{device_id}" where type is one
// of "health", "interfaces", or "wireless". Only the field matching the type will
// be populated; the others will be omitted from the JSON payload.
type DeviceMetricsEvent struct {
	DeviceID    string                  `json:"device_id"`
	TenantID    string                  `json:"tenant_id"`
	CollectedAt string                  `json:"collected_at"` // RFC3339
	Type        string                  `json:"type"`         // "health", "interfaces", "wireless"; also forms the subject
	Health      *device.HealthMetrics   `json:"health,omitempty"`
	Interfaces  []device.InterfaceStats `json:"interfaces,omitempty"`
	Wireless    []device.WirelessStats  `json:"wireless,omitempty"`
}
|
||||
|
||||
// ConfigChangedEvent is published when a device's config changes out-of-band.
//
// Published to "config.changed.{TenantID}.{DeviceID}" by PublishConfigChanged.
type ConfigChangedEvent struct {
	DeviceID     string `json:"device_id"`
	TenantID     string `json:"tenant_id"`
	OldTimestamp string `json:"old_timestamp"` // config timestamp seen before the change
	NewTimestamp string `json:"new_timestamp"` // config timestamp observed on this poll
}
|
||||
|
||||
// PushRollbackEvent triggers automatic rollback for template pushes.
//
// Published to "config.push.rollback.{TenantID}.{DeviceID}" by
// PublishPushRollback when a device goes offline after a push.
type PushRollbackEvent struct {
	DeviceID         string `json:"device_id"`
	TenantID         string `json:"tenant_id"`
	PushOperationID  string `json:"push_operation_id"`   // identifies the push being rolled back
	PrePushCommitSHA string `json:"pre_push_commit_sha"` // config revision to restore
}
|
||||
|
||||
// PushAlertEvent triggers an alert for editor pushes (one-click rollback).
//
// Published to the "config.push.alert.>" subject family (see the
// DEVICE_EVENTS stream configuration in NewPublisher).
type PushAlertEvent struct {
	DeviceID string `json:"device_id"`
	TenantID string `json:"tenant_id"`
	PushType string `json:"push_type"`
}
|
||||
|
||||
// Publisher wraps a NATS JetStream connection for publishing device events.
type Publisher struct {
	nc *nats.Conn          // underlying NATS connection (owned by the Publisher)
	js jetstream.JetStream // JetStream context used by all Publish* methods
}
|
||||
|
||||
// NewPublisher connects to NATS and ensures the DEVICE_EVENTS stream exists.
//
// The DEVICE_EVENTS stream covers device.status.>, device.metrics.>, and
// device.firmware.> subjects. These are explicit to avoid capturing
// device.cmd.* (used by CmdResponder for request-reply). This allows
// the Python API to subscribe to either family via durable consumers.
//
// The connection uses unlimited reconnects with a 2-second wait between attempts
// so the poller survives transient NATS restarts gracefully.
//
// On any failure after the connection is established, nc is closed before
// returning so the connection never leaks.
func NewPublisher(natsURL string) (*Publisher, error) {
	nc, err := nats.Connect(natsURL,
		nats.MaxReconnects(-1),
		nats.ReconnectWait(2*time.Second),
		nats.DisconnectErrHandler(func(nc *nats.Conn, err error) {
			slog.Warn("NATS disconnected", "error", err)
		}),
		nats.ReconnectHandler(func(nc *nats.Conn) {
			slog.Info("NATS reconnected", "url", nc.ConnectedUrl())
		}),
	)
	if err != nil {
		return nil, fmt.Errorf("connecting to NATS at %s: %w", natsURL, err)
	}

	js, err := jetstream.New(nc)
	if err != nil {
		nc.Close()
		return nil, fmt.Errorf("creating JetStream context: %w", err)
	}

	// Ensure the DEVICE_EVENTS stream exists. CreateOrUpdateStream is idempotent.
	// Subjects are explicit (not "device.>") to avoid capturing device.cmd.*
	// which is used by CmdResponder for core NATS request-reply.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	_, err = js.CreateOrUpdateStream(ctx, jetstream.StreamConfig{
		Name: "DEVICE_EVENTS",
		Subjects: []string{
			"device.status.>",
			"device.metrics.>",
			"device.firmware.>",
			"device.credential_changed.>",
			"config.changed.>",
			"config.push.rollback.>",
			"config.push.alert.>",
		},
		// Events older than a day are dropped; consumers are expected to keep up.
		MaxAge: 24 * time.Hour,
	})
	if err != nil {
		nc.Close()
		return nil, fmt.Errorf("ensuring DEVICE_EVENTS stream: %w", err)
	}

	slog.Info("NATS JetStream DEVICE_EVENTS stream ready")

	return &Publisher{nc: nc, js: js}, nil
}
|
||||
|
||||
// PublishStatus publishes a device status event to NATS JetStream.
|
||||
//
|
||||
// Events are published to "device.status.{DeviceID}" so consumers can subscribe
|
||||
// to individual devices or all events via "device.status.>".
|
||||
func (p *Publisher) PublishStatus(ctx context.Context, event DeviceStatusEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshalling event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("device.status.%s", event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Debug("published device status event",
|
||||
"device_id", event.DeviceID,
|
||||
"status", event.Status,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishMetrics publishes a device metrics event to NATS JetStream.
|
||||
//
|
||||
// Events are published to "device.metrics.{type}.{device_id}" so consumers can
|
||||
// subscribe to all metrics via "device.metrics.>" or filter by type.
|
||||
func (p *Publisher) PublishMetrics(ctx context.Context, event DeviceMetricsEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshalling metrics event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("device.metrics.%s.%s", event.Type, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Debug("published device metrics event",
|
||||
"device_id", event.DeviceID,
|
||||
"type", event.Type,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeviceFirmwareEvent is the payload published to NATS JetStream when the poller
// checks a device's firmware update status (rate-limited to once per day per device).
type DeviceFirmwareEvent struct {
	DeviceID         string `json:"device_id"`
	TenantID         string `json:"tenant_id"`
	InstalledVersion string `json:"installed_version"`
	LatestVersion    string `json:"latest_version,omitempty"` // omitted when no newer version is known
	Channel          string `json:"channel,omitempty"`
	Status           string `json:"status"`
	Architecture     string `json:"architecture"`
}
|
||||
|
||||
// PublishFirmware publishes a device firmware status event to NATS JetStream.
|
||||
//
|
||||
// Events are published to "device.firmware.{DeviceID}" so the Python firmware
|
||||
// subscriber can process them and update the firmware_versions table.
|
||||
func (p *Publisher) PublishFirmware(ctx context.Context, event DeviceFirmwareEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshalling firmware event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("device.firmware.%s", event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Debug("published device firmware event",
|
||||
"device_id", event.DeviceID,
|
||||
"installed", event.InstalledVersion,
|
||||
"latest", event.LatestVersion,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishConfigChanged publishes a config change event for a device.
|
||||
//
|
||||
// Events are published to "config.changed.{TenantID}.{DeviceID}" so the Python
|
||||
// backend can trigger event-driven backups when out-of-band changes are detected.
|
||||
func (p *Publisher) PublishConfigChanged(ctx context.Context, event ConfigChangedEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal config changed event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("config.changed.%s.%s", event.TenantID, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publish config changed: %w", err)
|
||||
}
|
||||
|
||||
slog.Debug("published config changed event",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
"old_timestamp", event.OldTimestamp,
|
||||
"new_timestamp", event.NewTimestamp,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishPushRollback publishes a push rollback event when a device goes offline
|
||||
// after a template or restore config push, triggering automatic rollback.
|
||||
func (p *Publisher) PublishPushRollback(ctx context.Context, event PushRollbackEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal push rollback event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("config.push.rollback.%s.%s", event.TenantID, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Info("published push rollback event",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
"push_operation_id", event.PushOperationID,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// PublishPushAlert publishes a push alert event when a device goes offline
|
||||
// after an editor config push, enabling one-click rollback in the UI.
|
||||
func (p *Publisher) PublishPushAlert(ctx context.Context, event PushAlertEvent) error {
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal push alert event: %w", err)
|
||||
}
|
||||
|
||||
subject := fmt.Sprintf("config.push.alert.%s.%s", event.TenantID, event.DeviceID)
|
||||
|
||||
_, err = p.js.Publish(ctx, subject, data)
|
||||
if err != nil {
|
||||
return fmt.Errorf("publishing to %s: %w", subject, err)
|
||||
}
|
||||
|
||||
slog.Info("published push alert event",
|
||||
"device_id", event.DeviceID,
|
||||
"tenant_id", event.TenantID,
|
||||
"push_type", event.PushType,
|
||||
"subject", subject,
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Conn returns the raw NATS connection for use by other components
// (e.g., CmdResponder for request-reply subscriptions).
//
// Close drains this same connection, so callers should not close or drain
// the returned value themselves.
func (p *Publisher) Conn() *nats.Conn {
	return p.nc
}
|
||||
|
||||
// Close drains the NATS connection, flushing pending messages before closing.
|
||||
func (p *Publisher) Close() {
|
||||
if p.nc != nil {
|
||||
if err := p.nc.Drain(); err != nil {
|
||||
slog.Warn("error draining NATS connection", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
232
poller/internal/bus/publisher_integration_test.go
Normal file
232
poller/internal/bus/publisher_integration_test.go
Normal file
@@ -0,0 +1,232 @@
|
||||
package bus_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/bus"
|
||||
"github.com/mikrotik-portal/poller/internal/testutil"
|
||||
)
|
||||
|
||||
func TestPublisher_PublishStatus_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
// Create a direct NATS consumer to receive messages.
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Create a consumer on the DEVICE_EVENTS stream.
|
||||
cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
|
||||
FilterSubject: "device.status.>",
|
||||
AckPolicy: jetstream.AckNonePolicy,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Publish a status event.
|
||||
event := bus.DeviceStatusEvent{
|
||||
DeviceID: "dev-abc-123",
|
||||
TenantID: "tenant-xyz",
|
||||
Status: "online",
|
||||
LastSeen: time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
err = pub.PublishStatus(ctx, event)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Consume the message with timeout.
|
||||
msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
|
||||
require.NoError(t, err)
|
||||
|
||||
var received *jetstream.Msg
|
||||
for msg := range msgBatch.Messages() {
|
||||
received = &msg
|
||||
break
|
||||
}
|
||||
|
||||
require.NotNil(t, received, "should receive a message within 5 seconds")
|
||||
|
||||
var got bus.DeviceStatusEvent
|
||||
err = json.Unmarshal((*received).Data(), &got)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, event.DeviceID, got.DeviceID)
|
||||
assert.Equal(t, event.TenantID, got.TenantID)
|
||||
assert.Equal(t, event.Status, got.Status)
|
||||
}
|
||||
|
||||
func TestPublisher_PublishMetrics_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Create a consumer filtering on metrics subjects.
|
||||
cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
|
||||
FilterSubject: "device.metrics.>",
|
||||
AckPolicy: jetstream.AckNonePolicy,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Publish a metrics event.
|
||||
event := bus.DeviceMetricsEvent{
|
||||
DeviceID: "dev-metrics-456",
|
||||
TenantID: "tenant-xyz",
|
||||
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
||||
Type: "health",
|
||||
}
|
||||
err = pub.PublishMetrics(ctx, event)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Consume the message.
|
||||
msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
|
||||
require.NoError(t, err)
|
||||
|
||||
var received *jetstream.Msg
|
||||
for msg := range msgBatch.Messages() {
|
||||
received = &msg
|
||||
break
|
||||
}
|
||||
|
||||
require.NotNil(t, received, "should receive metrics message within 5 seconds")
|
||||
|
||||
// Verify the subject includes the type and device_id.
|
||||
assert.Equal(t, "device.metrics.health.dev-metrics-456", (*received).Subject())
|
||||
|
||||
var got bus.DeviceMetricsEvent
|
||||
err = json.Unmarshal((*received).Data(), &got)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, event.DeviceID, got.DeviceID)
|
||||
assert.Equal(t, event.TenantID, got.TenantID)
|
||||
assert.Equal(t, event.Type, got.Type)
|
||||
}
|
||||
|
||||
func TestPublisher_PublishFirmware_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
|
||||
FilterSubject: "device.firmware.>",
|
||||
AckPolicy: jetstream.AckNonePolicy,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
event := bus.DeviceFirmwareEvent{
|
||||
DeviceID: "dev-fw-789",
|
||||
TenantID: "tenant-xyz",
|
||||
InstalledVersion: "7.15",
|
||||
LatestVersion: "7.16",
|
||||
Channel: "stable",
|
||||
Status: "update_available",
|
||||
Architecture: "arm64",
|
||||
}
|
||||
err = pub.PublishFirmware(ctx, event)
|
||||
require.NoError(t, err)
|
||||
|
||||
msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
|
||||
require.NoError(t, err)
|
||||
|
||||
var received *jetstream.Msg
|
||||
for msg := range msgBatch.Messages() {
|
||||
received = &msg
|
||||
break
|
||||
}
|
||||
|
||||
require.NotNil(t, received, "should receive firmware message within 5 seconds")
|
||||
assert.Equal(t, "device.firmware.dev-fw-789", (*received).Subject())
|
||||
|
||||
var got bus.DeviceFirmwareEvent
|
||||
err = json.Unmarshal((*received).Data(), &got)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, event.DeviceID, got.DeviceID)
|
||||
assert.Equal(t, event.InstalledVersion, got.InstalledVersion)
|
||||
assert.Equal(t, event.LatestVersion, got.LatestVersion)
|
||||
assert.Equal(t, event.Status, got.Status)
|
||||
}
|
||||
|
||||
func TestPublisher_NewPublisher_StreamCreation_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
natsURL, cleanup := testutil.SetupNATS(t)
|
||||
defer cleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
// Verify the DEVICE_EVENTS stream was created with correct config.
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background()
|
||||
stream, err := js.Stream(ctx, "DEVICE_EVENTS")
|
||||
require.NoError(t, err, "DEVICE_EVENTS stream should exist")
|
||||
|
||||
info, err := stream.Info(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "DEVICE_EVENTS", info.Config.Name)
|
||||
assert.Contains(t, info.Config.Subjects, "device.status.>",
|
||||
"stream should cover device.status.> subjects")
|
||||
assert.Contains(t, info.Config.Subjects, "device.metrics.>",
|
||||
"stream should cover device.metrics.> subjects")
|
||||
assert.Contains(t, info.Config.Subjects, "device.firmware.>",
|
||||
"stream should cover device.firmware.> subjects")
|
||||
}
|
||||
160
poller/internal/config/config.go
Normal file
160
poller/internal/config/config.go
Normal file
@@ -0,0 +1,160 @@
|
||||
// Package config loads poller configuration from environment variables.
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// Config holds all runtime configuration for the poller service.
//
// Values are populated by Load from environment variables; defaults noted
// below are the ones Load applies when the variable is unset.
type Config struct {
	// Environment is the deployment environment (dev, staging, production).
	// Controls startup validation of security-sensitive defaults. Default: dev.
	Environment string

	// DatabaseURL is the PostgreSQL connection string for the poller_user role.
	// Example: postgres://poller_user:poller_password@localhost:5432/mikrotik
	// Required: Load returns an error when DATABASE_URL is unset.
	DatabaseURL string

	// RedisURL is the Redis connection URL. Default: redis://localhost:6379/0.
	RedisURL string

	// NatsURL is the NATS server URL. Default: nats://localhost:4222.
	NatsURL string

	// CredentialEncryptionKey is the 32-byte AES key decoded from base64.
	// MUST match the Python backend CREDENTIAL_ENCRYPTION_KEY environment variable.
	// OPTIONAL when OpenBao Transit is configured (OPENBAO_ADDR set); nil in
	// that case.
	CredentialEncryptionKey []byte

	// OpenBaoAddr is the OpenBao server address for Transit API calls.
	// Example: http://openbao:8200
	OpenBaoAddr string

	// OpenBaoToken is the authentication token for OpenBao API calls.
	// Required whenever OpenBaoAddr is set.
	OpenBaoToken string

	// PollIntervalSeconds is how often each device is polled. Default: 60.
	PollIntervalSeconds int

	// DeviceRefreshSeconds is how often the DB is queried for new/removed
	// devices. Default: 60.
	DeviceRefreshSeconds int

	// ConnectionTimeoutSeconds is the TLS connection timeout per device.
	// Default: 10.
	ConnectionTimeoutSeconds int

	// LogLevel controls log verbosity (debug, info, warn, error). Default: info.
	LogLevel string

	// CircuitBreakerMaxFailures is the number of consecutive connection failures
	// before the circuit breaker enters backoff mode for a device. Default: 5.
	CircuitBreakerMaxFailures int

	// CircuitBreakerBaseBackoffSeconds is the base backoff duration in seconds.
	// Actual backoff is exponential: base * 2^(failures-1), capped at max.
	// Default: 30.
	CircuitBreakerBaseBackoffSeconds int

	// CircuitBreakerMaxBackoffSeconds is the maximum backoff duration in
	// seconds. Default: 900.
	CircuitBreakerMaxBackoffSeconds int

	// CommandTimeoutSeconds is the per-command timeout for RouterOS API calls.
	// Each API call (DetectVersion, CollectInterfaces, etc.) is wrapped with
	// this timeout to prevent indefinite blocking on unresponsive devices.
	// Default: 30.
	CommandTimeoutSeconds int
}
|
||||
|
||||
// knownInsecureEncryptionKey is the base64-encoded dev default encryption key.
// Production environments MUST NOT use this value: Load rejects it (along with
// the "CHANGE_ME_IN_PRODUCTION" placeholder) in any non-dev environment.
const knownInsecureEncryptionKey = "LLLjnfBZTSycvL2U07HDSxUeTtLxb9cZzryQl0R9E4w="
|
||||
|
||||
// Load reads configuration from environment variables, applying defaults where appropriate.
|
||||
// Returns an error if any required variable is missing or invalid.
|
||||
func Load() (*Config, error) {
|
||||
cfg := &Config{
|
||||
Environment: getEnv("ENVIRONMENT", "dev"),
|
||||
DatabaseURL: getEnv("DATABASE_URL", ""),
|
||||
RedisURL: getEnv("REDIS_URL", "redis://localhost:6379/0"),
|
||||
NatsURL: getEnv("NATS_URL", "nats://localhost:4222"),
|
||||
LogLevel: getEnv("LOG_LEVEL", "info"),
|
||||
PollIntervalSeconds: getEnvInt("POLL_INTERVAL_SECONDS", 60),
|
||||
DeviceRefreshSeconds: getEnvInt("DEVICE_REFRESH_SECONDS", 60),
|
||||
ConnectionTimeoutSeconds: getEnvInt("CONNECTION_TIMEOUT_SECONDS", 10),
|
||||
CircuitBreakerMaxFailures: getEnvInt("CIRCUIT_BREAKER_MAX_FAILURES", 5),
|
||||
CircuitBreakerBaseBackoffSeconds: getEnvInt("CIRCUIT_BREAKER_BASE_BACKOFF_SECONDS", 30),
|
||||
CircuitBreakerMaxBackoffSeconds: getEnvInt("CIRCUIT_BREAKER_MAX_BACKOFF_SECONDS", 900),
|
||||
CommandTimeoutSeconds: getEnvInt("COMMAND_TIMEOUT_SECONDS", 30),
|
||||
}
|
||||
|
||||
if cfg.DatabaseURL == "" {
|
||||
return nil, fmt.Errorf("DATABASE_URL environment variable is required")
|
||||
}
|
||||
|
||||
// OpenBao Transit configuration (optional -- required for Phase 29+ envelope encryption)
|
||||
cfg.OpenBaoAddr = getEnv("OPENBAO_ADDR", "")
|
||||
cfg.OpenBaoToken = getEnv("OPENBAO_TOKEN", "")
|
||||
|
||||
if cfg.OpenBaoAddr != "" && cfg.OpenBaoToken == "" {
|
||||
return nil, fmt.Errorf("OPENBAO_TOKEN is required when OPENBAO_ADDR is set")
|
||||
}
|
||||
|
||||
// Decode the AES-256-GCM encryption key from base64.
|
||||
// Must use StdEncoding (NOT URLEncoding) to match Python's base64.b64encode output.
|
||||
// OPTIONAL when OpenBao Transit is configured (OPENBAO_ADDR set).
|
||||
keyB64 := getEnv("CREDENTIAL_ENCRYPTION_KEY", "")
|
||||
if keyB64 == "" {
|
||||
if cfg.OpenBaoAddr == "" {
|
||||
return nil, fmt.Errorf("CREDENTIAL_ENCRYPTION_KEY environment variable is required (or configure OPENBAO_ADDR for Transit encryption)")
|
||||
}
|
||||
// OpenBao configured without legacy key -- OK for post-migration
|
||||
slog.Info("CREDENTIAL_ENCRYPTION_KEY not set; OpenBao Transit will handle all credential decryption")
|
||||
} else {
|
||||
// Validate production safety BEFORE decode: reject known insecure defaults in non-dev environments.
|
||||
// This runs first so placeholder values like "CHANGE_ME_IN_PRODUCTION" get a clear security
|
||||
// error instead of a confusing "not valid base64" error.
|
||||
if cfg.Environment != "dev" {
|
||||
if keyB64 == knownInsecureEncryptionKey || keyB64 == "CHANGE_ME_IN_PRODUCTION" {
|
||||
return nil, fmt.Errorf(
|
||||
"FATAL: CREDENTIAL_ENCRYPTION_KEY uses a known insecure default in '%s' environment. "+
|
||||
"Generate a secure key for production: "+
|
||||
"python -c \"import secrets, base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\"",
|
||||
cfg.Environment,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
key, err := base64.StdEncoding.DecodeString(keyB64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("CREDENTIAL_ENCRYPTION_KEY is not valid base64: %w", err)
|
||||
}
|
||||
if len(key) != 32 {
|
||||
return nil, fmt.Errorf("CREDENTIAL_ENCRYPTION_KEY must decode to exactly 32 bytes, got %d", len(key))
|
||||
}
|
||||
cfg.CredentialEncryptionKey = key
|
||||
}
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// getEnv returns the value of an environment variable, or the defaultValue if not set.
// Note: an empty-string value is treated the same as unset.
func getEnv(key, defaultValue string) string {
	val := os.Getenv(key)
	if val == "" {
		return defaultValue
	}
	return val
}
|
||||
|
||||
// getEnvInt returns the integer value of an environment variable, or the
// defaultValue if not set or invalid.
//
// An unparseable value is logged (rather than silently swallowed) so that a
// typo like POLL_INTERVAL_SECONDS=6O is visible instead of quietly falling
// back to the default.
func getEnvInt(key string, defaultValue int) int {
	val := os.Getenv(key)
	if val == "" {
		return defaultValue
	}
	n, err := strconv.Atoi(val)
	if err != nil {
		slog.Warn("environment variable is not a valid integer; using default",
			"key", key,
			"value", val,
			"default", defaultValue,
		)
		return defaultValue
	}
	return n
}
|
||||
79
poller/internal/config/config_prod_test.go
Normal file
79
poller/internal/config/config_prod_test.go
Normal file
@@ -0,0 +1,79 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestProductionValidationRejectsInsecureKey(t *testing.T) {
|
||||
// Save and restore env
|
||||
origEnv := os.Getenv("ENVIRONMENT")
|
||||
origDB := os.Getenv("DATABASE_URL")
|
||||
origKey := os.Getenv("CREDENTIAL_ENCRYPTION_KEY")
|
||||
defer func() {
|
||||
os.Setenv("ENVIRONMENT", origEnv)
|
||||
os.Setenv("DATABASE_URL", origDB)
|
||||
os.Setenv("CREDENTIAL_ENCRYPTION_KEY", origKey)
|
||||
}()
|
||||
|
||||
os.Setenv("DATABASE_URL", "postgres://test:test@localhost:5432/test")
|
||||
|
||||
// Test: production with known insecure default key should fail
|
||||
os.Setenv("ENVIRONMENT", "production")
|
||||
os.Setenv("CREDENTIAL_ENCRYPTION_KEY", "LLLjnfBZTSycvL2U07HDSxUeTtLxb9cZzryQl0R9E4w=")
|
||||
|
||||
_, err := Load()
|
||||
if err == nil {
|
||||
t.Fatal("expected error for insecure key in production, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "FATAL") {
|
||||
t.Fatalf("expected FATAL in error message, got: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func TestProductionValidationRejectsPlaceholder(t *testing.T) {
|
||||
origEnv := os.Getenv("ENVIRONMENT")
|
||||
origDB := os.Getenv("DATABASE_URL")
|
||||
origKey := os.Getenv("CREDENTIAL_ENCRYPTION_KEY")
|
||||
defer func() {
|
||||
os.Setenv("ENVIRONMENT", origEnv)
|
||||
os.Setenv("DATABASE_URL", origDB)
|
||||
os.Setenv("CREDENTIAL_ENCRYPTION_KEY", origKey)
|
||||
}()
|
||||
|
||||
os.Setenv("DATABASE_URL", "postgres://test:test@localhost:5432/test")
|
||||
os.Setenv("ENVIRONMENT", "production")
|
||||
os.Setenv("CREDENTIAL_ENCRYPTION_KEY", "CHANGE_ME_IN_PRODUCTION")
|
||||
|
||||
_, err := Load()
|
||||
if err == nil {
|
||||
t.Fatal("expected error for CHANGE_ME_IN_PRODUCTION in production, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "FATAL") {
|
||||
t.Fatalf("expected FATAL in error message for placeholder, got: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func TestDevModeAcceptsInsecureDefaults(t *testing.T) {
|
||||
origEnv := os.Getenv("ENVIRONMENT")
|
||||
origDB := os.Getenv("DATABASE_URL")
|
||||
origKey := os.Getenv("CREDENTIAL_ENCRYPTION_KEY")
|
||||
defer func() {
|
||||
os.Setenv("ENVIRONMENT", origEnv)
|
||||
os.Setenv("DATABASE_URL", origDB)
|
||||
os.Setenv("CREDENTIAL_ENCRYPTION_KEY", origKey)
|
||||
}()
|
||||
|
||||
os.Setenv("ENVIRONMENT", "dev")
|
||||
os.Setenv("DATABASE_URL", "postgres://test:test@localhost:5432/test")
|
||||
os.Setenv("CREDENTIAL_ENCRYPTION_KEY", "LLLjnfBZTSycvL2U07HDSxUeTtLxb9cZzryQl0R9E4w=")
|
||||
|
||||
cfg, err := Load()
|
||||
if err != nil {
|
||||
t.Fatalf("dev mode should accept insecure defaults, got: %s", err.Error())
|
||||
}
|
||||
if cfg.Environment != "dev" {
|
||||
t.Fatalf("expected Environment=dev, got %s", cfg.Environment)
|
||||
}
|
||||
}
|
||||
104
poller/internal/config/config_test.go
Normal file
104
poller/internal/config/config_test.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestLoad_RequiredDatabaseURL(t *testing.T) {
|
||||
// Clear DATABASE_URL to trigger required field error
|
||||
t.Setenv("DATABASE_URL", "")
|
||||
t.Setenv("CREDENTIAL_ENCRYPTION_KEY", base64.StdEncoding.EncodeToString(make([]byte, 32)))
|
||||
|
||||
_, err := Load()
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "DATABASE_URL")
|
||||
}
|
||||
|
||||
func TestLoad_RequiredEncryptionKey(t *testing.T) {
|
||||
t.Setenv("DATABASE_URL", "postgres://user:pass@localhost/db")
|
||||
t.Setenv("CREDENTIAL_ENCRYPTION_KEY", "")
|
||||
|
||||
_, err := Load()
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "CREDENTIAL_ENCRYPTION_KEY")
|
||||
}
|
||||
|
||||
func TestLoad_InvalidBase64Key(t *testing.T) {
|
||||
t.Setenv("DATABASE_URL", "postgres://user:pass@localhost/db")
|
||||
t.Setenv("CREDENTIAL_ENCRYPTION_KEY", "not-valid-base64!!!")
|
||||
|
||||
_, err := Load()
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "base64")
|
||||
}
|
||||
|
||||
func TestLoad_WrongKeyLength(t *testing.T) {
|
||||
// Encode a 16-byte key (too short -- must be 32)
|
||||
t.Setenv("DATABASE_URL", "postgres://user:pass@localhost/db")
|
||||
t.Setenv("CREDENTIAL_ENCRYPTION_KEY", base64.StdEncoding.EncodeToString(make([]byte, 16)))
|
||||
|
||||
_, err := Load()
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "32 bytes")
|
||||
}
|
||||
|
||||
func TestLoad_DefaultValues(t *testing.T) {
|
||||
t.Setenv("DATABASE_URL", "postgres://user:pass@localhost/db")
|
||||
t.Setenv("CREDENTIAL_ENCRYPTION_KEY", base64.StdEncoding.EncodeToString(make([]byte, 32)))
|
||||
// Clear optional vars to test defaults
|
||||
t.Setenv("REDIS_URL", "")
|
||||
t.Setenv("NATS_URL", "")
|
||||
t.Setenv("LOG_LEVEL", "")
|
||||
t.Setenv("POLL_INTERVAL_SECONDS", "")
|
||||
t.Setenv("DEVICE_REFRESH_SECONDS", "")
|
||||
t.Setenv("CONNECTION_TIMEOUT_SECONDS", "")
|
||||
|
||||
cfg, err := Load()
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "redis://localhost:6379/0", cfg.RedisURL)
|
||||
assert.Equal(t, "nats://localhost:4222", cfg.NatsURL)
|
||||
assert.Equal(t, "info", cfg.LogLevel)
|
||||
assert.Equal(t, 60, cfg.PollIntervalSeconds)
|
||||
assert.Equal(t, 60, cfg.DeviceRefreshSeconds)
|
||||
assert.Equal(t, 10, cfg.ConnectionTimeoutSeconds)
|
||||
}
|
||||
|
||||
func TestLoad_CustomValues(t *testing.T) {
|
||||
t.Setenv("DATABASE_URL", "postgres://custom:pass@db:5432/mydb")
|
||||
t.Setenv("CREDENTIAL_ENCRYPTION_KEY", base64.StdEncoding.EncodeToString(make([]byte, 32)))
|
||||
t.Setenv("REDIS_URL", "redis://custom-redis:6380/1")
|
||||
t.Setenv("NATS_URL", "nats://custom-nats:4223")
|
||||
t.Setenv("LOG_LEVEL", "debug")
|
||||
t.Setenv("POLL_INTERVAL_SECONDS", "30")
|
||||
t.Setenv("DEVICE_REFRESH_SECONDS", "120")
|
||||
t.Setenv("CONNECTION_TIMEOUT_SECONDS", "5")
|
||||
|
||||
cfg, err := Load()
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "postgres://custom:pass@db:5432/mydb", cfg.DatabaseURL)
|
||||
assert.Equal(t, "redis://custom-redis:6380/1", cfg.RedisURL)
|
||||
assert.Equal(t, "nats://custom-nats:4223", cfg.NatsURL)
|
||||
assert.Equal(t, "debug", cfg.LogLevel)
|
||||
assert.Equal(t, 30, cfg.PollIntervalSeconds)
|
||||
assert.Equal(t, 120, cfg.DeviceRefreshSeconds)
|
||||
assert.Equal(t, 5, cfg.ConnectionTimeoutSeconds)
|
||||
}
|
||||
|
||||
func TestLoad_ValidEncryptionKey(t *testing.T) {
|
||||
key := make([]byte, 32)
|
||||
for i := range key {
|
||||
key[i] = byte(i) // deterministic test key
|
||||
}
|
||||
t.Setenv("DATABASE_URL", "postgres://user:pass@localhost/db")
|
||||
t.Setenv("CREDENTIAL_ENCRYPTION_KEY", base64.StdEncoding.EncodeToString(key))
|
||||
|
||||
cfg, err := Load()
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, key, cfg.CredentialEncryptionKey)
|
||||
}
|
||||
122
poller/internal/device/cert_deploy.go
Normal file
122
poller/internal/device/cert_deploy.go
Normal file
@@ -0,0 +1,122 @@
|
||||
// Package device provides the full certificate deployment flow for RouterOS devices.
|
||||
//
|
||||
// The deployment follows these steps:
|
||||
// 1. Upload cert.pem and key.pem via SFTP
|
||||
// 2. Import the certificate via RouterOS API (/certificate/import)
|
||||
// 3. Import the private key via RouterOS API (/certificate/import)
|
||||
// 4. Determine the certificate name on device
|
||||
// 5. Assign the certificate to the api-ssl service (/ip/service/set)
|
||||
// 6. Clean up uploaded PEM files from device filesystem (/file/remove)
|
||||
package device
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
// CertDeployRequest is the NATS request payload for certificate deployment.
type CertDeployRequest struct {
	// DeviceID identifies the target device (used for log correlation).
	DeviceID string `json:"device_id"`
	// CertPEM is the PEM-encoded certificate to upload and import.
	CertPEM string `json:"cert_pem"`
	// KeyPEM is the PEM-encoded private key to upload and import.
	KeyPEM string `json:"key_pem"`
	// CertName is the base name for the uploaded files:
	// "<CertName>.pem" and "<CertName>-key.pem".
	CertName string `json:"cert_name"` // e.g., "portal-device-cert"
	// SSHPort is the device's SSH port. NOTE(review): not read by DeployCert
	// itself — presumably consumed by the caller when opening the SSH session.
	SSHPort int `json:"ssh_port"`
}
|
||||
|
||||
// CertDeployResponse is the NATS reply payload.
type CertDeployResponse struct {
	// Success reports whether the uploads and both imports completed.
	// (A failed api-ssl assignment or file cleanup does not clear it.)
	Success bool `json:"success"`
	// CertNameOnDevice is the certificate's name as known to RouterOS
	// (typically "<file>_0"), set on success.
	CertNameOnDevice string `json:"cert_name_on_device,omitempty"`
	// Error holds a human-readable failure description when Success is false.
	Error string `json:"error,omitempty"`
}
|
||||
|
||||
// DeployCert performs the full certificate deployment flow:
// 1. Upload cert.pem and key.pem files via SFTP
// 2. Import certificate via RouterOS API
// 3. Import key via RouterOS API
// 4. Assign certificate to api-ssl service
// 5. Clean up uploaded PEM files from device filesystem
//
// Upload and import failures abort the flow and are reported via the response
// Error field. A failed api-ssl assignment or file cleanup is non-fatal: the
// certificate is already imported at that point, so the response still reports
// success and the assignment can be retried.
func DeployCert(sshClient *ssh.Client, apiClient *routeros.Client, req CertDeployRequest) CertDeployResponse {
	// On-device file names derived from the requested certificate name.
	certFile := req.CertName + ".pem"
	keyFile := req.CertName + "-key.pem"

	// Step 1: Upload cert via SFTP
	slog.Debug("uploading cert file via SFTP", "file", certFile, "device_id", req.DeviceID)
	if err := UploadFile(sshClient, certFile, []byte(req.CertPEM)); err != nil {
		return CertDeployResponse{Success: false, Error: fmt.Sprintf("SFTP cert upload: %s", err)}
	}

	// Step 2: Upload key via SFTP
	slog.Debug("uploading key file via SFTP", "file", keyFile, "device_id", req.DeviceID)
	if err := UploadFile(sshClient, keyFile, []byte(req.KeyPEM)); err != nil {
		return CertDeployResponse{Success: false, Error: fmt.Sprintf("SFTP key upload: %s", err)}
	}

	// Step 3: Import certificate
	slog.Debug("importing certificate", "file", certFile, "device_id", req.DeviceID)
	importResult := ExecuteCommand(apiClient, "/certificate/import", []string{
		"=file-name=" + certFile,
	})
	if !importResult.Success {
		return CertDeployResponse{Success: false, Error: fmt.Sprintf("cert import: %s", importResult.Error)}
	}

	// Step 4: Import private key
	slog.Debug("importing private key", "file", keyFile, "device_id", req.DeviceID)
	keyImportResult := ExecuteCommand(apiClient, "/certificate/import", []string{
		"=file-name=" + keyFile,
	})
	if !keyImportResult.Success {
		return CertDeployResponse{Success: false, Error: fmt.Sprintf("key import: %s", keyImportResult.Error)}
	}

	// Determine the certificate name on device.
	// RouterOS names imported certs as <filename>_0 by convention.
	// Query to find the actual name by looking for certs with a private key.
	certNameOnDevice := certFile + "_0"
	printResult := ExecuteCommand(apiClient, "/certificate/print", []string{
		"=.proplist=name,common-name,private-key",
	})
	if printResult.Success && len(printResult.Data) > 0 {
		// Use the last cert that has a private key (most recently imported)
		// NOTE(review): this assumes /certificate/print lists entries in import
		// order and no other keyed cert was added concurrently — verify on
		// devices with multiple imported certificates.
		for _, entry := range printResult.Data {
			if name, ok := entry["name"]; ok {
				if pk, hasPK := entry["private-key"]; hasPK && pk == "true" {
					certNameOnDevice = name
				}
			}
		}
	}

	// Step 5: Assign to api-ssl service
	slog.Debug("assigning certificate to api-ssl", "cert_name", certNameOnDevice, "device_id", req.DeviceID)
	assignResult := ExecuteCommand(apiClient, "/ip/service/set", []string{
		"=numbers=api-ssl",
		"=certificate=" + certNameOnDevice,
	})
	if !assignResult.Success {
		slog.Warn("api-ssl assignment failed (cert still imported)",
			"device_id", req.DeviceID,
			"error", assignResult.Error,
		)
		// Don't fail entirely -- cert is imported, assignment can be retried
	}

	// Step 6: Clean up uploaded PEM files from device filesystem
	// NOTE(review): files are removed by "=.id=<filename>"; confirm RouterOS
	// accepts the name as .id here, otherwise cleanup silently no-ops.
	slog.Debug("cleaning up PEM files", "device_id", req.DeviceID)
	ExecuteCommand(apiClient, "/file/remove", []string{"=.id=" + certFile})
	ExecuteCommand(apiClient, "/file/remove", []string{"=.id=" + keyFile})
	// File cleanup failures are non-fatal

	slog.Info("certificate deployed successfully",
		"device_id", req.DeviceID,
		"cert_name", certNameOnDevice,
	)
	return CertDeployResponse{
		Success:          true,
		CertNameOnDevice: certNameOnDevice,
	}
}
|
||||
115
poller/internal/device/client.go
Normal file
115
poller/internal/device/client.go
Normal file
@@ -0,0 +1,115 @@
|
||||
// Package device handles RouterOS device connections and queries.
|
||||
package device
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
)
|
||||
|
||||
// buildTLSConfig creates a TLS config using the portal CA cert for verification.
// Falls back to InsecureSkipVerify if caCertPEM is empty or invalid.
func buildTLSConfig(caCertPEM []byte) *tls.Config {
	insecure := &tls.Config{InsecureSkipVerify: true} //nolint:gosec // fallback when no usable CA cert
	if len(caCertPEM) == 0 {
		return insecure
	}
	pool := x509.NewCertPool()
	if ok := pool.AppendCertsFromPEM(caCertPEM); !ok {
		slog.Warn("failed to parse CA cert PEM, falling back to insecure TLS")
		return insecure
	}
	return &tls.Config{RootCAs: pool}
}
|
||||
|
||||
// ConnectDevice establishes a connection to a RouterOS device.
|
||||
//
|
||||
// Connection strategy is governed by tlsMode:
|
||||
//
|
||||
// - "auto" (default): Try CA-verified TLS (if caCertPEM provided) ->
|
||||
// InsecureSkipVerify -> STOP. No plain-text fallback.
|
||||
// - "portal_ca": Try CA-verified TLS only (strict).
|
||||
// - "insecure": Skip directly to InsecureSkipVerify TLS (no CA check).
|
||||
// - "plain": Explicit opt-in for plain-text API connection.
|
||||
//
|
||||
// Callers must call CloseDevice when done.
|
||||
func ConnectDevice(ip string, sslPort, plainPort int, username, password string, timeout time.Duration, caCertPEM []byte, tlsMode string) (*routeros.Client, error) {
|
||||
sslAddr := fmt.Sprintf("%s:%d", ip, sslPort)
|
||||
|
||||
switch tlsMode {
|
||||
case "plain":
|
||||
// Explicit opt-in: plain-text connection only
|
||||
plainAddr := fmt.Sprintf("%s:%d", ip, plainPort)
|
||||
slog.Debug("connecting to RouterOS device (plain — explicit opt-in)", "address", plainAddr)
|
||||
client, err := routeros.DialTimeout(plainAddr, username, password, timeout)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("plain-text connection to %s failed: %w", plainAddr, err)
|
||||
}
|
||||
slog.Debug("connected to RouterOS device (plain — explicit opt-in)", "address", plainAddr)
|
||||
return client, nil
|
||||
|
||||
case "insecure":
|
||||
// Skip CA verification, go straight to InsecureSkipVerify
|
||||
insecureTLS := &tls.Config{InsecureSkipVerify: true} //nolint:gosec // insecure mode requested
|
||||
slog.Debug("connecting to RouterOS device (insecure TLS)", "address", sslAddr)
|
||||
client, err := routeros.DialTLSTimeout(sslAddr, username, password, insecureTLS, timeout)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("insecure TLS connection to %s failed: %w", sslAddr, err)
|
||||
}
|
||||
slog.Debug("connected with insecure TLS", "address", sslAddr)
|
||||
return client, nil
|
||||
|
||||
case "portal_ca":
|
||||
// Strict CA-verified TLS only
|
||||
verifiedTLS := buildTLSConfig(caCertPEM)
|
||||
if verifiedTLS.RootCAs == nil {
|
||||
return nil, fmt.Errorf("portal_ca mode requires a valid CA cert but none available for %s", sslAddr)
|
||||
}
|
||||
slog.Debug("connecting to RouterOS device (CA-verified TLS)", "address", sslAddr)
|
||||
client, err := routeros.DialTLSTimeout(sslAddr, username, password, verifiedTLS, timeout)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("CA-verified TLS connection to %s failed: %w", sslAddr, err)
|
||||
}
|
||||
slog.Debug("connected with CA-verified TLS", "address", sslAddr)
|
||||
return client, nil
|
||||
|
||||
default:
|
||||
// "auto" mode: CA-verified -> InsecureSkipVerify -> STOP (no plain-text)
|
||||
|
||||
// Tier 1: CA-verified TLS (if CA cert available)
|
||||
if len(caCertPEM) > 0 {
|
||||
verifiedTLS := buildTLSConfig(caCertPEM)
|
||||
if verifiedTLS.RootCAs != nil { // only try if PEM parsed OK
|
||||
slog.Debug("connecting to RouterOS device (CA-verified TLS)", "address", sslAddr)
|
||||
client, err := routeros.DialTLSTimeout(sslAddr, username, password, verifiedTLS, timeout)
|
||||
if err == nil {
|
||||
slog.Debug("connected with CA-verified TLS", "address", sslAddr)
|
||||
return client, nil
|
||||
}
|
||||
slog.Debug("CA-verified TLS failed, trying insecure TLS", "address", sslAddr, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Tier 2: InsecureSkipVerify TLS (fallback)
|
||||
insecureTLS := &tls.Config{InsecureSkipVerify: true} //nolint:gosec // fallback for unprovisioned devices
|
||||
slog.Debug("connecting to RouterOS device (insecure TLS)", "address", sslAddr)
|
||||
client, err := routeros.DialTLSTimeout(sslAddr, username, password, insecureTLS, timeout)
|
||||
if err != nil {
|
||||
// NO plain-text fallback in auto mode — this is the key security change
|
||||
return nil, fmt.Errorf("TLS connection to %s failed (auto mode — no plain-text fallback): %w", sslAddr, err)
|
||||
}
|
||||
slog.Debug("connected with insecure TLS", "address", sslAddr)
|
||||
return client, nil
|
||||
}
|
||||
}
|
||||
|
||||
// CloseDevice closes a RouterOS client connection. Safe to call on a nil client.
|
||||
func CloseDevice(c *routeros.Client) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.Close()
|
||||
}
|
||||
50
poller/internal/device/command.go
Normal file
50
poller/internal/device/command.go
Normal file
@@ -0,0 +1,50 @@
|
||||
package device
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
)
|
||||
|
||||
// CommandRequest is the JSON payload received from the Python backend via NATS.
type CommandRequest struct {
	DeviceID string   `json:"device_id"` // portal identifier of the target device
	Command  string   `json:"command"`   // full RouterOS API path, e.g. "/ip/address/print"
	Args     []string `json:"args"`      // optional API words, e.g. "=.proplist=.id,address"
}
|
||||
|
||||
// CommandResponse is the JSON payload returned to the Python backend via NATS.
type CommandResponse struct {
	Success bool                `json:"success"`         // false when the RouterOS command failed
	Data    []map[string]string `json:"data"`            // one map per reply sentence; nil on failure
	Error   string              `json:"error,omitempty"` // error text, set only when Success is false
}
|
||||
|
||||
// ExecuteCommand runs an arbitrary RouterOS API command on a connected device.
|
||||
// The command string is the full path (e.g., "/ip/address/print").
|
||||
// Args are optional RouterOS API arguments (e.g., "=.proplist=.id,address").
|
||||
func ExecuteCommand(client *routeros.Client, command string, args []string) CommandResponse {
|
||||
cmdParts := make([]string, 0, 1+len(args))
|
||||
cmdParts = append(cmdParts, command)
|
||||
cmdParts = append(cmdParts, args...)
|
||||
|
||||
reply, err := client.Run(cmdParts...)
|
||||
if err != nil {
|
||||
// RouterOS 7.x returns !empty for empty results (e.g., no firewall rules).
|
||||
// go-routeros/v3 doesn't recognize this word and returns UnknownReplyError.
|
||||
// Treat !empty as a successful empty response.
|
||||
var unkErr *routeros.UnknownReplyError
|
||||
if errors.As(err, &unkErr) && strings.TrimPrefix(unkErr.Sentence.Word, "!") == "empty" {
|
||||
return CommandResponse{Success: true, Data: []map[string]string{}}
|
||||
}
|
||||
return CommandResponse{Success: false, Data: nil, Error: err.Error()}
|
||||
}
|
||||
|
||||
data := make([]map[string]string, 0, len(reply.Re))
|
||||
for _, re := range reply.Re {
|
||||
data = append(data, re.Map)
|
||||
}
|
||||
|
||||
return CommandResponse{Success: true, Data: data}
|
||||
}
|
||||
61
poller/internal/device/crypto.go
Normal file
61
poller/internal/device/crypto.go
Normal file
@@ -0,0 +1,61 @@
|
||||
package device
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/cipher"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// credentialsJSON is the JSON structure stored in encrypted device credentials.
// Must match the Python backend's encryption format.
type credentialsJSON struct {
	Username string `json:"username"`
	Password string `json:"password"`
}

// DecryptCredentials decrypts AES-256-GCM encrypted credentials and returns the
// username and password stored within.
//
// The ciphertext format MUST match what Python's cryptography.hazmat.primitives.ciphers.aead.AESGCM
// produces when called as: nonce + AESGCM.encrypt(nonce, plaintext, None)
//
// Layout on disk:
//   - bytes [0:12] — 12-byte random nonce (GCM standard)
//   - bytes [12:] — ciphertext + 16-byte GCM authentication tag (appended by library)
//
// Go's cipher.AEAD.Open expects the GCM tag appended to the ciphertext, which is exactly
// how Python's cryptography library stores it, so the two are directly compatible.
func DecryptCredentials(ciphertext []byte, key []byte) (username, password string, err error) {
	if len(key) != 32 {
		return "", "", fmt.Errorf("encryption key must be 32 bytes, got %d", len(key))
	}

	block, err := aes.NewCipher(key)
	if err != nil {
		return "", "", fmt.Errorf("creating AES cipher: %w", err)
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return "", "", fmt.Errorf("creating GCM cipher: %w", err)
	}

	// Derive the layout sizes from the AEAD instead of hard-coding 12/16, so
	// the length check stays correct if the cipher construction ever changes.
	nonceSize := gcm.NonceSize() // 12 for standard GCM
	tagSize := gcm.Overhead()    // 16-byte GCM authentication tag
	if len(ciphertext) < nonceSize+tagSize {
		return "", "", fmt.Errorf("ciphertext too short: need at least %d bytes (%d nonce + %d tag), got %d",
			nonceSize+tagSize, nonceSize, tagSize, len(ciphertext))
	}

	nonce := ciphertext[:nonceSize]
	encryptedData := ciphertext[nonceSize:]

	// Open verifies the GCM tag, so any tampering or a wrong key fails here.
	plaintext, err := gcm.Open(nil, nonce, encryptedData, nil)
	if err != nil {
		return "", "", fmt.Errorf("decrypting credentials (wrong key or tampered data): %w", err)
	}

	var creds credentialsJSON
	if err := json.Unmarshal(plaintext, &creds); err != nil {
		return "", "", fmt.Errorf("unmarshalling decrypted credentials JSON: %w", err)
	}
	return creds.Username, creds.Password, nil
}
|
||||
91
poller/internal/device/crypto_test.go
Normal file
91
poller/internal/device/crypto_test.go
Normal file
@@ -0,0 +1,91 @@
|
||||
package device
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/cipher"
|
||||
"crypto/rand"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// encrypt is a test helper that encrypts using the same format as Python's AESGCM.
|
||||
// This verifies Go-side decryption is compatible with Python-side encryption.
|
||||
func encrypt(t *testing.T, plaintext []byte, key []byte) []byte {
|
||||
t.Helper()
|
||||
block, err := aes.NewCipher(key)
|
||||
require.NoError(t, err)
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
require.NoError(t, err)
|
||||
nonce := make([]byte, 12)
|
||||
_, err = rand.Read(nonce)
|
||||
require.NoError(t, err)
|
||||
// gcm.Seal appends ciphertext+tag after nonce
|
||||
return gcm.Seal(nonce, nonce, plaintext, nil)
|
||||
}
|
||||
|
||||
func TestDecryptCredentials_RoundTrip(t *testing.T) {
|
||||
key := make([]byte, 32)
|
||||
_, err := rand.Read(key)
|
||||
require.NoError(t, err)
|
||||
|
||||
creds := credentialsJSON{Username: "admin", Password: "secret123"}
|
||||
plaintext, err := json.Marshal(creds)
|
||||
require.NoError(t, err)
|
||||
|
||||
ciphertext := encrypt(t, plaintext, key)
|
||||
|
||||
username, password, err := DecryptCredentials(ciphertext, key)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "admin", username)
|
||||
assert.Equal(t, "secret123", password)
|
||||
}
|
||||
|
||||
func TestDecryptCredentials_WrongKey(t *testing.T) {
|
||||
key1 := make([]byte, 32)
|
||||
key2 := make([]byte, 32)
|
||||
_, _ = rand.Read(key1)
|
||||
_, _ = rand.Read(key2)
|
||||
|
||||
creds := credentialsJSON{Username: "admin", Password: "secret"}
|
||||
plaintext, _ := json.Marshal(creds)
|
||||
ciphertext := encrypt(t, plaintext, key1)
|
||||
|
||||
_, _, err := DecryptCredentials(ciphertext, key2)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "wrong key or tampered")
|
||||
}
|
||||
|
||||
func TestDecryptCredentials_ShortCiphertext(t *testing.T) {
|
||||
key := make([]byte, 32)
|
||||
_, _ = rand.Read(key)
|
||||
|
||||
_, _, err := DecryptCredentials([]byte("short"), key)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "too short")
|
||||
}
|
||||
|
||||
func TestDecryptCredentials_WrongKeyLength(t *testing.T) {
|
||||
_, _, err := DecryptCredentials(make([]byte, 50), make([]byte, 16))
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "32 bytes")
|
||||
}
|
||||
|
||||
func TestDecryptCredentials_TamperedCiphertext(t *testing.T) {
|
||||
key := make([]byte, 32)
|
||||
_, _ = rand.Read(key)
|
||||
|
||||
creds := credentialsJSON{Username: "admin", Password: "secret"}
|
||||
plaintext, _ := json.Marshal(creds)
|
||||
ciphertext := encrypt(t, plaintext, key)
|
||||
|
||||
// Flip a byte in the encrypted portion (after 12-byte nonce)
|
||||
tampered := make([]byte, len(ciphertext))
|
||||
copy(tampered, ciphertext)
|
||||
tampered[15] ^= 0xFF
|
||||
|
||||
_, _, err := DecryptCredentials(tampered, key)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
99
poller/internal/device/firmware.go
Normal file
99
poller/internal/device/firmware.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package device
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
)
|
||||
|
||||
// FirmwareInfo holds firmware update status collected from a RouterOS device.
type FirmwareInfo struct {
	InstalledVersion string `json:"installed_version"`         // version currently running on the device
	LatestVersion    string `json:"latest_version,omitempty"`  // newest version reported by MikroTik's servers; empty if check failed
	Channel          string `json:"channel,omitempty"`         // update channel the device follows (e.g. stable)
	Status           string `json:"status"`                    // "New version is available", "System is already up to date", "check-failed"
	Architecture     string `json:"architecture"`              // CPU architecture (e.g., "arm", "arm64", "mipsbe")
}
|
||||
|
||||
// CheckFirmwareUpdate queries a RouterOS device for firmware update status.
|
||||
//
|
||||
// It performs two API calls:
|
||||
// 1. /system/resource/print — to get the architecture and installed version.
|
||||
// 2. /system/package/update/check-for-updates + /system/package/update/print
|
||||
// — to get the latest available version from MikroTik's servers.
|
||||
//
|
||||
// If the device cannot reach MikroTik's servers (no internet), the function
|
||||
// returns what it knows (installed version, architecture) with status "check-failed".
|
||||
// This is non-fatal — the device may simply not have internet access.
|
||||
func CheckFirmwareUpdate(c *routeros.Client) (FirmwareInfo, error) {
|
||||
// 1. Get architecture and installed version from /system/resource/print.
|
||||
resReply, err := c.Run("/system/resource/print")
|
||||
if err != nil {
|
||||
return FirmwareInfo{}, err
|
||||
}
|
||||
|
||||
arch := ""
|
||||
installedVer := ""
|
||||
if len(resReply.Re) > 0 {
|
||||
arch = resReply.Re[0].Map["architecture-name"]
|
||||
installedVer = resReply.Re[0].Map["version"]
|
||||
}
|
||||
|
||||
// 2. Trigger check-for-updates (makes outbound HTTP from device to MikroTik servers).
|
||||
_, err = c.Run("/system/package/update/check-for-updates")
|
||||
if err != nil {
|
||||
slog.Debug("firmware update check failed (device may lack internet)",
|
||||
"error", err,
|
||||
"architecture", arch,
|
||||
)
|
||||
// Non-fatal: return what we know.
|
||||
return FirmwareInfo{
|
||||
InstalledVersion: installedVer,
|
||||
Architecture: arch,
|
||||
Status: "check-failed",
|
||||
}, nil
|
||||
}
|
||||
|
||||
// 3. Read results from /system/package/update/print.
|
||||
reply, err := c.Run("/system/package/update/print")
|
||||
if err != nil {
|
||||
return FirmwareInfo{
|
||||
InstalledVersion: installedVer,
|
||||
Architecture: arch,
|
||||
Status: "check-failed",
|
||||
}, nil
|
||||
}
|
||||
|
||||
if len(reply.Re) == 0 {
|
||||
return FirmwareInfo{
|
||||
InstalledVersion: installedVer,
|
||||
Architecture: arch,
|
||||
Status: "check-failed",
|
||||
}, nil
|
||||
}
|
||||
|
||||
m := reply.Re[0].Map
|
||||
|
||||
info := FirmwareInfo{
|
||||
InstalledVersion: m["installed-version"],
|
||||
LatestVersion: m["latest-version"],
|
||||
Channel: m["channel"],
|
||||
Status: m["status"],
|
||||
Architecture: arch,
|
||||
}
|
||||
|
||||
// Use the resource-detected values as fallback.
|
||||
if info.InstalledVersion == "" {
|
||||
info.InstalledVersion = installedVer
|
||||
}
|
||||
|
||||
slog.Debug("firmware update check complete",
|
||||
"installed", info.InstalledVersion,
|
||||
"latest", info.LatestVersion,
|
||||
"channel", info.Channel,
|
||||
"status", info.Status,
|
||||
"architecture", info.Architecture,
|
||||
)
|
||||
|
||||
return info, nil
|
||||
}
|
||||
110
poller/internal/device/health.go
Normal file
110
poller/internal/device/health.go
Normal file
@@ -0,0 +1,110 @@
|
||||
package device
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
)
|
||||
|
||||
// HealthMetrics holds system resource metrics collected from a RouterOS device.
// String fields match the raw RouterOS API values so the subscriber can parse
// and validate them before inserting into TimescaleDB.
type HealthMetrics struct {
	CPULoad     string `json:"cpu_load"`     // raw cpu-load value from /system/resource
	FreeMemory  string `json:"free_memory"`  // raw free-memory value (bytes as string)
	TotalMemory string `json:"total_memory"` // raw total-memory value (bytes as string)
	FreeDisk    string `json:"free_disk"`    // raw free-hdd-space; empty when disk query fails
	TotalDisk   string `json:"total_disk"`   // raw total-hdd-space; empty when disk query fails
	Temperature string `json:"temperature"`  // empty string if device has no sensor
}
|
||||
|
||||
// CollectHealth gathers system health metrics for a RouterOS device.
|
||||
//
|
||||
// It combines data already present in DeviceInfo (CPU, memory) with additional
|
||||
// disk stats from /system/resource/print and temperature from /system/health/print.
|
||||
//
|
||||
// Temperature handling:
|
||||
// - RouterOS v7: /system/health/print returns rows with name/value columns;
|
||||
// looks for "cpu-temperature" then "board-temperature" as a fallback.
|
||||
// - RouterOS v6: /system/health/print returns a flat map; looks for
|
||||
// "cpu-temperature" key directly.
|
||||
// - If the command fails or no temperature key is found, Temperature is set to "".
|
||||
func CollectHealth(client *routeros.Client, info DeviceInfo) (HealthMetrics, error) {
|
||||
health := HealthMetrics{
|
||||
CPULoad: info.CPULoad,
|
||||
FreeMemory: info.FreeMemory,
|
||||
TotalMemory: info.TotalMemory,
|
||||
}
|
||||
|
||||
// Collect disk stats (not included in the default /system/resource/print proplist
|
||||
// used by DetectVersion, so we query explicitly here).
|
||||
diskReply, err := client.Run(
|
||||
"/system/resource/print",
|
||||
"=.proplist=free-hdd-space,total-hdd-space",
|
||||
)
|
||||
if err != nil {
|
||||
slog.Warn("could not collect disk stats", "error", err)
|
||||
} else if len(diskReply.Re) > 0 {
|
||||
m := diskReply.Re[0].Map
|
||||
health.FreeDisk = m["free-hdd-space"]
|
||||
health.TotalDisk = m["total-hdd-space"]
|
||||
}
|
||||
|
||||
// Collect temperature from /system/health/print.
|
||||
// This command may not exist on all devices, so errors are non-fatal.
|
||||
health.Temperature = collectTemperature(client, info.MajorVersion)
|
||||
|
||||
return health, nil
|
||||
}
|
||||
|
||||
// collectTemperature queries /system/health/print and extracts the temperature
|
||||
// reading. Returns an empty string if the device has no temperature sensor or
|
||||
// the command is not supported.
|
||||
func collectTemperature(client *routeros.Client, majorVersion int) string {
|
||||
reply, err := client.Run("/system/health/print")
|
||||
if err != nil {
|
||||
slog.Debug("temperature collection not available", "error", err)
|
||||
return ""
|
||||
}
|
||||
|
||||
if len(reply.Re) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// RouterOS v7 returns rows with "name" and "value" columns.
|
||||
// RouterOS v6 returns a flat map in a single sentence.
|
||||
if majorVersion >= 7 {
|
||||
// v7: iterate rows looking for known temperature keys.
|
||||
var fallback string
|
||||
for _, sentence := range reply.Re {
|
||||
m := sentence.Map
|
||||
name := m["name"]
|
||||
value := m["value"]
|
||||
if name == "cpu-temperature" {
|
||||
return value
|
||||
}
|
||||
if name == "board-temperature" {
|
||||
fallback = value
|
||||
}
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
|
||||
// v6 (or unknown version): flat map — look for cpu-temperature key directly.
|
||||
m := reply.Re[0].Map
|
||||
if temp, ok := m["cpu-temperature"]; ok {
|
||||
return temp
|
||||
}
|
||||
if temp, ok := m["board-temperature"]; ok {
|
||||
return temp
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// collectHealthError wraps err with context for CollectHealth callers when the
// primary resource query fails completely.
func collectHealthError(err error) error {
	return fmt.Errorf("collecting health metrics: %w", err)
}
|
||||
61
poller/internal/device/interfaces.go
Normal file
61
poller/internal/device/interfaces.go
Normal file
@@ -0,0 +1,61 @@
|
||||
// Package device provides RouterOS metric collectors for the poller.
|
||||
package device
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
)
|
||||
|
||||
// InterfaceStats holds the traffic counters for a single RouterOS interface.
type InterfaceStats struct {
	Name    string `json:"name"`     // interface name, e.g. "ether1"
	RxBytes int64  `json:"rx_bytes"` // cumulative received bytes; 0 if the counter was unparseable
	TxBytes int64  `json:"tx_bytes"` // cumulative transmitted bytes; 0 if the counter was unparseable
	Running bool   `json:"running"`  // true when RouterOS reports running=true
	Type    string `json:"type"`     // interface type as reported by RouterOS
}
|
||||
|
||||
// CollectInterfaces queries the RouterOS device for per-interface traffic
|
||||
// counters via /interface/print.
|
||||
//
|
||||
// Returns a slice of InterfaceStats. On error, returns an empty slice and the
|
||||
// error — the caller decides whether to skip the device or log a warning.
|
||||
func CollectInterfaces(client *routeros.Client) ([]InterfaceStats, error) {
|
||||
reply, err := client.Run(
|
||||
"/interface/print",
|
||||
"=.proplist=name,rx-byte,tx-byte,running,type",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("running /interface/print: %w", err)
|
||||
}
|
||||
|
||||
stats := make([]InterfaceStats, 0, len(reply.Re))
|
||||
for _, sentence := range reply.Re {
|
||||
m := sentence.Map
|
||||
|
||||
rxBytes, err := strconv.ParseInt(m["rx-byte"], 10, 64)
|
||||
if err != nil {
|
||||
slog.Warn("could not parse rx-byte for interface", "interface", m["name"], "value", m["rx-byte"])
|
||||
rxBytes = 0
|
||||
}
|
||||
|
||||
txBytes, err := strconv.ParseInt(m["tx-byte"], 10, 64)
|
||||
if err != nil {
|
||||
slog.Warn("could not parse tx-byte for interface", "interface", m["name"], "value", m["tx-byte"])
|
||||
txBytes = 0
|
||||
}
|
||||
|
||||
stats = append(stats, InterfaceStats{
|
||||
Name: m["name"],
|
||||
RxBytes: rxBytes,
|
||||
TxBytes: txBytes,
|
||||
Running: m["running"] == "true",
|
||||
Type: m["type"],
|
||||
})
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
53
poller/internal/device/sftp.go
Normal file
53
poller/internal/device/sftp.go
Normal file
@@ -0,0 +1,53 @@
|
||||
// Package device provides SFTP file upload helpers for RouterOS devices.
|
||||
//
|
||||
// RouterOS has a built-in SSH/SFTP server (port 22) that accepts the same
|
||||
// credentials as the API. Since the RouterOS binary API cannot upload files,
|
||||
// SFTP is used to push certificate PEM files before importing them.
|
||||
package device
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/pkg/sftp"
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
// NewSSHClient creates an SSH connection to a RouterOS device.
|
||||
// Uses password authentication (same credentials as API access).
|
||||
func NewSSHClient(ip string, port int, username, password string, timeout time.Duration) (*ssh.Client, error) {
|
||||
config := &ssh.ClientConfig{
|
||||
User: username,
|
||||
Auth: []ssh.AuthMethod{
|
||||
ssh.Password(password),
|
||||
},
|
||||
HostKeyCallback: ssh.InsecureIgnoreHostKey(), //nolint:gosec // RouterOS self-signed SSH
|
||||
Timeout: timeout,
|
||||
}
|
||||
addr := fmt.Sprintf("%s:%d", ip, port)
|
||||
client, err := ssh.Dial("tcp", addr, config)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("SSH dial to %s: %w", addr, err)
|
||||
}
|
||||
return client, nil
|
||||
}
|
||||
|
||||
// UploadFile uploads data to a file on the RouterOS device via SFTP.
|
||||
func UploadFile(sshClient *ssh.Client, remotePath string, data []byte) error {
|
||||
client, err := sftp.NewClient(sshClient)
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating SFTP client: %w", err)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
f, err := client.Create(remotePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating remote file %s: %w", remotePath, err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if _, err := f.Write(data); err != nil {
|
||||
return fmt.Errorf("writing to %s: %w", remotePath, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
86
poller/internal/device/version.go
Normal file
86
poller/internal/device/version.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package device
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
)
|
||||
|
||||
// DeviceInfo holds metadata collected from /system/resource/print and
// /system/routerboard/print.
type DeviceInfo struct {
	Version          string // full RouterOS version string, e.g. "7.12"
	MajorVersion     int    // leading digit of Version (6 or 7); 0 if unparseable
	BoardName        string // board-name from /system/resource
	Architecture     string // architecture-name from /system/resource
	Uptime           string // raw uptime string from /system/resource
	CPULoad          string // raw cpu-load value from /system/resource
	FreeMemory       string // raw free-memory value from /system/resource
	TotalMemory      string // raw total-memory value from /system/resource
	SerialNumber     string // from /system/routerboard serial-number
	FirmwareVersion  string // from /system/routerboard current-firmware
	LastConfigChange string // from /system/resource last-config-change (RouterOS 7.x)
}
|
||||
|
||||
// DetectVersion queries the RouterOS device for system resource information.
|
||||
//
|
||||
// Runs /system/resource/print and parses the response into DeviceInfo.
|
||||
// The major version is extracted from the first character of the version string
|
||||
// (e.g. "6.49.10" -> 6, "7.12" -> 7).
|
||||
func DetectVersion(c *routeros.Client) (DeviceInfo, error) {
|
||||
reply, err := c.Run("/system/resource/print")
|
||||
if err != nil {
|
||||
return DeviceInfo{}, fmt.Errorf("running /system/resource/print: %w", err)
|
||||
}
|
||||
|
||||
if len(reply.Re) == 0 {
|
||||
return DeviceInfo{}, fmt.Errorf("/system/resource/print returned no sentences")
|
||||
}
|
||||
|
||||
m := reply.Re[0].Map
|
||||
|
||||
info := DeviceInfo{
|
||||
Version: m["version"],
|
||||
BoardName: m["board-name"],
|
||||
Architecture: m["architecture-name"],
|
||||
Uptime: m["uptime"],
|
||||
CPULoad: m["cpu-load"],
|
||||
FreeMemory: m["free-memory"],
|
||||
TotalMemory: m["total-memory"],
|
||||
LastConfigChange: m["last-config-change"],
|
||||
}
|
||||
|
||||
// Extract major version from first character of version string.
|
||||
// Valid RouterOS versions start with '6' or '7'.
|
||||
if len(info.Version) > 0 {
|
||||
firstChar := info.Version[0]
|
||||
if firstChar >= '0' && firstChar <= '9' {
|
||||
info.MajorVersion = int(firstChar - '0')
|
||||
} else {
|
||||
slog.Warn("unexpected RouterOS version format", "version", info.Version)
|
||||
info.MajorVersion = 0
|
||||
}
|
||||
}
|
||||
|
||||
// Query routerboard info for serial number and firmware version.
|
||||
// Non-fatal: CHR and x86 devices don't have a routerboard.
|
||||
rbReply, rbErr := c.Run("/system/routerboard/print")
|
||||
if rbErr == nil && len(rbReply.Re) > 0 {
|
||||
rb := rbReply.Re[0].Map
|
||||
info.SerialNumber = rb["serial-number"]
|
||||
info.FirmwareVersion = rb["current-firmware"]
|
||||
} else if rbErr != nil {
|
||||
slog.Debug("routerboard query failed (normal for CHR/x86)", "error", rbErr)
|
||||
}
|
||||
|
||||
slog.Debug("detected RouterOS version",
|
||||
"version", info.Version,
|
||||
"major_version", info.MajorVersion,
|
||||
"board_name", info.BoardName,
|
||||
"serial", info.SerialNumber,
|
||||
"firmware", info.FirmwareVersion,
|
||||
)
|
||||
|
||||
return info, nil
|
||||
}
|
||||
145
poller/internal/device/wireless.go
Normal file
145
poller/internal/device/wireless.go
Normal file
@@ -0,0 +1,145 @@
|
||||
package device
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"strconv"
|
||||
|
||||
routeros "github.com/go-routeros/routeros/v3"
|
||||
)
|
||||
|
||||
// WirelessStats holds aggregated wireless metrics for a single wireless interface.
// Metrics are aggregated across all registered clients on that interface.
type WirelessStats struct {
	Interface   string `json:"interface"`    // wireless interface name
	ClientCount int    `json:"client_count"` // number of registration-table entries on this interface
	AvgSignal   int    `json:"avg_signal"`   // dBm (negative), e.g. -67
	CCQ         int    `json:"ccq"`          // 0–100 percentage; 0 if not available (v7)
	Frequency   int    `json:"frequency"`    // MHz
}
|
||||
|
||||
// CollectWireless queries the RouterOS device for wireless registration-table
|
||||
// entries and aggregates them per interface.
|
||||
//
|
||||
// Version routing:
|
||||
// - majorVersion >= 7: tries /interface/wifi/registration-table/print first;
|
||||
// falls back to /interface/wireless/registration-table/print if that fails.
|
||||
// - majorVersion < 7 (including 0 for unknown): uses the classic wireless path.
|
||||
//
|
||||
// Returns an empty slice (not an error) when the device has no wireless interfaces.
|
||||
func CollectWireless(client *routeros.Client, majorVersion int) ([]WirelessStats, error) {
|
||||
var registrations []map[string]string
|
||||
var useV7WiFi bool
|
||||
|
||||
if majorVersion >= 7 {
|
||||
// Try the v7 WiFi API first.
|
||||
regReply, err := client.Run("/interface/wifi/registration-table/print")
|
||||
if err == nil {
|
||||
useV7WiFi = true
|
||||
for _, s := range regReply.Re {
|
||||
registrations = append(registrations, s.Map)
|
||||
}
|
||||
} else {
|
||||
slog.Debug("v7 wifi registration-table not available, falling back to wireless", "error", err)
|
||||
// Fall back to classic wireless path.
|
||||
regReply, err = client.Run("/interface/wireless/registration-table/print")
|
||||
if err != nil {
|
||||
slog.Debug("device has no wireless interfaces", "error", err)
|
||||
return nil, nil
|
||||
}
|
||||
for _, s := range regReply.Re {
|
||||
registrations = append(registrations, s.Map)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
regReply, err := client.Run("/interface/wireless/registration-table/print")
|
||||
if err != nil {
|
||||
slog.Debug("device has no wireless interfaces", "error", err)
|
||||
return nil, nil
|
||||
}
|
||||
for _, s := range regReply.Re {
|
||||
registrations = append(registrations, s.Map)
|
||||
}
|
||||
}
|
||||
|
||||
if len(registrations) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Collect frequency per interface so we can include it in the stats.
|
||||
frequencies := collectWirelessFrequencies(client, majorVersion, useV7WiFi)
|
||||
|
||||
// Aggregate registration-table rows per interface.
|
||||
type ifaceAgg struct {
|
||||
count int
|
||||
signal int
|
||||
ccq int
|
||||
}
|
||||
|
||||
agg := make(map[string]*ifaceAgg)
|
||||
for _, r := range registrations {
|
||||
iface := r["interface"]
|
||||
if iface == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := agg[iface]; !ok {
|
||||
agg[iface] = &ifaceAgg{}
|
||||
}
|
||||
a := agg[iface]
|
||||
a.count++
|
||||
|
||||
if sig, err := strconv.Atoi(r["signal-strength"]); err == nil {
|
||||
a.signal += sig
|
||||
}
|
||||
if ccq, err := strconv.Atoi(r["tx-ccq"]); err == nil {
|
||||
a.ccq += ccq
|
||||
}
|
||||
}
|
||||
|
||||
result := make([]WirelessStats, 0, len(agg))
|
||||
for iface, a := range agg {
|
||||
avgSignal := 0
|
||||
avgCCQ := 0
|
||||
if a.count > 0 {
|
||||
avgSignal = a.signal / a.count
|
||||
avgCCQ = a.ccq / a.count
|
||||
}
|
||||
result = append(result, WirelessStats{
|
||||
Interface: iface,
|
||||
ClientCount: a.count,
|
||||
AvgSignal: avgSignal,
|
||||
CCQ: avgCCQ,
|
||||
Frequency: frequencies[iface],
|
||||
})
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// collectWirelessFrequencies returns a map of interface name → frequency (MHz).
|
||||
// Uses the v7 WiFi API or the classic wireless API based on the useV7WiFi flag.
|
||||
func collectWirelessFrequencies(client *routeros.Client, majorVersion int, useV7WiFi bool) map[string]int {
|
||||
freqs := make(map[string]int)
|
||||
|
||||
var cmd string
|
||||
if useV7WiFi {
|
||||
cmd = "/interface/wifi/print"
|
||||
} else {
|
||||
cmd = "/interface/wireless/print"
|
||||
}
|
||||
|
||||
reply, err := client.Run(cmd, "=.proplist=name,frequency")
|
||||
if err != nil {
|
||||
slog.Debug("could not collect wireless frequencies", "command", cmd, "error", err)
|
||||
return freqs
|
||||
}
|
||||
|
||||
for _, s := range reply.Re {
|
||||
m := s.Map
|
||||
name := m["name"]
|
||||
if freq, err := strconv.Atoi(m["frequency"]); err == nil {
|
||||
freqs[name] = freq
|
||||
}
|
||||
}
|
||||
|
||||
return freqs
|
||||
}
|
||||
60
poller/internal/observability/metrics.go
Normal file
60
poller/internal/observability/metrics.go
Normal file
@@ -0,0 +1,60 @@
|
||||
// Package observability provides Prometheus metrics and health endpoints for the poller.
|
||||
package observability
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
)
|
||||
|
||||
// PollDuration tracks the duration of individual device poll cycles.
|
||||
var PollDuration = promauto.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "mikrotik_poll_duration_seconds",
|
||||
Help: "Duration of a single device poll cycle in seconds.",
|
||||
Buckets: []float64{0.5, 1, 2, 5, 10, 30, 60},
|
||||
})
|
||||
|
||||
// PollTotal counts the total number of poll cycles by status.
|
||||
// Status labels: "success", "error", "skipped".
|
||||
var PollTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "mikrotik_poll_total",
|
||||
Help: "Total number of poll cycles.",
|
||||
}, []string{"status"})
|
||||
|
||||
// DevicesActive tracks the number of devices currently being polled.
|
||||
var DevicesActive = promauto.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "mikrotik_devices_active",
|
||||
Help: "Number of devices currently being polled.",
|
||||
})
|
||||
|
||||
// DeviceConnectionErrors counts total device connection failures.
|
||||
var DeviceConnectionErrors = promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "mikrotik_device_connection_errors_total",
|
||||
Help: "Total device connection failures.",
|
||||
})
|
||||
|
||||
// NATSPublishTotal counts NATS publish operations by subject and status.
|
||||
// Subject labels: "status", "metrics", "firmware".
|
||||
// Status labels: "success", "error".
|
||||
var NATSPublishTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "mikrotik_nats_publish_total",
|
||||
Help: "Total NATS publish operations.",
|
||||
}, []string{"subject", "status"})
|
||||
|
||||
// RedisLockTotal counts Redis lock operations by status.
|
||||
// Status labels: "obtained", "not_obtained", "error".
|
||||
var RedisLockTotal = promauto.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "mikrotik_redis_lock_total",
|
||||
Help: "Total Redis lock operations.",
|
||||
}, []string{"status"})
|
||||
|
||||
// CircuitBreakerSkips counts polls skipped due to circuit breaker backoff.
|
||||
var CircuitBreakerSkips = promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "mikrotik_circuit_breaker_skips_total",
|
||||
Help: "Total polls skipped because the device is in circuit breaker backoff.",
|
||||
})
|
||||
|
||||
// CircuitBreakerResets counts circuit breaker resets (device recovered after failures).
|
||||
var CircuitBreakerResets = promauto.NewCounter(prometheus.CounterOpts{
|
||||
Name: "mikrotik_circuit_breaker_resets_total",
|
||||
Help: "Total circuit breaker resets when a device recovers.",
|
||||
})
|
||||
59
poller/internal/observability/server.go
Normal file
59
poller/internal/observability/server.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package observability
|
||||
|
||||
import (
	"context"
	"errors"
	"log/slog"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)
|
||||
|
||||
// StartServer starts an HTTP server for Prometheus metrics and health checks.
|
||||
//
|
||||
// The server exposes:
|
||||
// - GET /metrics — Prometheus metrics endpoint
|
||||
// - GET /health — Liveness probe (returns 200 with {"status":"ok"})
|
||||
//
|
||||
// The server shuts down gracefully when ctx is cancelled. It runs in a
|
||||
// goroutine and does not block the caller.
|
||||
func StartServer(ctx context.Context, addr string) *http.Server {
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle("/metrics", promhttp.Handler())
|
||||
mux.HandleFunc("/health", healthHandler)
|
||||
|
||||
srv := &http.Server{
|
||||
Addr: addr,
|
||||
Handler: mux,
|
||||
ReadHeaderTimeout: 5 * time.Second,
|
||||
}
|
||||
|
||||
// Start serving in a goroutine.
|
||||
go func() {
|
||||
slog.Info("observability server starting", "addr", addr)
|
||||
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
slog.Error("observability server error", "error", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Graceful shutdown when context is cancelled.
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
slog.Info("observability server shutting down")
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := srv.Shutdown(shutdownCtx); err != nil {
|
||||
slog.Error("observability server shutdown error", "error", err)
|
||||
}
|
||||
slog.Info("observability server stopped")
|
||||
}()
|
||||
|
||||
return srv
|
||||
}
|
||||
|
||||
// healthHandler returns a simple liveness response.
|
||||
func healthHandler(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"status":"ok"}`))
|
||||
}
|
||||
195
poller/internal/poller/integration_test.go
Normal file
195
poller/internal/poller/integration_test.go
Normal file
@@ -0,0 +1,195 @@
|
||||
package poller_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/bsm/redislock"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
goredis "github.com/redis/go-redis/v9"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/bus"
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/testutil"
|
||||
)
|
||||
|
||||
// TestPollPublishConsumeCycle_Integration verifies the complete pipeline:
|
||||
//
|
||||
// 1. DeviceStore reads devices from real PostgreSQL
|
||||
// 2. Publisher sends status events through real NATS JetStream
|
||||
// 3. A NATS consumer receives the events with correct data
|
||||
// 4. Redis distributed lock can be obtained and released
|
||||
//
|
||||
// The actual PollDevice function requires a real RouterOS device, so we test
|
||||
// the integration seams individually and verify they compose correctly.
|
||||
func TestPollPublishConsumeCycle_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
tenantID := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
|
||||
dummyCreds := []byte("dummy-encrypted-credentials")
|
||||
|
||||
// --- Phase 1: PostgreSQL + DeviceStore ---
|
||||
connStr, pgCleanup := testutil.SetupPostgres(t)
|
||||
defer pgCleanup()
|
||||
|
||||
v7 := "7.16"
|
||||
major7 := 7
|
||||
deviceID := testutil.InsertTestDevice(t, connStr, store.Device{
|
||||
TenantID: tenantID,
|
||||
IPAddress: "10.0.0.1",
|
||||
APIPort: 8728,
|
||||
APISSLPort: 8729,
|
||||
EncryptedCredentials: dummyCreds,
|
||||
RouterOSVersion: &v7,
|
||||
MajorVersion: &major7,
|
||||
})
|
||||
|
||||
ds, err := store.NewDeviceStore(ctx, connStr)
|
||||
require.NoError(t, err)
|
||||
defer ds.Close()
|
||||
|
||||
devices, err := ds.FetchDevices(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, devices, 1)
|
||||
assert.Equal(t, deviceID, devices[0].ID)
|
||||
assert.Equal(t, tenantID, devices[0].TenantID)
|
||||
|
||||
// --- Phase 2: NATS + Publisher ---
|
||||
natsURL, natsCleanup := testutil.SetupNATS(t)
|
||||
defer natsCleanup()
|
||||
|
||||
pub, err := bus.NewPublisher(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer pub.Close()
|
||||
|
||||
// Create a consumer to verify events.
|
||||
nc, err := nats.Connect(natsURL)
|
||||
require.NoError(t, err)
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
require.NoError(t, err)
|
||||
|
||||
cons, err := js.CreateOrUpdateConsumer(ctx, "DEVICE_EVENTS", jetstream.ConsumerConfig{
|
||||
FilterSubject: "device.status.>",
|
||||
AckPolicy: jetstream.AckNonePolicy,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Simulate what PollDevice does after connecting to a device:
|
||||
// publish a status event with data from the fetched device.
|
||||
dev := devices[0]
|
||||
statusEvent := bus.DeviceStatusEvent{
|
||||
DeviceID: dev.ID,
|
||||
TenantID: dev.TenantID,
|
||||
Status: "online",
|
||||
LastSeen: time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
err = pub.PublishStatus(ctx, statusEvent)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify consumer receives the event.
|
||||
msgBatch, err := cons.Fetch(1, jetstream.FetchMaxWait(5*time.Second))
|
||||
require.NoError(t, err)
|
||||
|
||||
var received *jetstream.Msg
|
||||
for msg := range msgBatch.Messages() {
|
||||
received = &msg
|
||||
break
|
||||
}
|
||||
require.NotNil(t, received, "consumer should receive the status event")
|
||||
|
||||
var got bus.DeviceStatusEvent
|
||||
err = json.Unmarshal((*received).Data(), &got)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, dev.ID, got.DeviceID)
|
||||
assert.Equal(t, dev.TenantID, got.TenantID)
|
||||
assert.Equal(t, "online", got.Status)
|
||||
|
||||
// --- Phase 3: Redis distributed lock ---
|
||||
redisAddr, redisCleanup := testutil.SetupRedis(t)
|
||||
defer redisCleanup()
|
||||
|
||||
rdb := goredis.NewClient(&goredis.Options{Addr: redisAddr})
|
||||
defer rdb.Close()
|
||||
|
||||
locker := redislock.New(rdb)
|
||||
|
||||
lockKey := "poll:device:" + dev.ID
|
||||
lock, err := locker.Obtain(ctx, lockKey, 10*time.Second, nil)
|
||||
require.NoError(t, err, "should obtain Redis distributed lock")
|
||||
|
||||
// A second attempt should fail (lock held).
|
||||
_, err = locker.Obtain(ctx, lockKey, 10*time.Second, nil)
|
||||
assert.ErrorIs(t, err, redislock.ErrNotObtained, "second lock attempt should fail")
|
||||
|
||||
// Release and re-obtain.
|
||||
err = lock.Release(ctx)
|
||||
require.NoError(t, err, "should release lock")
|
||||
|
||||
lock2, err := locker.Obtain(ctx, lockKey, 10*time.Second, nil)
|
||||
require.NoError(t, err, "should re-obtain lock after release")
|
||||
_ = lock2.Release(ctx)
|
||||
}
|
||||
|
||||
// TestSchedulerReconcile_WithRealDB_Integration verifies that the Scheduler's
|
||||
// reconciliation loop correctly starts and stops device polling goroutines
|
||||
// when backed by a real PostgreSQL database.
|
||||
//
|
||||
// We test this by running the Scheduler for a brief period and verifying it
|
||||
// fetches devices and starts goroutines. Since PollDevice requires real
|
||||
// RouterOS hardware, the goroutines will fail on the poll cycle (no device to
|
||||
// connect to), but the scheduler's reconciliation logic is the integration
|
||||
// point we are testing here.
|
||||
func TestSchedulerReconcile_WithRealDB_Integration(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
tenantID := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
|
||||
dummyCreds := []byte("dummy-encrypted-credentials")
|
||||
|
||||
connStr, pgCleanup := testutil.SetupPostgres(t)
|
||||
defer pgCleanup()
|
||||
|
||||
// Insert 2 devices.
|
||||
id1 := testutil.InsertTestDevice(t, connStr, store.Device{
|
||||
TenantID: tenantID,
|
||||
IPAddress: "10.0.0.1",
|
||||
APIPort: 8728,
|
||||
APISSLPort: 8729,
|
||||
EncryptedCredentials: dummyCreds,
|
||||
})
|
||||
id2 := testutil.InsertTestDevice(t, connStr, store.Device{
|
||||
TenantID: tenantID,
|
||||
IPAddress: "10.0.0.2",
|
||||
APIPort: 8728,
|
||||
APISSLPort: 8729,
|
||||
EncryptedCredentials: dummyCreds,
|
||||
})
|
||||
|
||||
ds, err := store.NewDeviceStore(ctx, connStr)
|
||||
require.NoError(t, err)
|
||||
defer ds.Close()
|
||||
|
||||
// Verify DeviceStore returns both devices (integration seam check).
|
||||
devices, err := ds.FetchDevices(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, devices, 2)
|
||||
|
||||
returnedIDs := make(map[string]bool)
|
||||
for _, d := range devices {
|
||||
returnedIDs[d.ID] = true
|
||||
}
|
||||
assert.True(t, returnedIDs[id1], "device 1 should be fetched from real DB")
|
||||
assert.True(t, returnedIDs[id2], "device 2 should be fetched from real DB")
|
||||
}
|
||||
14
poller/internal/poller/interfaces.go
Normal file
14
poller/internal/poller/interfaces.go
Normal file
@@ -0,0 +1,14 @@
|
||||
package poller
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
)
|
||||
|
||||
// DeviceFetcher is the subset of store.DeviceStore that the Scheduler needs.
// Defined here (consumer-side) following Go interface best practices.
// The concrete *store.DeviceStore automatically satisfies this interface.
type DeviceFetcher interface {
	// FetchDevices returns the current set of devices to poll.
	FetchDevices(ctx context.Context) ([]store.Device, error)
}
|
||||
264
poller/internal/poller/scheduler.go
Normal file
264
poller/internal/poller/scheduler.go
Normal file
@@ -0,0 +1,264 @@
|
||||
package poller
|
||||
|
||||
import (
	"context"
	"errors"
	"log/slog"
	"sync"
	"time"

	"github.com/bsm/redislock"

	"github.com/mikrotik-portal/poller/internal/bus"
	"github.com/mikrotik-portal/poller/internal/observability"
	"github.com/mikrotik-portal/poller/internal/store"
	"github.com/mikrotik-portal/poller/internal/vault"
)
|
||||
|
||||
// deviceState tracks per-device circuit breaker and lifecycle state.
// It is created by reconcileDevices and mutated only by that device's
// runDeviceLoop goroutine (cancel is called from the scheduler on removal).
type deviceState struct {
	cancel              context.CancelFunc // stops this device's polling goroutine
	consecutiveFailures int                // failed polls since the last success
	backoffUntil        time.Time          // zero when not in backoff; ticks before this time are skipped
}
|
||||
|
||||
// Scheduler manages the lifecycle of per-device polling goroutines.
//
// It periodically re-queries the database to discover new devices (starting goroutines)
// and detect removed devices (stopping goroutines). Each device has exactly one
// polling goroutine running at a time.
//
// Circuit breaker: after consecutive connection failures, a device enters exponential
// backoff. The device loop skips poll ticks during backoff. On successful poll, the
// circuit breaker resets and the device resumes normal polling.
type Scheduler struct {
	store           DeviceFetcher          // source of the device list
	locker          *redislock.Client      // distributed lock client, passed through to PollDevice
	publisher       *bus.Publisher         // NATS publisher, passed through to PollDevice
	credentialCache *vault.CredentialCache // decrypts device credentials, passed through to PollDevice
	pollInterval    time.Duration          // tick interval of each device's poll loop
	connTimeout     time.Duration          // device connection timeout per poll
	cmdTimeout      time.Duration          // per-RouterOS-command timeout
	refreshPeriod   time.Duration          // how often Run re-reconciles against the DB

	// Circuit breaker configuration.
	maxFailures int           // consecutive failures before backoff starts
	baseBackoff time.Duration // initial backoff duration; doubles per further failure
	maxBackoff  time.Duration // cap on the exponential backoff

	// activeDevices maps device ID to per-device state.
	mu            sync.Mutex // guards activeDevices
	activeDevices map[string]*deviceState
}
|
||||
|
||||
// NewScheduler creates a Scheduler with the provided dependencies.
|
||||
func NewScheduler(
|
||||
store DeviceFetcher,
|
||||
locker *redislock.Client,
|
||||
publisher *bus.Publisher,
|
||||
credentialCache *vault.CredentialCache,
|
||||
pollInterval time.Duration,
|
||||
connTimeout time.Duration,
|
||||
cmdTimeout time.Duration,
|
||||
refreshPeriod time.Duration,
|
||||
maxFailures int,
|
||||
baseBackoff time.Duration,
|
||||
maxBackoff time.Duration,
|
||||
) *Scheduler {
|
||||
return &Scheduler{
|
||||
store: store,
|
||||
locker: locker,
|
||||
publisher: publisher,
|
||||
credentialCache: credentialCache,
|
||||
pollInterval: pollInterval,
|
||||
connTimeout: connTimeout,
|
||||
cmdTimeout: cmdTimeout,
|
||||
refreshPeriod: refreshPeriod,
|
||||
maxFailures: maxFailures,
|
||||
baseBackoff: baseBackoff,
|
||||
maxBackoff: maxBackoff,
|
||||
activeDevices: make(map[string]*deviceState),
|
||||
}
|
||||
}
|
||||
|
||||
// Run is the main scheduler loop. It:
//  1. Fetches devices from the database.
//  2. Starts goroutines for newly-discovered devices.
//  3. Stops goroutines for devices no longer in the database.
//  4. Sleeps for refreshPeriod, then repeats.
//  5. Cancels all goroutines when ctx is cancelled (graceful shutdown).
//
// Run blocks until ctx is cancelled, then waits for all goroutines to finish.
func (s *Scheduler) Run(ctx context.Context) error {
	// wg counts every per-device goroutine started by reconcileDevices so the
	// shutdown path below can wait for all of them to drain.
	var wg sync.WaitGroup

	defer func() {
		// On shutdown, cancel all active device goroutines and wait for them.
		// wg.Wait() is deliberately outside the mutex: device loops never take
		// s.mu, but holding it across Wait would be an unnecessary hazard.
		s.mu.Lock()
		for id, ds := range s.activeDevices {
			slog.Info("stopping device goroutine", "device_id", id)
			ds.cancel()
		}
		s.mu.Unlock()
		wg.Wait()
		slog.Info("scheduler shutdown complete")
	}()

	for {
		if err := s.reconcileDevices(ctx, &wg); err != nil {
			slog.Error("device reconciliation failed", "error", err)
			// Continue — a transient DB error should not crash the scheduler.
		}

		select {
		case <-ctx.Done():
			slog.Info("scheduler context cancelled — shutting down")
			return nil
		case <-time.After(s.refreshPeriod):
			// Next reconciliation cycle.
		}
	}
}
|
||||
|
||||
// reconcileDevices fetches the current device list from the DB and starts/stops
|
||||
// goroutines as needed to keep the active set in sync.
|
||||
func (s *Scheduler) reconcileDevices(ctx context.Context, wg *sync.WaitGroup) error {
|
||||
devices, err := s.store.FetchDevices(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Build a set of current device IDs for quick lookup.
|
||||
currentIDs := make(map[string]struct{}, len(devices))
|
||||
for _, d := range devices {
|
||||
currentIDs[d.ID] = struct{}{}
|
||||
}
|
||||
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
// Start goroutines for newly-discovered devices.
|
||||
for _, dev := range devices {
|
||||
if _, active := s.activeDevices[dev.ID]; !active {
|
||||
devCopy := dev // capture loop variable
|
||||
devCtx, cancel := context.WithCancel(ctx)
|
||||
ds := &deviceState{cancel: cancel}
|
||||
s.activeDevices[dev.ID] = ds
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
s.runDeviceLoop(devCtx, devCopy, ds)
|
||||
}()
|
||||
|
||||
slog.Info("started polling goroutine", "device_id", dev.ID, "ip", dev.IPAddress)
|
||||
}
|
||||
}
|
||||
|
||||
// Stop goroutines for devices that are no longer in the database.
|
||||
for id, ds := range s.activeDevices {
|
||||
if _, exists := currentIDs[id]; !exists {
|
||||
slog.Info("stopping goroutine for removed device", "device_id", id)
|
||||
ds.cancel()
|
||||
delete(s.activeDevices, id)
|
||||
}
|
||||
}
|
||||
|
||||
// Update Prometheus gauge with current active device count.
|
||||
observability.DevicesActive.Set(float64(len(s.activeDevices)))
|
||||
|
||||
slog.Debug("device reconciliation complete",
|
||||
"total_devices", len(devices),
|
||||
"active_goroutines", len(s.activeDevices),
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// runDeviceLoop is the per-device polling loop. It ticks at pollInterval and
|
||||
// calls PollDevice synchronously on each tick (not in a sub-goroutine, to avoid
|
||||
// unbounded goroutine growth if polls are slow).
|
||||
//
|
||||
// Circuit breaker: when consecutive failures exceed maxFailures, the device enters
|
||||
// exponential backoff. Poll ticks during backoff are skipped. On success, the
|
||||
// circuit breaker resets.
|
||||
func (s *Scheduler) runDeviceLoop(ctx context.Context, dev store.Device, ds *deviceState) {
|
||||
// lockTTL gives the poll cycle time to complete: interval + connection timeout + 15s margin.
|
||||
lockTTL := s.pollInterval + s.connTimeout + 15*time.Second
|
||||
|
||||
ticker := time.NewTicker(s.pollInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
slog.Debug("device poll loop started", "device_id", dev.ID, "poll_interval", s.pollInterval)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Debug("device poll loop stopping", "device_id", dev.ID)
|
||||
return
|
||||
|
||||
case <-ticker.C:
|
||||
// Circuit breaker: skip poll if device is in backoff period.
|
||||
if time.Now().Before(ds.backoffUntil) {
|
||||
slog.Debug("circuit breaker: skipping poll (in backoff)",
|
||||
"device_id", dev.ID,
|
||||
"backoff_until", ds.backoffUntil.Format(time.RFC3339),
|
||||
"consecutive_failures", ds.consecutiveFailures,
|
||||
)
|
||||
observability.CircuitBreakerSkips.Inc()
|
||||
continue
|
||||
}
|
||||
|
||||
err := PollDevice(ctx, dev, s.locker, s.publisher, s.credentialCache, s.connTimeout, s.cmdTimeout, lockTTL)
|
||||
|
||||
if err != nil {
|
||||
ds.consecutiveFailures++
|
||||
|
||||
if ds.consecutiveFailures >= s.maxFailures {
|
||||
backoff := calculateBackoff(ds.consecutiveFailures, s.baseBackoff, s.maxBackoff)
|
||||
ds.backoffUntil = time.Now().Add(backoff)
|
||||
slog.Warn("circuit breaker: device entering backoff",
|
||||
"device_id", dev.ID,
|
||||
"ip", dev.IPAddress,
|
||||
"consecutive_failures", ds.consecutiveFailures,
|
||||
"backoff_duration", backoff,
|
||||
"backoff_until", ds.backoffUntil.Format(time.RFC3339),
|
||||
)
|
||||
}
|
||||
|
||||
// Only log as error if it's not a device-offline situation.
|
||||
if err != ErrDeviceOffline {
|
||||
slog.Error("poll cycle failed",
|
||||
"device_id", dev.ID,
|
||||
"ip", dev.IPAddress,
|
||||
"error", err,
|
||||
)
|
||||
}
|
||||
} else {
|
||||
// Success — reset circuit breaker if it was tripped.
|
||||
if ds.consecutiveFailures > 0 {
|
||||
slog.Info("circuit breaker: device recovered",
|
||||
"device_id", dev.ID,
|
||||
"ip", dev.IPAddress,
|
||||
"previous_failures", ds.consecutiveFailures,
|
||||
)
|
||||
observability.CircuitBreakerResets.Inc()
|
||||
ds.consecutiveFailures = 0
|
||||
ds.backoffUntil = time.Time{}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// calculateBackoff computes the exponential backoff duration for the given
|
||||
// number of consecutive failures: base * 2^(failures-1), capped at maxBackoff.
|
||||
func calculateBackoff(failures int, baseBackoff, maxBackoff time.Duration) time.Duration {
|
||||
if failures <= 1 {
|
||||
return baseBackoff
|
||||
}
|
||||
backoff := baseBackoff * time.Duration(1<<uint(failures-1))
|
||||
if backoff > maxBackoff || backoff < 0 { // negative check guards against overflow
|
||||
return maxBackoff
|
||||
}
|
||||
return backoff
|
||||
}
|
||||
184
poller/internal/poller/scheduler_test.go
Normal file
184
poller/internal/poller/scheduler_test.go
Normal file
@@ -0,0 +1,184 @@
|
||||
package poller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
// mockDeviceFetcher implements DeviceFetcher for testing.
|
||||
type mockDeviceFetcher struct {
|
||||
devices []store.Device
|
||||
err error
|
||||
}
|
||||
|
||||
func (m *mockDeviceFetcher) FetchDevices(ctx context.Context) ([]store.Device, error) {
|
||||
return m.devices, m.err
|
||||
}
|
||||
|
||||
// newTestScheduler creates a Scheduler with a mock DeviceFetcher for testing.
|
||||
// Uses nil for locker and publisher since reconcileDevices doesn't use them.
|
||||
func newTestScheduler(fetcher DeviceFetcher) *Scheduler {
|
||||
// Create a minimal credential cache for testing (no transit, no legacy key, no db).
|
||||
testCache := vault.NewCredentialCache(64, 5*time.Minute, nil, make([]byte, 32), nil)
|
||||
return &Scheduler{
|
||||
store: fetcher,
|
||||
locker: nil,
|
||||
publisher: nil,
|
||||
credentialCache: testCache,
|
||||
pollInterval: 24 * time.Hour, // Never fires during test
|
||||
connTimeout: time.Second,
|
||||
cmdTimeout: time.Second,
|
||||
refreshPeriod: time.Second,
|
||||
maxFailures: 5,
|
||||
baseBackoff: 30 * time.Second,
|
||||
maxBackoff: 15 * time.Minute,
|
||||
activeDevices: make(map[string]*deviceState),
|
||||
}
|
||||
}
|
||||
|
||||
func TestReconcileDevices_StartsNewDevices(t *testing.T) {
|
||||
devices := []store.Device{
|
||||
{ID: "dev-1", TenantID: "t-1", IPAddress: "192.168.1.1", APISSLPort: 8729},
|
||||
{ID: "dev-2", TenantID: "t-1", IPAddress: "192.168.1.2", APISSLPort: 8729},
|
||||
}
|
||||
fetcher := &mockDeviceFetcher{devices: devices}
|
||||
sched := newTestScheduler(fetcher)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
err := sched.reconcileDevices(ctx, &wg)
|
||||
require.NoError(t, err)
|
||||
|
||||
sched.mu.Lock()
|
||||
assert.Len(t, sched.activeDevices, 2)
|
||||
_, hasDev1 := sched.activeDevices["dev-1"]
|
||||
_, hasDev2 := sched.activeDevices["dev-2"]
|
||||
assert.True(t, hasDev1)
|
||||
assert.True(t, hasDev2)
|
||||
sched.mu.Unlock()
|
||||
|
||||
// Clean up: cancel context and wait for goroutines
|
||||
cancel()
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestReconcileDevices_StopsRemovedDevices(t *testing.T) {
|
||||
// Start with one active device
|
||||
sched := newTestScheduler(&mockDeviceFetcher{devices: []store.Device{}})
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// Manually add a device to activeDevices to simulate it was previously running
|
||||
devCtx, devCancel := context.WithCancel(ctx)
|
||||
sched.activeDevices["dev-removed"] = &deviceState{cancel: devCancel}
|
||||
|
||||
// Track if cancel was called
|
||||
cancelled := false
|
||||
go func() {
|
||||
<-devCtx.Done()
|
||||
cancelled = true
|
||||
}()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
// FetchDevices returns empty -> dev-removed should be stopped
|
||||
err := sched.reconcileDevices(ctx, &wg)
|
||||
require.NoError(t, err)
|
||||
|
||||
sched.mu.Lock()
|
||||
assert.Len(t, sched.activeDevices, 0)
|
||||
sched.mu.Unlock()
|
||||
|
||||
// Give the goroutine a moment to register the cancel
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
assert.True(t, cancelled)
|
||||
|
||||
cancel()
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestReconcileDevices_PreservesExistingDevices(t *testing.T) {
|
||||
devices := []store.Device{
|
||||
{ID: "dev-existing", TenantID: "t-1", IPAddress: "192.168.1.1", APISSLPort: 8729},
|
||||
{ID: "dev-new", TenantID: "t-1", IPAddress: "192.168.1.2", APISSLPort: 8729},
|
||||
}
|
||||
fetcher := &mockDeviceFetcher{devices: devices}
|
||||
sched := newTestScheduler(fetcher)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// Pre-populate dev-existing as if it was already running
|
||||
existingCtx, existingCancel := context.WithCancel(ctx)
|
||||
_ = existingCtx
|
||||
sched.activeDevices["dev-existing"] = &deviceState{cancel: existingCancel}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
err := sched.reconcileDevices(ctx, &wg)
|
||||
require.NoError(t, err)
|
||||
|
||||
sched.mu.Lock()
|
||||
assert.Len(t, sched.activeDevices, 2)
|
||||
// dev-existing should still have its ORIGINAL cancel function (not replaced)
|
||||
assert.Equal(t, fmt.Sprintf("%p", existingCancel), fmt.Sprintf("%p", sched.activeDevices["dev-existing"].cancel))
|
||||
_, hasNew := sched.activeDevices["dev-new"]
|
||||
assert.True(t, hasNew)
|
||||
sched.mu.Unlock()
|
||||
|
||||
cancel()
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestReconcileDevices_HandlesEmptyDatabase(t *testing.T) {
|
||||
fetcher := &mockDeviceFetcher{devices: []store.Device{}}
|
||||
sched := newTestScheduler(fetcher)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
err := sched.reconcileDevices(ctx, &wg)
|
||||
require.NoError(t, err)
|
||||
|
||||
sched.mu.Lock()
|
||||
assert.Len(t, sched.activeDevices, 0)
|
||||
sched.mu.Unlock()
|
||||
|
||||
cancel()
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestReconcileDevices_FetchError(t *testing.T) {
|
||||
fetcher := &mockDeviceFetcher{err: fmt.Errorf("connection refused")}
|
||||
sched := newTestScheduler(fetcher)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// Pre-populate a device
|
||||
devCancel := func() {}
|
||||
sched.activeDevices["dev-1"] = &deviceState{cancel: devCancel}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
err := sched.reconcileDevices(ctx, &wg)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "connection refused")
|
||||
|
||||
// Active devices should be unchanged (no side effects on error)
|
||||
sched.mu.Lock()
|
||||
assert.Len(t, sched.activeDevices, 1)
|
||||
sched.mu.Unlock()
|
||||
|
||||
cancel()
|
||||
wg.Wait()
|
||||
}
|
||||
409
poller/internal/poller/worker.go
Normal file
409
poller/internal/poller/worker.go
Normal file
@@ -0,0 +1,409 @@
|
||||
// Package poller implements the polling logic for individual devices.
|
||||
package poller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/bsm/redislock"
|
||||
"github.com/redis/go-redis/v9"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/bus"
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
"github.com/mikrotik-portal/poller/internal/observability"
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/vault"
|
||||
)
|
||||
|
||||
// ErrDeviceOffline is returned by PollDevice when a device cannot be reached.
// The scheduler uses this to drive the circuit breaker — consecutive offline
// events trigger exponential backoff without logging as a hard error.
// Callers should compare with errors.Is so wrapped variants are recognized.
var ErrDeviceOffline = errors.New("device offline")
|
||||
|
||||
// redisClientForFirmware is a module-level Redis client reference used
// for firmware check rate limiting. Set by the scheduler before starting polls.
//
// NOTE(review): package-level mutable state with no mutex. This appears to
// assume SetRedisClient is called exactly once before any poll goroutine
// starts — confirm callers obey that ordering.
var redisClientForFirmware *redis.Client

// SetRedisClient sets the Redis client used for firmware rate limiting.
// Not safe to call concurrently with active polls (see note above the var).
func SetRedisClient(c *redis.Client) {
	redisClientForFirmware = c
}
|
||||
|
||||
// withTimeout runs fn in a goroutine and returns its result, or a timeout error
|
||||
// if ctx expires first. This wraps RouterOS API calls that don't accept a context
|
||||
// parameter, enforcing per-command timeouts to prevent indefinite blocking.
|
||||
func withTimeout[T any](ctx context.Context, fn func() (T, error)) (T, error) {
|
||||
type result struct {
|
||||
val T
|
||||
err error
|
||||
}
|
||||
ch := make(chan result, 1)
|
||||
go func() {
|
||||
v, e := fn()
|
||||
ch <- result{v, e}
|
||||
}()
|
||||
select {
|
||||
case r := <-ch:
|
||||
return r.val, r.err
|
||||
case <-ctx.Done():
|
||||
var zero T
|
||||
return zero, fmt.Errorf("command timed out: %w", ctx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
// PollDevice performs a single poll cycle for one device:
//  1. Acquire distributed Redis lock to prevent duplicate polls across pods.
//  2. Decrypt device credentials.
//  3. Attempt TLS connection to the RouterOS binary API.
//  4. On failure: publish offline event, return ErrDeviceOffline.
//  5. On success: run /system/resource/print, publish online event with metadata.
//  6. Collect interface, health, and wireless metrics; publish as separate events.
//  7. Release lock and close connection via deferred calls.
//
// connTimeout bounds the initial connection attempt to the device.
//
// lockTTL should be longer than the expected poll duration to prevent the lock
// from expiring while the poll is still in progress.
//
// cmdTimeout is the per-command timeout for individual RouterOS API calls.
//
// Package-level dependency: redisClientForFirmware. When it is nil, the
// config-change detection, post-push rollback/alert, and firmware-check
// sections are skipped entirely.
func PollDevice(
	ctx context.Context,
	dev store.Device,
	locker *redislock.Client,
	pub *bus.Publisher,
	credentialCache *vault.CredentialCache,
	connTimeout time.Duration,
	cmdTimeout time.Duration,
	lockTTL time.Duration,
) error {
	startTime := time.Now()
	// pollStatus is captured by the deferred metrics closure below; error
	// paths overwrite it before returning.
	pollStatus := "success"

	lockKey := fmt.Sprintf("poll:device:%s", dev.ID)

	// Acquire per-device lock. If another pod already holds the lock, skip this cycle.
	lock, err := locker.Obtain(ctx, lockKey, lockTTL, nil)
	if err == redislock.ErrNotObtained {
		slog.Debug("skipping poll — lock held by another pod", "device_id", dev.ID)
		observability.PollTotal.WithLabelValues("skipped").Inc()
		observability.RedisLockTotal.WithLabelValues("not_obtained").Inc()
		return nil
	}
	if err != nil {
		observability.RedisLockTotal.WithLabelValues("error").Inc()
		return fmt.Errorf("obtaining Redis lock for device %s: %w", dev.ID, err)
	}
	observability.RedisLockTotal.WithLabelValues("obtained").Inc()

	// Registered first, so (LIFO defer order) the lock is released AFTER the
	// metrics defer below has recorded duration/status.
	defer func() {
		if releaseErr := lock.Release(ctx); releaseErr != nil && releaseErr != redislock.ErrLockNotHeld {
			slog.Warn("failed to release Redis lock", "device_id", dev.ID, "error", releaseErr)
		}
	}()

	// Deferred metric recording — captures poll duration and status at exit.
	defer func() {
		observability.PollDuration.Observe(time.Since(startTime).Seconds())
		observability.PollTotal.WithLabelValues(pollStatus).Inc()
	}()

	// Decrypt device credentials via credential cache (Transit preferred, legacy fallback).
	username, password, err := credentialCache.GetCredentials(
		dev.ID,
		dev.TenantID,
		dev.EncryptedCredentialsTransit,
		dev.EncryptedCredentials,
	)
	if err != nil {
		pollStatus = "error"
		return fmt.Errorf("decrypting credentials for device %s: %w", dev.ID, err)
	}

	// Prepare CA cert PEM for TLS verification (only populated for portal_ca devices).
	var caCertPEM []byte
	if dev.CACertPEM != nil {
		caCertPEM = []byte(*dev.CACertPEM)
	}

	// Attempt connection. On failure, publish offline event and return ErrDeviceOffline.
	client, err := device.ConnectDevice(dev.IPAddress, dev.APISSLPort, dev.APIPort, username, password, connTimeout, caCertPEM, dev.TLSMode)
	if err != nil {
		slog.Info("device offline", "device_id", dev.ID, "ip", dev.IPAddress, "error", err)
		observability.DeviceConnectionErrors.Inc()

		offlineEvent := bus.DeviceStatusEvent{
			DeviceID: dev.ID,
			TenantID: dev.TenantID,
			Status:   "offline",
			LastSeen: time.Now().UTC().Format(time.RFC3339),
		}
		if pubErr := pub.PublishStatus(ctx, offlineEvent); pubErr != nil {
			slog.Warn("failed to publish offline event", "device_id", dev.ID, "error", pubErr)
			observability.NATSPublishTotal.WithLabelValues("status", "error").Inc()
		} else {
			observability.NATSPublishTotal.WithLabelValues("status", "success").Inc()
		}

		// Check for recent config push — trigger rollback or alert if device
		// went offline shortly after a push (Redis key set by push_tracker).
		if redisClientForFirmware != nil {
			pushKey := fmt.Sprintf("push:recent:%s", dev.ID)
			pushData, pushErr := redisClientForFirmware.Get(ctx, pushKey).Result()
			// Best-effort: a Redis miss or error simply skips rollback handling.
			if pushErr == nil && pushData != "" {
				var pushInfo struct {
					DeviceID         string `json:"device_id"`
					TenantID         string `json:"tenant_id"`
					PushType         string `json:"push_type"`
					PushOperationID  string `json:"push_operation_id"`
					PrePushCommitSHA string `json:"pre_push_commit_sha"`
				}
				if unmarshalErr := json.Unmarshal([]byte(pushData), &pushInfo); unmarshalErr == nil {
					slog.Warn("device went offline after recent config push",
						"device_id", dev.ID,
						"push_type", pushInfo.PushType,
					)

					if pushInfo.PushType == "template" || pushInfo.PushType == "restore" {
						// Auto-rollback for template/restore pushes
						if rollbackErr := pub.PublishPushRollback(ctx, bus.PushRollbackEvent{
							DeviceID:         pushInfo.DeviceID,
							TenantID:         pushInfo.TenantID,
							PushOperationID:  pushInfo.PushOperationID,
							PrePushCommitSHA: pushInfo.PrePushCommitSHA,
						}); rollbackErr != nil {
							slog.Error("failed to publish push rollback event", "device_id", dev.ID, "error", rollbackErr)
						}
					} else {
						// Alert only for editor pushes (one-click rollback in UI)
						if alertErr := pub.PublishPushAlert(ctx, bus.PushAlertEvent{
							DeviceID: pushInfo.DeviceID,
							TenantID: pushInfo.TenantID,
							PushType: pushInfo.PushType,
						}); alertErr != nil {
							slog.Error("failed to publish push alert event", "device_id", dev.ID, "error", alertErr)
						}
					}
				}
			}
		}

		// NOTE(review): pollStatus is still "success" here, so an offline
		// device is counted as a successful poll in PollTotal — confirm this
		// is the intended semantics (poll succeeded; device is just down).
		return ErrDeviceOffline
	}
	defer device.CloseDevice(client)

	// Query device resources (version, uptime, CPU, memory) with per-command timeout.
	cmdCtx, cmdCancel := context.WithTimeout(ctx, cmdTimeout)
	info, err := withTimeout[device.DeviceInfo](cmdCtx, func() (device.DeviceInfo, error) {
		return device.DetectVersion(client)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to detect version", "device_id", dev.ID, "error", err)
		// Still publish an online event even if version detection fails.
		// In that case info holds zero values for its fields.
	}

	onlineEvent := bus.DeviceStatusEvent{
		DeviceID:        dev.ID,
		TenantID:        dev.TenantID,
		Status:          "online",
		RouterOSVersion: info.Version,
		MajorVersion:    info.MajorVersion,
		BoardName:       info.BoardName,
		Architecture:    info.Architecture,
		Uptime:          info.Uptime,
		CPULoad:         info.CPULoad,
		FreeMemory:      info.FreeMemory,
		TotalMemory:     info.TotalMemory,
		SerialNumber:    info.SerialNumber,
		FirmwareVersion: info.FirmwareVersion,
		LastSeen:        time.Now().UTC().Format(time.RFC3339),
	}

	if pubErr := pub.PublishStatus(ctx, onlineEvent); pubErr != nil {
		observability.NATSPublishTotal.WithLabelValues("status", "error").Inc()
		pollStatus = "error"
		return fmt.Errorf("publishing online event for device %s: %w", dev.ID, pubErr)
	}
	observability.NATSPublishTotal.WithLabelValues("status", "success").Inc()

	// =========================================================================
	// CONFIG CHANGE DETECTION
	// Compare last-config-change from /system/resource/print against the
	// previous value stored in Redis. If it changed (and we have a previous
	// value — skip first poll), publish a ConfigChangedEvent so the backend
	// can trigger an event-driven backup.
	// =========================================================================
	if info.LastConfigChange != "" && redisClientForFirmware != nil {
		redisKey := fmt.Sprintf("device:%s:last_config_change", dev.ID)
		prev, redisErr := redisClientForFirmware.Get(ctx, redisKey).Result()
		if redisErr != nil && redisErr != redis.Nil {
			slog.Warn("Redis GET last_config_change error", "device_id", dev.ID, "error", redisErr)
		}

		if prev != info.LastConfigChange {
			if prev != "" { // Skip first poll — no previous value to compare
				slog.Info("config change detected on device",
					"device_id", dev.ID,
					"old_timestamp", prev,
					"new_timestamp", info.LastConfigChange,
				)
				if pubErr := pub.PublishConfigChanged(ctx, bus.ConfigChangedEvent{
					DeviceID:     dev.ID,
					TenantID:     dev.TenantID,
					OldTimestamp: prev,
					NewTimestamp: info.LastConfigChange,
				}); pubErr != nil {
					slog.Warn("failed to publish config.changed", "device_id", dev.ID, "error", pubErr)
					observability.NATSPublishTotal.WithLabelValues("config_changed", "error").Inc()
				} else {
					observability.NATSPublishTotal.WithLabelValues("config_changed", "success").Inc()
				}
			}
			// Update Redis with current value (24h TTL). Fire-and-forget:
			// the result is intentionally ignored.
			redisClientForFirmware.Set(ctx, redisKey, info.LastConfigChange, 24*time.Hour)
		}
	}

	slog.Info("device polled successfully",
		"device_id", dev.ID,
		"ip", dev.IPAddress,
		"status", "online",
		"version", info.Version,
	)

	// =========================================================================
	// METRICS COLLECTION
	// Errors are non-fatal — a metric collection failure should not fail the
	// poll cycle. Publish failures are also non-fatal for the same reason.
	// Each collection call is wrapped with a per-command timeout.
	// =========================================================================
	collectedAt := time.Now().UTC().Format(time.RFC3339)

	// Interface traffic counters.
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	interfaces, err := withTimeout[[]device.InterfaceStats](cmdCtx, func() ([]device.InterfaceStats, error) {
		return device.CollectInterfaces(client)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect interface metrics", "device_id", dev.ID, "error", err)
	}
	// Published even when collection failed (interfaces will be empty/nil).
	if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
		DeviceID:    dev.ID,
		TenantID:    dev.TenantID,
		CollectedAt: collectedAt,
		Type:        "interfaces",
		Interfaces:  interfaces,
	}); pubErr != nil {
		slog.Warn("failed to publish interface metrics", "device_id", dev.ID, "error", pubErr)
		observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
	} else {
		observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
	}

	// System health (CPU, memory, disk, temperature).
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	health, err := withTimeout[device.HealthMetrics](cmdCtx, func() (device.HealthMetrics, error) {
		return device.CollectHealth(client, info)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect health metrics", "device_id", dev.ID, "error", err)
	}
	if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
		DeviceID:    dev.ID,
		TenantID:    dev.TenantID,
		CollectedAt: collectedAt,
		Type:        "health",
		Health:      &health,
	}); pubErr != nil {
		slog.Warn("failed to publish health metrics", "device_id", dev.ID, "error", pubErr)
		observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
	} else {
		observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
	}

	// Wireless client stats (only publish if the device has wireless interfaces).
	cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
	wireless, err := withTimeout[[]device.WirelessStats](cmdCtx, func() ([]device.WirelessStats, error) {
		return device.CollectWireless(client, info.MajorVersion)
	})
	cmdCancel()
	if err != nil {
		slog.Warn("failed to collect wireless metrics", "device_id", dev.ID, "error", err)
	}
	if len(wireless) > 0 {
		if pubErr := pub.PublishMetrics(ctx, bus.DeviceMetricsEvent{
			DeviceID:    dev.ID,
			TenantID:    dev.TenantID,
			CollectedAt: collectedAt,
			Type:        "wireless",
			Wireless:    wireless,
		}); pubErr != nil {
			slog.Warn("failed to publish wireless metrics", "device_id", dev.ID, "error", pubErr)
			observability.NATSPublishTotal.WithLabelValues("metrics", "error").Inc()
		} else {
			observability.NATSPublishTotal.WithLabelValues("metrics", "success").Inc()
		}
	}

	// =========================================================================
	// FIRMWARE CHECK (rate-limited to once per day per device)
	// Checks if a firmware update is available and publishes the result.
	// Uses a Redis key with 24h TTL to ensure we don't hammer devices every 60s.
	// =========================================================================
	if redisClientForFirmware != nil {
		fwCacheKey := fmt.Sprintf("firmware:checked:%s", dev.ID)
		// Existence check error is deliberately ignored: on Redis failure
		// exists is 0 and we simply attempt the check.
		exists, _ := redisClientForFirmware.Exists(ctx, fwCacheKey).Result()
		if exists == 0 {
			cmdCtx, cmdCancel = context.WithTimeout(ctx, cmdTimeout)
			fwInfo, fwErr := withTimeout[device.FirmwareInfo](cmdCtx, func() (device.FirmwareInfo, error) {
				return device.CheckFirmwareUpdate(client)
			})
			cmdCancel()
			if fwErr != nil {
				slog.Warn("firmware check failed", "device_id", dev.ID, "error", fwErr)
				// Set cooldown on failure too, but shorter (6h) so we retry sooner than success (24h).
				// Prevents hammering devices that can't reach MikroTik update servers every poll cycle.
				fwFailKey := fmt.Sprintf("firmware:check-failed:%s", dev.ID)
				redisClientForFirmware.Set(ctx, fwFailKey, "1", 6*time.Hour)
				// Also set the main checked key to prevent the success path from re-checking.
				redisClientForFirmware.Set(ctx, fwCacheKey, "1", 6*time.Hour)
			} else {
				fwEvent := bus.DeviceFirmwareEvent{
					DeviceID:         dev.ID,
					TenantID:         dev.TenantID,
					InstalledVersion: fwInfo.InstalledVersion,
					LatestVersion:    fwInfo.LatestVersion,
					Channel:          fwInfo.Channel,
					Status:           fwInfo.Status,
					Architecture:     fwInfo.Architecture,
				}
				if pubErr := pub.PublishFirmware(ctx, fwEvent); pubErr != nil {
					slog.Warn("failed to publish firmware event", "device_id", dev.ID, "error", pubErr)
					observability.NATSPublishTotal.WithLabelValues("firmware", "error").Inc()
				} else {
					observability.NATSPublishTotal.WithLabelValues("firmware", "success").Inc()
					// Set Redis key with 24h TTL — firmware checked for today.
					// If the check succeeded but status is "check-failed",
					// use shorter cooldown since the device couldn't reach update servers.
					if fwInfo.Status == "check-failed" {
						redisClientForFirmware.Set(ctx, fwCacheKey, "1", 6*time.Hour)
					} else {
						redisClientForFirmware.Set(ctx, fwCacheKey, "1", 24*time.Hour)
					}
					slog.Info("firmware check published",
						"device_id", dev.ID,
						"installed", fwInfo.InstalledVersion,
						"latest", fwInfo.LatestVersion,
						"channel", fwInfo.Channel,
					)
				}
			}
		}
	}

	return nil
}
|
||||
161
poller/internal/store/devices.go
Normal file
161
poller/internal/store/devices.go
Normal file
@@ -0,0 +1,161 @@
|
||||
// Package store provides database access for the poller service.
|
||||
package store
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
// Device represents a device row fetched from the devices table.
// The poller reads ALL devices across all tenants (no RLS applied to poller_user).
type Device struct {
	ID        string // UUID rendered as text (id::text in queries)
	TenantID  string // UUID rendered as text
	IPAddress string
	APIPort   int // plaintext RouterOS API port
	APISSLPort int // TLS RouterOS API port
	EncryptedCredentials        []byte  // legacy AES-256-GCM BYTEA
	EncryptedCredentialsTransit *string // OpenBao Transit ciphertext (TEXT, nullable)
	RouterOSVersion             *string // nullable: unknown until first successful poll
	MajorVersion                *int    // nullable: parsed RouterOS major version
	TLSMode                     string  // "insecure" or "portal_ca"
	CACertPEM                   *string // PEM-encoded CA cert (only populated when TLSMode = "portal_ca")
}
|
||||
|
||||
// DeviceStore manages PostgreSQL connections for device data access.
// It wraps a pgx connection pool; construct with NewDeviceStore and release
// resources with Close.
type DeviceStore struct {
	pool *pgxpool.Pool // shared connection pool; see Pool() for external access
}
|
||||
|
||||
// NewDeviceStore creates a pgx connection pool and returns a DeviceStore.
|
||||
//
|
||||
// The databaseURL should use the poller_user role which has SELECT-only access
|
||||
// to the devices table and is not subject to RLS policies.
|
||||
func NewDeviceStore(ctx context.Context, databaseURL string) (*DeviceStore, error) {
|
||||
pool, err := pgxpool.New(ctx, databaseURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating pgx pool: %w", err)
|
||||
}
|
||||
|
||||
// Verify connectivity immediately.
|
||||
if err := pool.Ping(ctx); err != nil {
|
||||
pool.Close()
|
||||
return nil, fmt.Errorf("pinging database: %w", err)
|
||||
}
|
||||
|
||||
return &DeviceStore{pool: pool}, nil
|
||||
}
|
||||
|
||||
// FetchDevices returns all devices from the database.
|
||||
//
|
||||
// The query reads across all tenants intentionally — the poller_user role has
|
||||
// SELECT-only access without RLS so it can poll all devices.
|
||||
func (s *DeviceStore) FetchDevices(ctx context.Context) ([]Device, error) {
|
||||
const query = `
|
||||
SELECT
|
||||
d.id::text,
|
||||
d.tenant_id::text,
|
||||
d.ip_address,
|
||||
d.api_port,
|
||||
d.api_ssl_port,
|
||||
d.encrypted_credentials,
|
||||
d.encrypted_credentials_transit,
|
||||
d.routeros_version,
|
||||
d.routeros_major_version,
|
||||
d.tls_mode,
|
||||
ca.cert_pem
|
||||
FROM devices d
|
||||
LEFT JOIN certificate_authorities ca
|
||||
ON d.tenant_id = ca.tenant_id
|
||||
AND d.tls_mode = 'portal_ca'
|
||||
WHERE d.encrypted_credentials IS NOT NULL
|
||||
OR d.encrypted_credentials_transit IS NOT NULL
|
||||
`
|
||||
|
||||
rows, err := s.pool.Query(ctx, query)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("querying devices: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var devices []Device
|
||||
for rows.Next() {
|
||||
var d Device
|
||||
if err := rows.Scan(
|
||||
&d.ID,
|
||||
&d.TenantID,
|
||||
&d.IPAddress,
|
||||
&d.APIPort,
|
||||
&d.APISSLPort,
|
||||
&d.EncryptedCredentials,
|
||||
&d.EncryptedCredentialsTransit,
|
||||
&d.RouterOSVersion,
|
||||
&d.MajorVersion,
|
||||
&d.TLSMode,
|
||||
&d.CACertPEM,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scanning device row: %w", err)
|
||||
}
|
||||
devices = append(devices, d)
|
||||
}
|
||||
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, fmt.Errorf("iterating device rows: %w", err)
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// GetDevice returns a single device by ID for interactive command execution.
|
||||
func (s *DeviceStore) GetDevice(ctx context.Context, deviceID string) (Device, error) {
|
||||
const query = `
|
||||
SELECT
|
||||
d.id::text,
|
||||
d.tenant_id::text,
|
||||
d.ip_address,
|
||||
d.api_port,
|
||||
d.api_ssl_port,
|
||||
d.encrypted_credentials,
|
||||
d.encrypted_credentials_transit,
|
||||
d.routeros_version,
|
||||
d.routeros_major_version,
|
||||
d.tls_mode,
|
||||
ca.cert_pem
|
||||
FROM devices d
|
||||
LEFT JOIN certificate_authorities ca
|
||||
ON d.tenant_id = ca.tenant_id
|
||||
AND d.tls_mode = 'portal_ca'
|
||||
WHERE d.id = $1
|
||||
`
|
||||
var d Device
|
||||
err := s.pool.QueryRow(ctx, query, deviceID).Scan(
|
||||
&d.ID,
|
||||
&d.TenantID,
|
||||
&d.IPAddress,
|
||||
&d.APIPort,
|
||||
&d.APISSLPort,
|
||||
&d.EncryptedCredentials,
|
||||
&d.EncryptedCredentialsTransit,
|
||||
&d.RouterOSVersion,
|
||||
&d.MajorVersion,
|
||||
&d.TLSMode,
|
||||
&d.CACertPEM,
|
||||
)
|
||||
if err != nil {
|
||||
return Device{}, fmt.Errorf("querying device %s: %w", deviceID, err)
|
||||
}
|
||||
return d, nil
|
||||
}
|
||||
|
||||
// Pool returns the underlying pgxpool.Pool for shared use by other subsystems
// (e.g., credential cache key_access_log inserts).
//
// Callers must not Close the returned pool; DeviceStore.Close owns its lifecycle.
func (s *DeviceStore) Pool() *pgxpool.Pool {
	return s.pool
}
|
||||
|
||||
// Close closes the pgx connection pool. Call once at shutdown; the store is
// unusable afterwards.
func (s *DeviceStore) Close() {
	s.pool.Close()
}
|
||||
150
poller/internal/store/devices_integration_test.go
Normal file
150
poller/internal/store/devices_integration_test.go
Normal file
@@ -0,0 +1,150 @@
|
||||
package store_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
"github.com/mikrotik-portal/poller/internal/testutil"
|
||||
)
|
||||
|
||||
// TestDeviceStore_FetchDevices_Integration verifies against a real PostgreSQL
// container that FetchDevices returns exactly the devices that have
// encrypted_credentials set, excludes those without, and round-trips all
// scanned fields (ports, credentials, nullable version columns).
func TestDeviceStore_FetchDevices_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}

	connStr, cleanup := testutil.SetupPostgres(t)
	defer cleanup()

	ctx := context.Background()
	tenantID := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
	dummyCreds := []byte("dummy-encrypted-credentials")
	v7 := "7.16"
	major7 := 7

	// Insert 3 devices WITH encrypted_credentials (should be returned).
	id1 := testutil.InsertTestDevice(t, connStr, store.Device{
		TenantID:             tenantID,
		IPAddress:            "192.168.1.1",
		APIPort:              8728,
		APISSLPort:           8729,
		EncryptedCredentials: dummyCreds,
		RouterOSVersion:      &v7,
		MajorVersion:         &major7,
	})
	id2 := testutil.InsertTestDevice(t, connStr, store.Device{
		TenantID:             tenantID,
		IPAddress:            "192.168.1.2",
		APIPort:              8728,
		APISSLPort:           8729,
		EncryptedCredentials: dummyCreds,
	})
	id3 := testutil.InsertTestDevice(t, connStr, store.Device{
		TenantID:             tenantID,
		IPAddress:            "192.168.1.3",
		APIPort:              8728,
		APISSLPort:           8729,
		EncryptedCredentials: dummyCreds,
	})

	// Insert 1 device WITHOUT encrypted_credentials (should be excluded).
	_ = testutil.InsertTestDevice(t, connStr, store.Device{
		TenantID:  tenantID,
		IPAddress: "192.168.1.99",
		APIPort:   8728,
		// EncryptedCredentials is nil -> excluded by FetchDevices WHERE clause
	})

	ds, err := store.NewDeviceStore(ctx, connStr)
	require.NoError(t, err)
	defer ds.Close()

	devices, err := ds.FetchDevices(ctx)
	require.NoError(t, err)
	assert.Len(t, devices, 3, "should return only devices with encrypted_credentials")

	// Collect returned IDs for verification.
	returnedIDs := make(map[string]bool)
	for _, d := range devices {
		returnedIDs[d.ID] = true
	}
	assert.True(t, returnedIDs[id1], "device 1 should be returned")
	assert.True(t, returnedIDs[id2], "device 2 should be returned")
	assert.True(t, returnedIDs[id3], "device 3 should be returned")

	// Verify fields on the device with version info.
	for _, d := range devices {
		if d.ID == id1 {
			assert.Equal(t, tenantID, d.TenantID)
			assert.Equal(t, "192.168.1.1", d.IPAddress)
			assert.Equal(t, 8728, d.APIPort)
			assert.Equal(t, 8729, d.APISSLPort)
			assert.Equal(t, dummyCreds, d.EncryptedCredentials)
			require.NotNil(t, d.RouterOSVersion)
			assert.Equal(t, "7.16", *d.RouterOSVersion)
			require.NotNil(t, d.MajorVersion)
			assert.Equal(t, 7, *d.MajorVersion)
		}
	}
}
|
||||
|
||||
// TestDeviceStore_GetDevice_Integration exercises GetDevice against a real
// PostgreSQL container: the happy path returns the inserted row with its
// fields intact, and a lookup of a nonexistent UUID yields an error.
func TestDeviceStore_GetDevice_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}

	connStr, cleanup := testutil.SetupPostgres(t)
	defer cleanup()

	ctx := context.Background()
	tenantID := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
	dummyCreds := []byte("dummy-encrypted-credentials")

	id := testutil.InsertTestDevice(t, connStr, store.Device{
		TenantID:             tenantID,
		IPAddress:            "10.0.0.1",
		APIPort:              8728,
		APISSLPort:           8729,
		EncryptedCredentials: dummyCreds,
	})

	ds, err := store.NewDeviceStore(ctx, connStr)
	require.NoError(t, err)
	defer ds.Close()

	// Happy path: existing device.
	d, err := ds.GetDevice(ctx, id)
	require.NoError(t, err)
	assert.Equal(t, id, d.ID)
	assert.Equal(t, tenantID, d.TenantID)
	assert.Equal(t, "10.0.0.1", d.IPAddress)
	assert.Equal(t, dummyCreds, d.EncryptedCredentials)

	// Sad path: nonexistent device.
	_, err = ds.GetDevice(ctx, "00000000-0000-0000-0000-000000000000")
	assert.Error(t, err)
}
|
||||
|
||||
// TestDeviceStore_FetchDevices_Empty_Integration confirms FetchDevices succeeds
// (no error) and returns an empty result when the devices table has no rows.
func TestDeviceStore_FetchDevices_Empty_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}

	connStr, cleanup := testutil.SetupPostgres(t)
	defer cleanup()

	ctx := context.Background()

	ds, err := store.NewDeviceStore(ctx, connStr)
	require.NoError(t, err)
	defer ds.Close()

	devices, err := ds.FetchDevices(ctx)
	require.NoError(t, err)
	// FetchDevices returns nil slice when no rows exist (append on nil);
	// this is acceptable Go behavior. The important thing is no error.
	assert.Empty(t, devices, "should return empty result for empty database")
}
|
||||
241
poller/internal/testutil/containers.go
Normal file
241
poller/internal/testutil/containers.go
Normal file
@@ -0,0 +1,241 @@
|
||||
// Package testutil provides shared testcontainer helpers for integration tests.
|
||||
//
|
||||
// All helpers start real infrastructure containers (PostgreSQL, Redis, NATS) via
|
||||
// testcontainers-go and return connection strings plus cleanup functions. Tests
|
||||
// using these helpers require a running Docker daemon and are skipped automatically
|
||||
// when `go test -short` is used.
|
||||
package testutil
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/testcontainers/testcontainers-go"
|
||||
tcnats "github.com/testcontainers/testcontainers-go/modules/nats"
|
||||
"github.com/testcontainers/testcontainers-go/modules/postgres"
|
||||
"github.com/testcontainers/testcontainers-go/modules/redis"
|
||||
"github.com/testcontainers/testcontainers-go/wait"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/store"
|
||||
)
|
||||
|
||||
// devicesSchema is the minimal DDL needed for integration tests against the
|
||||
// devices table. It mirrors the production schema but omits RLS policies and
|
||||
// other tables the poller doesn't read.
|
||||
const devicesSchema = `
|
||||
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
|
||||
CREATE TABLE IF NOT EXISTS devices (
|
||||
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
|
||||
tenant_id UUID NOT NULL,
|
||||
hostname VARCHAR(255) NOT NULL,
|
||||
ip_address VARCHAR(45) NOT NULL,
|
||||
api_port INTEGER NOT NULL DEFAULT 8728,
|
||||
api_ssl_port INTEGER NOT NULL DEFAULT 8729,
|
||||
model VARCHAR(255),
|
||||
serial_number VARCHAR(255),
|
||||
firmware_version VARCHAR(100),
|
||||
routeros_version VARCHAR(100),
|
||||
routeros_major_version INTEGER,
|
||||
uptime_seconds INTEGER,
|
||||
last_seen TIMESTAMPTZ,
|
||||
encrypted_credentials BYTEA,
|
||||
status VARCHAR(20) NOT NULL DEFAULT 'unknown',
|
||||
created_at TIMESTAMPTZ DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ DEFAULT now()
|
||||
);
|
||||
`
|
||||
|
||||
// SetupPostgres starts a PostgreSQL container (postgres:17-alpine image) and
// applies the devices table schema. Returns the connection string and a cleanup
// function that terminates the container.
func SetupPostgres(t *testing.T) (connStr string, cleanup func()) {
	t.Helper()
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}

	ctx := context.Background()

	pgContainer, err := postgres.Run(ctx,
		"postgres:17-alpine",
		postgres.WithDatabase("mikrotik_test"),
		postgres.WithUsername("postgres"),
		postgres.WithPassword("test"),
		testcontainers.WithWaitStrategy(
			// Postgres logs "ready to accept connections" twice (once during
			// init restart), hence WithOccurrence(2).
			wait.ForLog("database system is ready to accept connections").
				WithOccurrence(2).
				WithStartupTimeout(60*time.Second),
		),
	)
	if err != nil {
		t.Fatalf("starting PostgreSQL container: %v", err)
	}

	connStr, err = pgContainer.ConnectionString(ctx, "sslmode=disable")
	if err != nil {
		_ = pgContainer.Terminate(ctx)
		t.Fatalf("getting PostgreSQL connection string: %v", err)
	}

	// Apply schema using pgx directly.
	conn, err := pgx.Connect(ctx, connStr)
	if err != nil {
		_ = pgContainer.Terminate(ctx)
		t.Fatalf("connecting to PostgreSQL to apply schema: %v", err)
	}
	defer conn.Close(ctx)

	if _, err := conn.Exec(ctx, devicesSchema); err != nil {
		_ = pgContainer.Terminate(ctx)
		t.Fatalf("applying devices schema: %v", err)
	}

	cleanup = func() {
		if err := pgContainer.Terminate(ctx); err != nil {
			t.Logf("warning: terminating PostgreSQL container: %v", err)
		}
	}

	return connStr, cleanup
}
|
||||
|
||||
// SetupRedis starts a Redis container and returns the address (host:port) plus
|
||||
// a cleanup function.
|
||||
func SetupRedis(t *testing.T) (addr string, cleanup func()) {
|
||||
t.Helper()
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
redisContainer, err := redis.Run(ctx,
|
||||
"redis:7-alpine",
|
||||
testcontainers.WithWaitStrategy(
|
||||
wait.ForLog("Ready to accept connections").
|
||||
WithStartupTimeout(30*time.Second),
|
||||
),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("starting Redis container: %v", err)
|
||||
}
|
||||
|
||||
host, err := redisContainer.Host(ctx)
|
||||
if err != nil {
|
||||
_ = redisContainer.Terminate(ctx)
|
||||
t.Fatalf("getting Redis host: %v", err)
|
||||
}
|
||||
|
||||
port, err := redisContainer.MappedPort(ctx, "6379")
|
||||
if err != nil {
|
||||
_ = redisContainer.Terminate(ctx)
|
||||
t.Fatalf("getting Redis mapped port: %v", err)
|
||||
}
|
||||
|
||||
addr = fmt.Sprintf("%s:%s", host, port.Port())
|
||||
|
||||
cleanup = func() {
|
||||
if err := redisContainer.Terminate(ctx); err != nil {
|
||||
t.Logf("warning: terminating Redis container: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return addr, cleanup
|
||||
}
|
||||
|
||||
// SetupNATS starts a NATS container with JetStream enabled and returns the NATS
|
||||
// URL (nats://host:port) plus a cleanup function.
|
||||
func SetupNATS(t *testing.T) (url string, cleanup func()) {
|
||||
t.Helper()
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test in short mode")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
natsContainer, err := tcnats.Run(ctx,
|
||||
"nats:2-alpine",
|
||||
testcontainers.WithCmd("--jetstream"),
|
||||
testcontainers.WithWaitStrategy(
|
||||
wait.ForLog("Server is ready").
|
||||
WithStartupTimeout(30*time.Second),
|
||||
),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("starting NATS container: %v", err)
|
||||
}
|
||||
|
||||
host, err := natsContainer.Host(ctx)
|
||||
if err != nil {
|
||||
_ = natsContainer.Terminate(ctx)
|
||||
t.Fatalf("getting NATS host: %v", err)
|
||||
}
|
||||
|
||||
port, err := natsContainer.MappedPort(ctx, "4222")
|
||||
if err != nil {
|
||||
_ = natsContainer.Terminate(ctx)
|
||||
t.Fatalf("getting NATS mapped port: %v", err)
|
||||
}
|
||||
|
||||
url = fmt.Sprintf("nats://%s:%s", host, port.Port())
|
||||
|
||||
cleanup = func() {
|
||||
if err := natsContainer.Terminate(ctx); err != nil {
|
||||
t.Logf("warning: terminating NATS container: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
return url, cleanup
|
||||
}
|
||||
|
||||
// InsertTestDevice inserts a device row into the database and returns the
|
||||
// generated UUID. The caller provides a store.Device with fields to populate;
|
||||
// fields left at zero values use column defaults.
|
||||
func InsertTestDevice(t *testing.T, connStr string, dev store.Device) string {
|
||||
t.Helper()
|
||||
|
||||
ctx := context.Background()
|
||||
conn, err := pgx.Connect(ctx, connStr)
|
||||
if err != nil {
|
||||
t.Fatalf("connecting to PostgreSQL for InsertTestDevice: %v", err)
|
||||
}
|
||||
defer conn.Close(ctx)
|
||||
|
||||
var id string
|
||||
err = conn.QueryRow(ctx,
|
||||
`INSERT INTO devices (tenant_id, hostname, ip_address, api_port, api_ssl_port,
|
||||
encrypted_credentials, routeros_version, routeros_major_version)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
RETURNING id::text`,
|
||||
dev.TenantID,
|
||||
coalesce(dev.IPAddress, "test-device"), // hostname defaults to ip if not set
|
||||
dev.IPAddress,
|
||||
coalesceInt(dev.APIPort, 8728),
|
||||
coalesceInt(dev.APISSLPort, 8729),
|
||||
dev.EncryptedCredentials,
|
||||
dev.RouterOSVersion,
|
||||
dev.MajorVersion,
|
||||
).Scan(&id)
|
||||
if err != nil {
|
||||
t.Fatalf("inserting test device: %v", err)
|
||||
}
|
||||
|
||||
return id
|
||||
}
|
||||
|
||||
// coalesce returns s, or fallback when s is the empty string.
func coalesce(s, fallback string) string {
	if s != "" {
		return s
	}
	return fallback
}
|
||||
|
||||
// coalesceInt returns v, or fallback when v is zero.
func coalesceInt(v, fallback int) int {
	if v != 0 {
		return v
	}
	return fallback
}
|
||||
173
poller/internal/vault/cache.go
Normal file
173
poller/internal/vault/cache.go
Normal file
@@ -0,0 +1,173 @@
|
||||
package vault
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/hashicorp/golang-lru/v2/expirable"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
|
||||
"github.com/mikrotik-portal/poller/internal/device"
|
||||
)
|
||||
|
||||
// CachedCreds holds decrypted device credentials.
//
// Values are plaintext in memory; entries are evicted by the bounded
// TTL LRU in CredentialCache, so they do not live indefinitely.
type CachedCreds struct {
	Username string // RouterOS API username (plaintext)
	Password string // RouterOS API password (plaintext)
}
|
||||
|
||||
// Prometheus metrics for credential cache and OpenBao Transit observability.
// Registered once at package init via promauto (default registry).
var (
	// CacheHits counts lookups served from the in-process LRU (no OpenBao call).
	CacheHits = promauto.NewCounter(prometheus.CounterOpts{
		Name: "poller_credential_cache_hits_total",
		Help: "Number of credential cache hits (no OpenBao call)",
	})
	// CacheMisses counts lookups that required a decrypt (Transit or legacy).
	CacheMisses = promauto.NewCounter(prometheus.CounterOpts{
		Name: "poller_credential_cache_misses_total",
		Help: "Number of credential cache misses (OpenBao decrypt call)",
	})
	// OpenBaoLatency observes wall-clock duration of Transit decrypt calls.
	// Buckets are tuned for a low-millisecond local OpenBao deployment.
	OpenBaoLatency = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "poller_openbao_decrypt_duration_seconds",
		Help:    "Latency of OpenBao Transit decrypt calls",
		Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0},
	})
	// LegacyDecrypts counts decrypts that used the static AES key; a non-zero
	// rate means some credentials have not been migrated to Transit yet.
	LegacyDecrypts = promauto.NewCounter(prometheus.CounterOpts{
		Name: "poller_credential_legacy_decrypts_total",
		Help: "Number of credentials decrypted using legacy AES key (not yet migrated)",
	})
)
|
||||
|
||||
// CredentialCache provides cached credential decryption with dual-read support.
// It uses an LRU cache with TTL to avoid redundant OpenBao calls and falls back
// to legacy AES-256-GCM decryption for credentials not yet migrated to Transit.
type CredentialCache struct {
	cache   *expirable.LRU[string, *CachedCreds] // bounded TTL cache keyed by device ID
	transit *TransitClient                       // OpenBao Transit client (nil if OpenBao not configured)
	legacy  []byte                               // legacy AES-256-GCM key (nil if not available)
	db      *pgxpool.Pool                        // for key_access_log inserts (nil if not available)
}
|
||||
|
||||
// NewCredentialCache creates a bounded LRU cache with the given size and TTL.
|
||||
// transit may be nil if OpenBao is not configured. legacyKey may be nil if not available.
|
||||
// db may be nil if key access logging is not needed.
|
||||
func NewCredentialCache(size int, ttl time.Duration, transit *TransitClient, legacyKey []byte, db *pgxpool.Pool) *CredentialCache {
|
||||
cache := expirable.NewLRU[string, *CachedCreds](size, nil, ttl)
|
||||
return &CredentialCache{
|
||||
cache: cache,
|
||||
transit: transit,
|
||||
legacy: legacyKey,
|
||||
db: db,
|
||||
}
|
||||
}
|
||||
|
||||
// GetCredentials returns decrypted credentials for a device, using the cache.
|
||||
// transitCiphertext is the Transit-encrypted string (nullable), legacyCiphertext is the legacy BYTEA (nullable).
|
||||
// Returns (username, password, error).
|
||||
func (c *CredentialCache) GetCredentials(
|
||||
deviceID, tenantID string,
|
||||
transitCiphertext *string,
|
||||
legacyCiphertext []byte,
|
||||
) (string, string, error) {
|
||||
// Check cache first
|
||||
if cached, ok := c.cache.Get(deviceID); ok {
|
||||
CacheHits.Inc()
|
||||
return cached.Username, cached.Password, nil
|
||||
}
|
||||
CacheMisses.Inc()
|
||||
|
||||
var username, password string
|
||||
|
||||
// Prefer Transit ciphertext if available
|
||||
if transitCiphertext != nil && *transitCiphertext != "" && strings.HasPrefix(*transitCiphertext, "vault:v") {
|
||||
if c.transit == nil {
|
||||
return "", "", fmt.Errorf("transit ciphertext present but OpenBao client not configured")
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
plaintext, err := c.transit.Decrypt(tenantID, *transitCiphertext)
|
||||
OpenBaoLatency.Observe(time.Since(start).Seconds())
|
||||
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("transit decrypt for device %s: %w", deviceID, err)
|
||||
}
|
||||
|
||||
var creds struct {
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
}
|
||||
if err := json.Unmarshal(plaintext, &creds); err != nil {
|
||||
return "", "", fmt.Errorf("unmarshal transit-decrypted credentials: %w", err)
|
||||
}
|
||||
username = creds.Username
|
||||
password = creds.Password
|
||||
|
||||
// Fire-and-forget key access log INSERT for audit trail
|
||||
if c.db != nil {
|
||||
go c.logKeyAccess(deviceID, tenantID, "decrypt_credentials", "poller_poll")
|
||||
}
|
||||
|
||||
} else if legacyCiphertext != nil && len(legacyCiphertext) > 0 {
|
||||
// Fall back to legacy AES-256-GCM decryption
|
||||
if c.legacy == nil {
|
||||
return "", "", fmt.Errorf("legacy ciphertext present but encryption key not configured")
|
||||
}
|
||||
|
||||
var err error
|
||||
username, password, err = device.DecryptCredentials(legacyCiphertext, c.legacy)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("legacy decrypt for device %s: %w", deviceID, err)
|
||||
}
|
||||
LegacyDecrypts.Inc()
|
||||
|
||||
} else {
|
||||
return "", "", fmt.Errorf("no credentials available for device %s", deviceID)
|
||||
}
|
||||
|
||||
// Cache the result
|
||||
c.cache.Add(deviceID, &CachedCreds{Username: username, Password: password})
|
||||
|
||||
slog.Debug("credential decrypted and cached",
|
||||
"device_id", deviceID,
|
||||
"source", func() string {
|
||||
if transitCiphertext != nil && *transitCiphertext != "" {
|
||||
return "transit"
|
||||
}
|
||||
return "legacy"
|
||||
}(),
|
||||
)
|
||||
|
||||
return username, password, nil
|
||||
}
|
||||
|
||||
// Invalidate removes a device's cached credentials (e.g., after credential rotation).
// Safe to call for a device that is not currently cached (Remove is a no-op then).
func (c *CredentialCache) Invalidate(deviceID string) {
	c.cache.Remove(deviceID)
}
|
||||
|
||||
// Len returns the number of cached entries (primarily useful for tests and metrics).
func (c *CredentialCache) Len() int {
	return c.cache.Len()
}
|
||||
|
||||
// logKeyAccess inserts an immutable audit record for a credential decryption event.
|
||||
// Called as a fire-and-forget goroutine to avoid slowing down the poll cycle.
|
||||
func (c *CredentialCache) logKeyAccess(deviceID, tenantID, action, justification string) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
correlationID := uuid.New().String()
|
||||
_, err := c.db.Exec(ctx,
|
||||
`INSERT INTO key_access_log (tenant_id, device_id, action, resource_type, justification, correlation_id)
|
||||
VALUES ($1::uuid, $2::uuid, $3, 'device_credentials', $4, $5)`,
|
||||
tenantID, deviceID, action, justification, correlationID,
|
||||
)
|
||||
if err != nil {
|
||||
slog.Warn("failed to log key access", "error", err, "device_id", deviceID)
|
||||
}
|
||||
}
|
||||
127
poller/internal/vault/transit.go
Normal file
127
poller/internal/vault/transit.go
Normal file
@@ -0,0 +1,127 @@
|
||||
// Package vault provides OpenBao Transit integration for credential encryption/decryption.
|
||||
//
|
||||
// The TransitClient communicates with the OpenBao Transit secrets engine via HTTP,
|
||||
// enabling per-tenant encryption keys managed by OpenBao rather than a static
|
||||
// application-level AES key.
|
||||
package vault
|
||||
|
||||
import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"
)
|
||||
|
||||
// TransitClient communicates with OpenBao Transit secrets engine via HTTP.
//
// The zero value is not usable; construct with NewTransitClient.
type TransitClient struct {
	httpClient *http.Client // HTTP client with a hard per-request timeout
	addr       string       // OpenBao base URL; request paths are appended as "<addr>/v1/..."
	token      string       // auth token sent in the X-Vault-Token header
}
|
||||
|
||||
// NewTransitClient creates a Transit client with sensible defaults.
|
||||
func NewTransitClient(addr, token string) *TransitClient {
|
||||
return &TransitClient{
|
||||
httpClient: &http.Client{Timeout: 5 * time.Second},
|
||||
addr: addr,
|
||||
token: token,
|
||||
}
|
||||
}
|
||||
|
||||
// transitDecryptResponse is the JSON response from Transit decrypt endpoint.
type transitDecryptResponse struct {
	Data struct {
		// Plaintext is the base64-encoded decrypted payload.
		Plaintext string `json:"plaintext"`
	} `json:"data"`
	// Errors carries OpenBao error strings on failure responses.
	Errors []string `json:"errors,omitempty"`
}
|
||||
|
||||
// Decrypt decrypts a Transit ciphertext (vault:v1:...) and returns plaintext bytes.
|
||||
func (c *TransitClient) Decrypt(tenantID, ciphertext string) ([]byte, error) {
|
||||
payload, err := json.Marshal(map[string]string{"ciphertext": ciphertext})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal decrypt request: %w", err)
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/v1/transit/decrypt/tenant_%s", c.addr, tenantID)
|
||||
req, err := http.NewRequest("POST", url, bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create decrypt request: %w", err)
|
||||
}
|
||||
req.Header.Set("X-Vault-Token", c.token)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("openbao transit decrypt: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read decrypt response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("openbao transit decrypt failed (status %d): %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var result transitDecryptResponse
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return nil, fmt.Errorf("unmarshal decrypt response: %w", err)
|
||||
}
|
||||
|
||||
plaintext, err := base64.StdEncoding.DecodeString(result.Data.Plaintext)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("decode plaintext base64: %w", err)
|
||||
}
|
||||
|
||||
return plaintext, nil
|
||||
}
|
||||
|
||||
// Encrypt encrypts plaintext bytes via Transit engine. Returns ciphertext string.
|
||||
func (c *TransitClient) Encrypt(tenantID string, plaintext []byte) (string, error) {
|
||||
payload, err := json.Marshal(map[string]string{
|
||||
"plaintext": base64.StdEncoding.EncodeToString(plaintext),
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("marshal encrypt request: %w", err)
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/v1/transit/encrypt/tenant_%s", c.addr, tenantID)
|
||||
req, err := http.NewRequest("POST", url, bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create encrypt request: %w", err)
|
||||
}
|
||||
req.Header.Set("X-Vault-Token", c.token)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("openbao transit encrypt: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("read encrypt response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("openbao transit encrypt failed (status %d): %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Data struct {
|
||||
Ciphertext string `json:"ciphertext"`
|
||||
} `json:"data"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return "", fmt.Errorf("unmarshal encrypt response: %w", err)
|
||||
}
|
||||
|
||||
return result.Data.Ciphertext, nil
|
||||
}
|
||||
Reference in New Issue
Block a user