feat(02-01): add config backup env vars, NATS event, device SSH fields, migration, metrics

- Config: CONFIG_BACKUP_INTERVAL (21600s), CONFIG_BACKUP_MAX_CONCURRENT (10), CONFIG_BACKUP_COMMAND_TIMEOUT (60s)
- NATS: ConfigSnapshotEvent type, PublishConfigSnapshot method, config.snapshot.> stream subject
- Device: SSHPort/SSHHostKeyFingerprint fields, UpdateSSHHostKey method, updated queries/scans
- Migration 028: ssh_port, ssh_host_key_fingerprint, timestamp columns with poller_user grants
- Metrics: ConfigBackupTotal (counter), ConfigBackupDuration (histogram), ConfigBackupActive (gauge)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jason Staack
2026-03-12 20:48:12 -05:00
parent f1abb75cab
commit 4ae39d2cb3
5 changed files with 162 additions and 4 deletions

View File

@@ -65,6 +65,19 @@ type PushRollbackEvent struct {
PrePushCommitSHA string `json:"pre_push_commit_sha"`
}
// ConfigSnapshotEvent is the payload published to NATS JetStream when a config
// backup is successfully collected from a device. The backend subscribes to
// "config.snapshot.>" to store snapshots and compute diffs.
type ConfigSnapshotEvent struct {
DeviceID string `json:"device_id"`
TenantID string `json:"tenant_id"`
RouterOSVersion string `json:"routeros_version,omitempty"`
CollectedAt string `json:"collected_at"` // RFC3339
SHA256Hash string `json:"sha256_hash"`
ConfigText string `json:"config_text"`
NormalizationVersion int `json:"normalization_version"`
}
// PushAlertEvent triggers an alert for editor pushes (one-click rollback).
type PushAlertEvent struct {
DeviceID string `json:"device_id"`
@@ -122,6 +135,7 @@ func NewPublisher(natsURL string) (*Publisher, error) {
"device.firmware.>",
"device.credential_changed.>",
"config.changed.>",
"config.snapshot.>",
"config.push.rollback.>",
"config.push.alert.>",
"audit.session.end.>",
@@ -257,6 +271,33 @@ func (p *Publisher) PublishConfigChanged(ctx context.Context, event ConfigChange
return nil
}
// PublishConfigSnapshot publishes a config snapshot event to NATS JetStream.
//
// Events are published to "config.snapshot.create.{DeviceID}" so the Python
// backend can store the snapshot and compute diffs against the previous one.
func (p *Publisher) PublishConfigSnapshot(ctx context.Context, event ConfigSnapshotEvent) error {
data, err := json.Marshal(event)
if err != nil {
return fmt.Errorf("marshalling config snapshot event: %w", err)
}
subject := fmt.Sprintf("config.snapshot.create.%s", event.DeviceID)
_, err = p.js.Publish(ctx, subject, data)
if err != nil {
return fmt.Errorf("publishing to %s: %w", subject, err)
}
slog.Debug("published config snapshot event",
"device_id", event.DeviceID,
"tenant_id", event.TenantID,
"sha256_hash", event.SHA256Hash,
"subject", subject,
)
return nil
}
// PublishPushRollback publishes a push rollback event when a device goes offline
// after a template or restore config push, triggering automatic rollback.
func (p *Publisher) PublishPushRollback(ctx context.Context, event PushRollbackEvent) error {

View File

@@ -90,6 +90,15 @@ type Config struct {
// SSHMaxPerDevice is the maximum number of concurrent SSH relay sessions per device.
SSHMaxPerDevice int
// ConfigBackupIntervalSeconds is how often config backups are collected per device (default 6h = 21600s).
ConfigBackupIntervalSeconds int
// ConfigBackupMaxConcurrent is the max number of concurrent config backup jobs.
ConfigBackupMaxConcurrent int
// ConfigBackupCommandTimeoutSeconds is the per-command timeout for SSH config export.
ConfigBackupCommandTimeoutSeconds int
}
// knownInsecureEncryptionKey is the base64-encoded dev default encryption key.
@@ -118,8 +127,11 @@ func Load() (*Config, error) {
SSHRelayPort: getEnv("SSH_RELAY_PORT", "8080"),
SSHIdleTimeout: getEnvInt("SSH_IDLE_TIMEOUT", 900),
SSHMaxSessions: getEnvInt("SSH_MAX_SESSIONS", 200),
SSHMaxPerUser: getEnvInt("SSH_MAX_PER_USER", 10),
SSHMaxPerDevice: getEnvInt("SSH_MAX_PER_DEVICE", 20),
SSHMaxPerUser: getEnvInt("SSH_MAX_PER_USER", 10),
SSHMaxPerDevice: getEnvInt("SSH_MAX_PER_DEVICE", 20),
ConfigBackupIntervalSeconds: getEnvInt("CONFIG_BACKUP_INTERVAL", 21600),
ConfigBackupMaxConcurrent: getEnvInt("CONFIG_BACKUP_MAX_CONCURRENT", 10),
ConfigBackupCommandTimeoutSeconds: getEnvInt("CONFIG_BACKUP_COMMAND_TIMEOUT", 60),
}
if cfg.DatabaseURL == "" {

View File

@@ -58,3 +58,23 @@ var CircuitBreakerResets = promauto.NewCounter(prometheus.CounterOpts{
Name: "mikrotik_circuit_breaker_resets_total",
Help: "Total circuit breaker resets when a device recovers.",
})
// ConfigBackupTotal counts config backup operations by status.
// Status labels: "success", "error", "skipped_offline", "skipped_auth_blocked", "skipped_hostkey_blocked".
var ConfigBackupTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "mikrotik_config_backup_total",
Help: "Total config backup operations.",
}, []string{"status"})
// ConfigBackupDuration tracks the duration of individual config backup operations.
var ConfigBackupDuration = promauto.NewHistogram(prometheus.HistogramOpts{
Name: "mikrotik_config_backup_duration_seconds",
Help: "Duration of a single config backup operation in seconds.",
Buckets: []float64{1, 5, 10, 30, 60, 120, 300},
})
// ConfigBackupActive tracks the number of concurrent config backup jobs running.
var ConfigBackupActive = promauto.NewGauge(prometheus.GaugeOpts{
Name: "mikrotik_config_backup_active",
Help: "Number of concurrent config backup jobs running.",
})

View File

@@ -22,6 +22,8 @@ type Device struct {
MajorVersion *int
TLSMode string // "insecure" or "portal_ca"
CACertPEM *string // PEM-encoded CA cert (only populated when TLSMode = "portal_ca")
SSHPort int // SSH port for config backup (default 22)
SSHHostKeyFingerprint *string // TOFU SSH host key fingerprint (SHA256:base64)
}
// DeviceStore manages PostgreSQL connections for device data access.
@@ -65,7 +67,9 @@ func (s *DeviceStore) FetchDevices(ctx context.Context) ([]Device, error) {
d.routeros_version,
d.routeros_major_version,
d.tls_mode,
ca.cert_pem
ca.cert_pem,
COALESCE(d.ssh_port, 22),
d.ssh_host_key_fingerprint
FROM devices d
LEFT JOIN certificate_authorities ca
ON d.tenant_id = ca.tenant_id
@@ -95,6 +99,8 @@ func (s *DeviceStore) FetchDevices(ctx context.Context) ([]Device, error) {
&d.MajorVersion,
&d.TLSMode,
&d.CACertPEM,
&d.SSHPort,
&d.SSHHostKeyFingerprint,
); err != nil {
return nil, fmt.Errorf("scanning device row: %w", err)
}
@@ -122,7 +128,9 @@ func (s *DeviceStore) GetDevice(ctx context.Context, deviceID string) (Device, e
d.routeros_version,
d.routeros_major_version,
d.tls_mode,
ca.cert_pem
ca.cert_pem,
COALESCE(d.ssh_port, 22),
d.ssh_host_key_fingerprint
FROM devices d
LEFT JOIN certificate_authorities ca
ON d.tenant_id = ca.tenant_id
@@ -142,6 +150,8 @@ func (s *DeviceStore) GetDevice(ctx context.Context, deviceID string) (Device, e
&d.MajorVersion,
&d.TLSMode,
&d.CACertPEM,
&d.SSHPort,
&d.SSHHostKeyFingerprint,
)
if err != nil {
return Device{}, fmt.Errorf("querying device %s: %w", deviceID, err)
@@ -149,6 +159,17 @@ func (s *DeviceStore) GetDevice(ctx context.Context, deviceID string) (Device, e
return d, nil
}
// UpdateSSHHostKey stores the SSH host key fingerprint for TOFU verification.
// Called after a successful first-connect to persist the observed fingerprint.
func (s *DeviceStore) UpdateSSHHostKey(ctx context.Context, deviceID string, fingerprint string) error {
const query = `UPDATE devices SET ssh_host_key_fingerprint = $1, ssh_host_key_first_seen = COALESCE(ssh_host_key_first_seen, NOW()), ssh_host_key_last_verified = NOW() WHERE id = $2`
_, err := s.pool.Exec(ctx, query, fingerprint, deviceID)
if err != nil {
return fmt.Errorf("updating SSH host key for device %s: %w", deviceID, err)
}
return nil
}
// Pool returns the underlying pgxpool.Pool for shared use by other subsystems
// (e.g., credential cache key_access_log inserts).
func (s *DeviceStore) Pool() *pgxpool.Pool {