From 2ad0367c91705d3640177b7a4b5b302122f2e5da Mon Sep 17 00:00:00 2001
From: Jason Staack
Date: Sat, 14 Mar 2026 20:59:14 -0500
Subject: [PATCH] fix(vpn): backport VPN fixes from production debugging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix _commit_and_sync infinite recursion
- Use admin session for subnet_index allocation (bypass RLS)
- Auto-set VPN endpoint from CORS_ORIGINS hostname
- Remove server address field from VPN setup UI
- Add DELETE endpoint and button for VPN config removal
- Add wg-reload watcher for reliable config hot-reload via wg syncconf
- Add wg_status.json writer for live peer handshake status in UI
- Per-tenant SNAT for poller-to-device routing through VPN
- Restrict VPN→eth0 forwarding to Docker networks only (block exit node abuse)
- Use 10.10.0.0/16 allowed-address in RouterOS commands
- Fix structlog event= conflict (use audit=True)
- Export backup_scheduler proxy for firmware/upgrade imports
---
backend/app/routers/vpn.py | 23 ++++++
backend/app/services/backup_scheduler.py | 15 ++++
backend/app/services/vpn_service.py | 57 ++++++++++---
.../custom-cont-init.d/10-forwarding.sh | 79 ++++++++++++++++++-
frontend/src/components/vpn/VpnPage.tsx | 39 +++++----
frontend/src/lib/api.ts | 3 +
setup.py | 9 ++-
7 files changed, 194 insertions(+), 31 deletions(-)
diff --git a/backend/app/routers/vpn.py b/backend/app/routers/vpn.py
index 30221fc..4c499f4 100644
--- a/backend/app/routers/vpn.py
+++ b/backend/app/routers/vpn.py
@@ -92,6 +92,29 @@ async def setup_vpn(
return VpnConfigResponse.model_validate(config)
+@router.delete("/tenants/{tenant_id}/vpn", status_code=status.HTTP_204_NO_CONTENT)
+@limiter.limit("5/minute")
+async def delete_vpn_config(
+ request: Request,
+ tenant_id: uuid.UUID,
+ current_user: CurrentUser = Depends(get_current_user),
+ db: AsyncSession = Depends(get_db),
+):
+ """Delete VPN configuration and all peers for this tenant."""
+ await _check_tenant_access(current_user, tenant_id, db)
+ _require_operator(current_user)
+ config = await vpn_service.get_vpn_config(db, tenant_id)
+ if not config:
+ raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="VPN not configured")
+ # Delete all peers first
+ peers = await vpn_service.get_peers(db, tenant_id)
+ for peer in peers:
+ await db.delete(peer)
+ await db.delete(config)
+ await db.flush()
+ await vpn_service._commit_and_sync(db)
+
+
@router.patch("/tenants/{tenant_id}/vpn", response_model=VpnConfigResponse)
@limiter.limit("20/minute")
async def update_vpn_config(
diff --git a/backend/app/services/backup_scheduler.py b/backend/app/services/backup_scheduler.py
index cbaa5a1..6a20fe0 100644
--- a/backend/app/services/backup_scheduler.py
+++ b/backend/app/services/backup_scheduler.py
@@ -195,3 +195,18 @@ async def stop_backup_scheduler() -> None:
_scheduler.shutdown(wait=False)
_scheduler = None
logger.info("Backup scheduler stopped")
+
+
+class _SchedulerProxy:
+ """Proxy to access the module-level scheduler from other modules.
+
+ Usage: `from app.services.backup_scheduler import backup_scheduler`
+ then `backup_scheduler.add_job(...)`.
+ """
+ def __getattr__(self, name):
+ if _scheduler is None:
+ raise RuntimeError("Backup scheduler not started yet")
+ return getattr(_scheduler, name)
+
+
+backup_scheduler = _SchedulerProxy()
diff --git a/backend/app/services/vpn_service.py b/backend/app/services/vpn_service.py
index 71dd462..5966b5b 100644
--- a/backend/app/services/vpn_service.py
+++ b/backend/app/services/vpn_service.py
@@ -96,7 +96,7 @@ async def _get_or_create_global_server_key(db: AsyncSession) -> tuple[str, str]:
)
await db.flush()
- logger.info("vpn_global_server_keypair_generated", event="vpn_audit")
+ logger.info("vpn_global_server_keypair_generated", audit=True)
return private_key_b64, public_key_b64
@@ -114,11 +114,15 @@ def _allocate_subnet_index_from_used(used: set[int]) -> int:
async def _allocate_subnet_index(db: AsyncSession) -> int:
"""Allocate next available subnet_index from the database.
- Uses gap-filling: finds the lowest integer in [1,255] not already used.
+ Uses an admin session to see ALL tenants' subnet indices (bypasses RLS).
The UNIQUE constraint on subnet_index protects against races.
"""
- result = await db.execute(select(VpnConfig.subnet_index))
- used = {row[0] for row in result.all()}
+ from app.database import AdminAsyncSessionLocal
+ from sqlalchemy import text as sa_text
+
+ async with AdminAsyncSessionLocal() as admin_db:
+ result = await admin_db.execute(sa_text("SELECT subnet_index FROM vpn_config"))
+ used = {row[0] for row in result.fetchall()}
return _allocate_subnet_index_from_used(used)
@@ -158,7 +162,8 @@ async def _commit_and_sync(db: AsyncSession) -> None:
first for their changes to be visible. This helper combines both steps
and provides a single patch point for tests.
"""
- await _commit_and_sync(db)
+ await db.commit()
+ await sync_wireguard_config()
async def sync_wireguard_config() -> None:
@@ -244,11 +249,33 @@ async def sync_wireguard_config() -> None:
tmp_path.write_text("\n".join(lines))
os.rename(str(tmp_path), str(conf_path))
+ # Write per-tenant SNAT rules for poller→device routing
+ # Docker traffic (172.16.0.0/12) going to each tenant's subnet
+ # gets SNATted to that tenant's gateway IP (.1) so the router
+ # can route replies back through the tunnel.
+ nat_lines = ["#!/bin/sh",
+ "# Auto-generated per-tenant SNAT rules",
+ "# Remove old rules",
+ "iptables -t nat -F POSTROUTING 2>/dev/null",
+ "# Re-add Docker DNS rules",
+ ]
+ for config in configs:
+ gateway_ip = config.server_address.split("/")[0] # e.g. 10.10.3.1
+ subnet = config.subnet # e.g. 10.10.3.0/24
+ nat_lines.append(
+ f"iptables -t nat -A POSTROUTING -s 172.16.0.0/12 -d {subnet} -o wg0 -j SNAT --to-source {gateway_ip}"
+ )
+ nat_path = wg_confs_dir / "nat_rules.sh"
+ nat_tmp = wg_confs_dir / "nat_rules.sh.tmp"
+ nat_tmp.write_text("\n".join(nat_lines) + "\n")
+ os.rename(str(nat_tmp), str(nat_path))
+ os.chmod(str(nat_path), 0o755)
+
# Signal WireGuard container to reload
reload_flag = wg_confs_dir / ".reload"
reload_flag.write_text("1")
- logger.info("wireguard_config_synced", event="vpn_audit",
+ logger.info("wireguard_config_synced", audit=True,
tenants=len(configs), peers=total_peers)
finally:
@@ -304,6 +331,18 @@ async def setup_vpn(
if existing:
raise ValueError("VPN already configured for this tenant")
+ # Auto-set endpoint from CORS_ORIGINS if not provided
+ if not endpoint:
+ origins = getattr(settings, "CORS_ORIGINS", "")
+ if isinstance(origins, list):
+ host = origins[0] if origins else ""
+ else:
+ host = str(origins).split(",")[0].strip()
+ # Extract hostname from URL, append WireGuard port
+ if host:
+ host = host.replace("https://", "").replace("http://", "").split("/")[0]
+ endpoint = f"{host}:51820"
+
# Get or create global server keypair
_, public_key_b64 = await _get_or_create_global_server_key(db)
@@ -332,7 +371,7 @@ async def setup_vpn(
db.add(config)
await db.flush()
- logger.info("vpn_subnet_allocated", event="vpn_audit",
+ logger.info("vpn_subnet_allocated", audit=True,
tenant_id=str(tenant_id), subnet_index=subnet_index, subnet=subnet)
await _commit_and_sync(db)
@@ -464,7 +503,7 @@ async def get_peer_config(db: AsyncSession, tenant_id: uuid.UUID, peer_id: uuid.
f'/interface wireguard add name=wg-portal listen-port=13231 private-key="{private_key}"',
f'/interface wireguard peers add interface=wg-portal public-key="{config.server_public_key}" '
f'endpoint-address={endpoint.split(":")[0]} endpoint-port={endpoint.split(":")[-1]} '
- f'allowed-address={config.subnet} persistent-keepalive=25'
+ f'allowed-address=10.10.0.0/16 persistent-keepalive=25'
+ (f' preshared-key="{psk}"' if psk else ""),
f"/ip address add address={peer.assigned_ip} interface=wg-portal",
]
@@ -545,7 +584,7 @@ async def onboard_device(
f'/interface wireguard add name=wg-portal listen-port=13231 private-key="{private_key_b64}"',
f'/interface wireguard peers add interface=wg-portal public-key="{config.server_public_key}" '
f'endpoint-address={endpoint.split(":")[0]} endpoint-port={endpoint.split(":")[-1]} '
- f'allowed-address={config.subnet} persistent-keepalive=25'
+ f'allowed-address=10.10.0.0/16 persistent-keepalive=25'
f' preshared-key="{psk_decrypted}"',
f"/ip address add address={assigned_ip} interface=wg-portal",
]
diff --git a/docker-data/wireguard/custom-cont-init.d/10-forwarding.sh b/docker-data/wireguard/custom-cont-init.d/10-forwarding.sh
index 5e83e5e..58f7a74 100755
--- a/docker-data/wireguard/custom-cont-init.d/10-forwarding.sh
+++ b/docker-data/wireguard/custom-cont-init.d/10-forwarding.sh
@@ -1,8 +1,12 @@
#!/bin/sh
# Enable forwarding between Docker network and WireGuard tunnel
# Idempotent: check before adding to prevent duplicates on restart
+# Allow Docker→VPN (poller/API reaching devices)
iptables -C FORWARD -i eth0 -o wg0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i eth0 -o wg0 -j ACCEPT
-iptables -C FORWARD -i wg0 -o eth0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -j ACCEPT
+# Allow VPN→Docker ONLY (devices reaching poller/API, NOT the public internet)
+iptables -C FORWARD -i wg0 -o eth0 -d 172.16.0.0/12 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -d 172.16.0.0/12 -j ACCEPT
+# Block VPN→anywhere else (prevents using server as exit node)
+iptables -C FORWARD -i wg0 -o eth0 -j DROP 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -j DROP
# Block cross-subnet traffic on wg0 (tenant isolation)
# Peers in 10.10.1.0/24 cannot reach peers in 10.10.2.0/24
@@ -11,7 +15,76 @@ iptables -C FORWARD -i wg0 -o wg0 -j DROP 2>/dev/null || iptables -A FORWARD -i
# Block IPv6 forwarding on wg0 (prevent link-local bypass)
ip6tables -C FORWARD -i wg0 -j DROP 2>/dev/null || ip6tables -A FORWARD -i wg0 -j DROP
-# NAT for return traffic
-iptables -t nat -C POSTROUTING -o wg0 -j MASQUERADE 2>/dev/null || iptables -t nat -A POSTROUTING -o wg0 -j MASQUERADE
+# NAT for return traffic — per-tenant SNAT rules are applied by wg-reload watcher
+# (nat_rules.sh is generated by sync_wireguard_config)
echo "WireGuard forwarding and tenant isolation rules applied"
+
+# Start config reload watcher in background
+# Polls for .reload flag every 2s, applies changes via wg syncconf
+(
+ CONF_DIR="/config/wg_confs"
+ RELOAD_FLAG="$CONF_DIR/.reload"
+ echo "wg-reload: watcher started (pid $$)"
+ # Wait for wg0 interface to be fully up before processing reloads
+ while ! wg show wg0 >/dev/null 2>&1; do
+ sleep 2
+ done
+ # Apply NAT rules on startup if they exist
+ if [ -f "$CONF_DIR/nat_rules.sh" ]; then
+ sh "$CONF_DIR/nat_rules.sh" 2>&1
+ echo "wg-reload: startup NAT rules applied"
+ fi
+ # Clear any reload flag that was set during startup (interface already has the config)
+ rm -f "$RELOAD_FLAG"
+ echo "wg-reload: wg0 is up, watching for changes"
+ while true; do
+ if [ -f "$RELOAD_FLAG" ]; then
+ rm -f "$RELOAD_FLAG"
+ sleep 0.5
+ if [ -f "$CONF_DIR/wg0.conf" ]; then
+ # Strip Address and comments; keep ListenPort + PrivateKey + Peers
+ # wg syncconf rejects Address but needs ListenPort to preserve it
+      # Test syncconf's exit status directly — the NAT block below would clobber $?
+      if grep -v "^Address" "$CONF_DIR/wg0.conf" | grep -v "^#" | wg syncconf wg0 /dev/stdin 2>&1; then
+        echo "wg-reload: config applied"
+      else
+        echo "wg-reload: syncconf failed"
+      fi
+      # Apply per-tenant SNAT rules for poller connectivity
+      if [ -f "$CONF_DIR/nat_rules.sh" ]; then
+        sh "$CONF_DIR/nat_rules.sh" 2>&1
+        echo "wg-reload: NAT rules applied"
+      fi
+ fi
+ fi
+ sleep 2
+ done
+) &
+
+# Start status writer in background
+# Writes wg_status.json every 15 seconds from `wg show wg0 dump`
+(
+ STATUS_FILE="/config/wg_status.json"
+ # Wait for wg0
+ while ! wg show wg0 >/dev/null 2>&1; do
+ sleep 2
+ done
+ echo "wg-status: writer started"
+ while true; do
+ # Parse `wg show wg0 dump` into JSON array
+ # Format: private_key public_key listen_port fwmark
+ # Peer lines: public_key preshared_key endpoint allowed_ips latest_handshake transfer_rx transfer_tx persistent_keepalive
+ wg show wg0 dump 2>/dev/null | awk -F'\t' '
+ BEGIN { printf "[" ; first=1 }
+ NR > 1 {
+ if (!first) printf ","
+ first=0
+ printf "{\"public_key\":\"%s\",\"endpoint\":\"%s\",\"allowed_ips\":\"%s\",\"last_handshake\":%s,\"rx\":%s,\"tx\":%s}",
+ $1, $3, $4, ($5 == "" ? "0" : $5), ($6 == "" ? "0" : $6), ($7 == "" ? "0" : $7)
+ }
+ END { printf "]\n" }
+ ' > "${STATUS_FILE}.tmp" && mv "${STATUS_FILE}.tmp" "$STATUS_FILE"
+ sleep 15
+ done
+) &
diff --git a/frontend/src/components/vpn/VpnPage.tsx b/frontend/src/components/vpn/VpnPage.tsx
index 0209a4d..76b7c74 100644
--- a/frontend/src/components/vpn/VpnPage.tsx
+++ b/frontend/src/components/vpn/VpnPage.tsx
@@ -134,6 +134,16 @@ export function VpnPage() {
},
})
+ const deleteMutation = useMutation({
+ mutationFn: () => vpnApi.deleteConfig(tenantId),
+ onSuccess: () => {
+ queryClient.invalidateQueries({ queryKey: ['vpn-config'] })
+ queryClient.invalidateQueries({ queryKey: ['vpn-peers'] })
+ toast({ title: 'VPN configuration deleted' })
+ },
+ onError: (e: any) => toast({ title: e?.response?.data?.detail || 'Failed to delete VPN', variant: 'destructive' }),
+ })
+
// ── Helpers ──
const connectedPeerIds = new Set(peers.map((p) => p.device_id))
@@ -195,22 +205,6 @@ export function VpnPage() {
-
-
-
setEndpoint(e.target.value)}
- className="text-center"
- />
-
- The public hostname or IP where devices will connect. You can set this later.
-
-
-
{writable && (