fix(vpn): backport VPN fixes from production debugging

- Fix _commit_and_sync infinite recursion
- Use admin session for subnet_index allocation (bypass RLS)
- Auto-set VPN endpoint from CORS_ORIGINS hostname
- Remove server address field from VPN setup UI
- Add DELETE endpoint and button for VPN config removal
- Add wg-reload watcher for reliable config hot-reload via wg syncconf
- Add wg_status.json writer for live peer handshake status in UI
- Per-tenant SNAT for poller-to-device routing through VPN
- Restrict VPN→eth0 forwarding to Docker networks only (block exit node abuse)
- Use 10.10.0.0/16 allowed-address in RouterOS commands
- Fix structlog event= conflict (use audit=True)
- Export backup_scheduler proxy for firmware/upgrade imports
This commit is contained in:
Jason Staack
2026-03-14 20:59:14 -05:00
parent b5f9bf14df
commit 2ad0367c91
7 changed files with 194 additions and 31 deletions

View File

@@ -92,6 +92,29 @@ async def setup_vpn(
return VpnConfigResponse.model_validate(config) return VpnConfigResponse.model_validate(config)
@router.delete("/tenants/{tenant_id}/vpn", status_code=status.HTTP_204_NO_CONTENT)
@limiter.limit("5/minute")
async def delete_vpn_config(
    request: Request,
    tenant_id: uuid.UUID,
    current_user: CurrentUser = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Delete VPN configuration and all peers for this tenant.

    Requires access to the tenant plus operator rights. Returns 404 if
    VPN was never configured; otherwise removes every peer row, then the
    config row, and re-syncs the on-disk WireGuard configuration.
    """
    await _check_tenant_access(current_user, tenant_id, db)
    _require_operator(current_user)

    config = await vpn_service.get_vpn_config(db, tenant_id)
    if not config:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="VPN not configured")

    # Peers reference the config, so remove them before the config row.
    for peer in await vpn_service.get_peers(db, tenant_id):
        await db.delete(peer)
    await db.delete(config)
    await db.flush()

    # Commit and regenerate wg0.conf so the change takes effect immediately.
    await vpn_service._commit_and_sync(db)
@router.patch("/tenants/{tenant_id}/vpn", response_model=VpnConfigResponse) @router.patch("/tenants/{tenant_id}/vpn", response_model=VpnConfigResponse)
@limiter.limit("20/minute") @limiter.limit("20/minute")
async def update_vpn_config( async def update_vpn_config(

View File

@@ -195,3 +195,18 @@ async def stop_backup_scheduler() -> None:
_scheduler.shutdown(wait=False) _scheduler.shutdown(wait=False)
_scheduler = None _scheduler = None
logger.info("Backup scheduler stopped") logger.info("Backup scheduler stopped")
class _SchedulerProxy:
    """Late-binding proxy over the module-level scheduler instance.

    Lets other modules hold a stable import::

        from app.services.backup_scheduler import backup_scheduler
        backup_scheduler.add_job(...)

    Attribute access is forwarded to ``_scheduler`` at call time, so
    importers always talk to whichever scheduler is currently running.
    """

    def __getattr__(self, name):
        # _scheduler is set by start_backup_scheduler() and cleared on
        # shutdown; guard against use outside that window.
        scheduler = _scheduler
        if scheduler is None:
            raise RuntimeError("Backup scheduler not started yet")
        return getattr(scheduler, name)


backup_scheduler = _SchedulerProxy()

View File

@@ -96,7 +96,7 @@ async def _get_or_create_global_server_key(db: AsyncSession) -> tuple[str, str]:
) )
await db.flush() await db.flush()
logger.info("vpn_global_server_keypair_generated", event="vpn_audit") logger.info("vpn_global_server_keypair_generated", audit=True)
return private_key_b64, public_key_b64 return private_key_b64, public_key_b64
@@ -114,11 +114,15 @@ def _allocate_subnet_index_from_used(used: set[int]) -> int:
async def _allocate_subnet_index(db: AsyncSession) -> int: async def _allocate_subnet_index(db: AsyncSession) -> int:
"""Allocate next available subnet_index from the database. """Allocate next available subnet_index from the database.
Uses gap-filling: finds the lowest integer in [1,255] not already used. Uses an admin session to see ALL tenants' subnet indices (bypasses RLS).
The UNIQUE constraint on subnet_index protects against races. The UNIQUE constraint on subnet_index protects against races.
""" """
result = await db.execute(select(VpnConfig.subnet_index)) from app.database import AdminAsyncSessionLocal
used = {row[0] for row in result.all()} from sqlalchemy import text as sa_text
async with AdminAsyncSessionLocal() as admin_db:
result = await admin_db.execute(sa_text("SELECT subnet_index FROM vpn_config"))
used = {row[0] for row in result.fetchall()}
return _allocate_subnet_index_from_used(used) return _allocate_subnet_index_from_used(used)
@@ -158,7 +162,8 @@ async def _commit_and_sync(db: AsyncSession) -> None:
first for their changes to be visible. This helper combines both steps first for their changes to be visible. This helper combines both steps
and provides a single patch point for tests. and provides a single patch point for tests.
""" """
await _commit_and_sync(db) await db.commit()
await sync_wireguard_config()
async def sync_wireguard_config() -> None: async def sync_wireguard_config() -> None:
@@ -244,11 +249,33 @@ async def sync_wireguard_config() -> None:
tmp_path.write_text("\n".join(lines)) tmp_path.write_text("\n".join(lines))
os.rename(str(tmp_path), str(conf_path)) os.rename(str(tmp_path), str(conf_path))
# Write per-tenant SNAT rules for poller→device routing
# Docker traffic (172.16.0.0/12) going to each tenant's subnet
# gets SNATted to that tenant's gateway IP (.1) so the router
# can route replies back through the tunnel.
nat_lines = ["#!/bin/sh",
"# Auto-generated per-tenant SNAT rules",
"# Remove old rules",
"iptables -t nat -F POSTROUTING 2>/dev/null",
"# Re-add Docker DNS rules",
]
for config in configs:
gateway_ip = config.server_address.split("/")[0] # e.g. 10.10.3.1
subnet = config.subnet # e.g. 10.10.3.0/24
nat_lines.append(
f"iptables -t nat -A POSTROUTING -s 172.16.0.0/12 -d {subnet} -o wg0 -j SNAT --to-source {gateway_ip}"
)
nat_path = wg_confs_dir / "nat_rules.sh"
nat_tmp = wg_confs_dir / "nat_rules.sh.tmp"
nat_tmp.write_text("\n".join(nat_lines) + "\n")
os.rename(str(nat_tmp), str(nat_path))
os.chmod(str(nat_path), 0o755)
# Signal WireGuard container to reload # Signal WireGuard container to reload
reload_flag = wg_confs_dir / ".reload" reload_flag = wg_confs_dir / ".reload"
reload_flag.write_text("1") reload_flag.write_text("1")
logger.info("wireguard_config_synced", event="vpn_audit", logger.info("wireguard_config_synced", audit=True,
tenants=len(configs), peers=total_peers) tenants=len(configs), peers=total_peers)
finally: finally:
@@ -304,6 +331,18 @@ async def setup_vpn(
if existing: if existing:
raise ValueError("VPN already configured for this tenant") raise ValueError("VPN already configured for this tenant")
# Auto-set endpoint from CORS_ORIGINS if not provided
if not endpoint:
origins = getattr(settings, "CORS_ORIGINS", "")
if isinstance(origins, list):
host = origins[0] if origins else ""
else:
host = str(origins).split(",")[0].strip()
# Extract hostname from URL, append WireGuard port
if host:
host = host.replace("https://", "").replace("http://", "").split("/")[0]
endpoint = f"{host}:51820"
# Get or create global server keypair # Get or create global server keypair
_, public_key_b64 = await _get_or_create_global_server_key(db) _, public_key_b64 = await _get_or_create_global_server_key(db)
@@ -332,7 +371,7 @@ async def setup_vpn(
db.add(config) db.add(config)
await db.flush() await db.flush()
logger.info("vpn_subnet_allocated", event="vpn_audit", logger.info("vpn_subnet_allocated", audit=True,
tenant_id=str(tenant_id), subnet_index=subnet_index, subnet=subnet) tenant_id=str(tenant_id), subnet_index=subnet_index, subnet=subnet)
await _commit_and_sync(db) await _commit_and_sync(db)
@@ -464,7 +503,7 @@ async def get_peer_config(db: AsyncSession, tenant_id: uuid.UUID, peer_id: uuid.
f'/interface wireguard add name=wg-portal listen-port=13231 private-key="{private_key}"', f'/interface wireguard add name=wg-portal listen-port=13231 private-key="{private_key}"',
f'/interface wireguard peers add interface=wg-portal public-key="{config.server_public_key}" ' f'/interface wireguard peers add interface=wg-portal public-key="{config.server_public_key}" '
f'endpoint-address={endpoint.split(":")[0]} endpoint-port={endpoint.split(":")[-1]} ' f'endpoint-address={endpoint.split(":")[0]} endpoint-port={endpoint.split(":")[-1]} '
f'allowed-address={config.subnet} persistent-keepalive=25' f'allowed-address=10.10.0.0/16 persistent-keepalive=25'
+ (f' preshared-key="{psk}"' if psk else ""), + (f' preshared-key="{psk}"' if psk else ""),
f"/ip address add address={peer.assigned_ip} interface=wg-portal", f"/ip address add address={peer.assigned_ip} interface=wg-portal",
] ]
@@ -545,7 +584,7 @@ async def onboard_device(
f'/interface wireguard add name=wg-portal listen-port=13231 private-key="{private_key_b64}"', f'/interface wireguard add name=wg-portal listen-port=13231 private-key="{private_key_b64}"',
f'/interface wireguard peers add interface=wg-portal public-key="{config.server_public_key}" ' f'/interface wireguard peers add interface=wg-portal public-key="{config.server_public_key}" '
f'endpoint-address={endpoint.split(":")[0]} endpoint-port={endpoint.split(":")[-1]} ' f'endpoint-address={endpoint.split(":")[0]} endpoint-port={endpoint.split(":")[-1]} '
f'allowed-address={config.subnet} persistent-keepalive=25' f'allowed-address=10.10.0.0/16 persistent-keepalive=25'
f' preshared-key="{psk_decrypted}"', f' preshared-key="{psk_decrypted}"',
f"/ip address add address={assigned_ip} interface=wg-portal", f"/ip address add address={assigned_ip} interface=wg-portal",
] ]

View File

@@ -1,8 +1,12 @@
#!/bin/sh #!/bin/sh
# Enable forwarding between Docker network and WireGuard tunnel # Enable forwarding between Docker network and WireGuard tunnel
# Idempotent: check before adding to prevent duplicates on restart # Idempotent: check before adding to prevent duplicates on restart
# Allow Docker→VPN (poller/API reaching devices)
iptables -C FORWARD -i eth0 -o wg0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i eth0 -o wg0 -j ACCEPT iptables -C FORWARD -i eth0 -o wg0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i eth0 -o wg0 -j ACCEPT
iptables -C FORWARD -i wg0 -o eth0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -j ACCEPT # Allow VPN→Docker ONLY (devices reaching poller/API, NOT the public internet)
iptables -C FORWARD -i wg0 -o eth0 -d 172.16.0.0/12 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -d 172.16.0.0/12 -j ACCEPT
# Block VPN→anywhere else (prevents using server as exit node)
iptables -C FORWARD -i wg0 -o eth0 -j DROP 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -j DROP
# Block cross-subnet traffic on wg0 (tenant isolation) # Block cross-subnet traffic on wg0 (tenant isolation)
# Peers in 10.10.1.0/24 cannot reach peers in 10.10.2.0/24 # Peers in 10.10.1.0/24 cannot reach peers in 10.10.2.0/24
@@ -11,7 +15,76 @@ iptables -C FORWARD -i wg0 -o wg0 -j DROP 2>/dev/null || iptables -A FORWARD -i
# Block IPv6 forwarding on wg0 (prevent link-local bypass) # Block IPv6 forwarding on wg0 (prevent link-local bypass)
ip6tables -C FORWARD -i wg0 -j DROP 2>/dev/null || ip6tables -A FORWARD -i wg0 -j DROP ip6tables -C FORWARD -i wg0 -j DROP 2>/dev/null || ip6tables -A FORWARD -i wg0 -j DROP
# NAT for return traffic # NAT for return traffic — per-tenant SNAT rules are applied by wg-reload watcher
iptables -t nat -C POSTROUTING -o wg0 -j MASQUERADE 2>/dev/null || iptables -t nat -A POSTROUTING -o wg0 -j MASQUERADE # (nat_rules.sh is generated by sync_wireguard_config)
echo "WireGuard forwarding and tenant isolation rules applied" echo "WireGuard forwarding and tenant isolation rules applied"
# Start config reload watcher in background.
# Polls for the .reload flag every 2s and applies changes via `wg syncconf`
# (hot-reload: updates peers without tearing down the wg0 interface).
(
    CONF_DIR="/config/wg_confs"
    RELOAD_FLAG="$CONF_DIR/.reload"
    echo "wg-reload: watcher started (pid $$)"
    # Wait for wg0 interface to be fully up before processing reloads
    while ! wg show wg0 >/dev/null 2>&1; do
        sleep 2
    done
    # Apply NAT rules on startup if they exist
    if [ -f "$CONF_DIR/nat_rules.sh" ]; then
        sh "$CONF_DIR/nat_rules.sh" 2>&1
        echo "wg-reload: startup NAT rules applied"
    fi
    # Clear any reload flag that was set during startup (interface already has the config)
    rm -f "$RELOAD_FLAG"
    echo "wg-reload: wg0 is up, watching for changes"
    while true; do
        if [ -f "$RELOAD_FLAG" ]; then
            rm -f "$RELOAD_FLAG"
            # Short grace period so the writer finishes renaming wg0.conf
            # into place before we read it.
            sleep 0.5
            if [ -f "$CONF_DIR/wg0.conf" ]; then
                # Strip Address and comments; keep ListenPort + PrivateKey + Peers
                # (wg syncconf rejects Address but needs ListenPort to preserve it)
                grep -v "^Address" "$CONF_DIR/wg0.conf" | grep -v "^#" | wg syncconf wg0 /dev/stdin 2>&1
                # FIX: capture syncconf's exit status immediately — previously
                # `$?` was tested only after the NAT block below ran, so it
                # reflected the NAT script (or the [ -f ] test), never syncconf.
                sync_rc=$?
                # Apply per-tenant SNAT rules for poller connectivity
                if [ -f "$CONF_DIR/nat_rules.sh" ]; then
                    sh "$CONF_DIR/nat_rules.sh" 2>&1
                    echo "wg-reload: NAT rules applied"
                fi
                if [ "$sync_rc" -eq 0 ]; then
                    echo "wg-reload: config applied"
                else
                    echo "wg-reload: syncconf failed"
                fi
            fi
        fi
        sleep 2
    done
) &
# Start status writer in background
# Writes wg_status.json every 15 seconds from `wg show wg0 dump`
# so the UI can show live peer handshake status without shelling out.
(
STATUS_FILE="/config/wg_status.json"
# Wait for wg0
while ! wg show wg0 >/dev/null 2>&1; do
sleep 2
done
echo "wg-status: writer started"
while true; do
# Parse `wg show wg0 dump` into JSON array
# Format: private_key public_key listen_port fwmark
# Peer lines: public_key preshared_key endpoint allowed_ips latest_handshake transfer_rx transfer_tx persistent_keepalive
# NR > 1 skips the interface line; "(none)"-style empty numeric fields
# are coerced to 0 so the output stays valid JSON.
# Written to a .tmp file then mv'd so readers never see a partial file.
wg show wg0 dump 2>/dev/null | awk -F'\t' '
BEGIN { printf "[" ; first=1 }
NR > 1 {
if (!first) printf ","
first=0
printf "{\"public_key\":\"%s\",\"endpoint\":\"%s\",\"allowed_ips\":\"%s\",\"last_handshake\":%s,\"rx\":%s,\"tx\":%s}",
$1, $3, $4, ($5 == "" ? "0" : $5), ($6 == "" ? "0" : $6), ($7 == "" ? "0" : $7)
}
END { printf "]\n" }
' > "${STATUS_FILE}.tmp" && mv "${STATUS_FILE}.tmp" "$STATUS_FILE"
sleep 15
done
) &

View File

@@ -134,6 +134,16 @@ export function VpnPage() {
}, },
}) })
// Mutation: delete the tenant's VPN configuration (and all peers, server-side).
// On success, invalidate both the config and peer queries so the page
// falls back to the "not configured" setup view.
const deleteMutation = useMutation({
mutationFn: () => vpnApi.deleteConfig(tenantId),
onSuccess: () => {
queryClient.invalidateQueries({ queryKey: ['vpn-config'] })
queryClient.invalidateQueries({ queryKey: ['vpn-peers'] })
toast({ title: 'VPN configuration deleted' })
},
onError: (e: any) => toast({ title: e?.response?.data?.detail || 'Failed to delete VPN', variant: 'destructive' }),
})
// ── Helpers ── // ── Helpers ──
const connectedPeerIds = new Set(peers.map((p) => p.device_id)) const connectedPeerIds = new Set(peers.map((p) => p.device_id))
@@ -195,22 +205,6 @@ export function VpnPage() {
</p> </p>
</div> </div>
<div className="space-y-3 text-left">
<Label htmlFor="endpoint" className="text-text-secondary">
Server Address <span className="text-text-muted">(optional)</span>
</Label>
<Input
id="endpoint"
placeholder="your-server.example.com:51820"
value={endpoint}
onChange={(e) => setEndpoint(e.target.value)}
className="text-center"
/>
<p className="text-xs text-text-muted">
The public hostname or IP where devices will connect. You can set this later.
</p>
</div>
{writable && ( {writable && (
<Button <Button
onClick={() => setupMutation.mutate()} onClick={() => setupMutation.mutate()}
@@ -264,6 +258,19 @@ export function VpnPage() {
<Button size="sm" onClick={() => setShowAddDevice(true)}> <Button size="sm" onClick={() => setShowAddDevice(true)}>
<Plus className="h-4 w-4 mr-1" /> Add Device <Plus className="h-4 w-4 mr-1" /> Add Device
</Button> </Button>
<Button
variant="outline"
size="sm"
className="text-red-400 border-red-800 hover:bg-red-900/30"
onClick={() => {
if (confirm('Delete VPN configuration? All peers will be removed.')) {
deleteMutation.mutate()
}
}}
disabled={deleteMutation.isPending}
>
Delete VPN
</Button>
</> </>
)} )}
</div> </div>

View File

@@ -1136,6 +1136,9 @@ export const vpnApi = {
updateConfig: (tenantId: string, data: { endpoint?: string; is_enabled?: boolean }) => updateConfig: (tenantId: string, data: { endpoint?: string; is_enabled?: boolean }) =>
api.patch<VpnConfigResponse>(`/api/tenants/${tenantId}/vpn`, data).then((r) => r.data), api.patch<VpnConfigResponse>(`/api/tenants/${tenantId}/vpn`, data).then((r) => r.data),
// DELETE /api/tenants/{id}/vpn — removes the config and all peers; 204, no body.
deleteConfig: (tenantId: string) =>
api.delete(`/api/tenants/${tenantId}/vpn`),
listPeers: (tenantId: string) => listPeers: (tenantId: string) =>
api.get<VpnPeerResponse[]>(`/api/tenants/${tenantId}/vpn/peers`).then((r) => r.data), api.get<VpnPeerResponse[]>(`/api/tenants/${tenantId}/vpn/peers`).then((r) => r.data),

View File

@@ -988,8 +988,12 @@ def prepare_data_dirs() -> None:
#!/bin/sh #!/bin/sh
# Enable forwarding between Docker network and WireGuard tunnel # Enable forwarding between Docker network and WireGuard tunnel
# Idempotent: check before adding to prevent duplicates on restart # Idempotent: check before adding to prevent duplicates on restart
# Allow Docker→VPN (poller/API reaching devices)
iptables -C FORWARD -i eth0 -o wg0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i eth0 -o wg0 -j ACCEPT iptables -C FORWARD -i eth0 -o wg0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i eth0 -o wg0 -j ACCEPT
iptables -C FORWARD -i wg0 -o eth0 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -j ACCEPT # Allow VPN→Docker ONLY (devices reaching poller/API, NOT the public internet)
iptables -C FORWARD -i wg0 -o eth0 -d 172.16.0.0/12 -j ACCEPT 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -d 172.16.0.0/12 -j ACCEPT
# Block VPN→anywhere else (prevents using server as exit node)
iptables -C FORWARD -i wg0 -o eth0 -j DROP 2>/dev/null || iptables -A FORWARD -i wg0 -o eth0 -j DROP
# Block cross-subnet traffic on wg0 (tenant isolation) # Block cross-subnet traffic on wg0 (tenant isolation)
# Peers in 10.10.1.0/24 cannot reach peers in 10.10.2.0/24 # Peers in 10.10.1.0/24 cannot reach peers in 10.10.2.0/24
@@ -998,8 +1002,7 @@ iptables -C FORWARD -i wg0 -o wg0 -j DROP 2>/dev/null || iptables -A FORWARD -i
# Block IPv6 forwarding on wg0 (prevent link-local bypass) # Block IPv6 forwarding on wg0 (prevent link-local bypass)
ip6tables -C FORWARD -i wg0 -j DROP 2>/dev/null || ip6tables -A FORWARD -i wg0 -j DROP ip6tables -C FORWARD -i wg0 -j DROP 2>/dev/null || ip6tables -A FORWARD -i wg0 -j DROP
# NAT for return traffic # NAT for return traffic — per-tenant SNAT rules applied by wg-reload watcher
iptables -t nat -C POSTROUTING -o wg0 -j MASQUERADE 2>/dev/null || iptables -t nat -A POSTROUTING -o wg0 -j MASQUERADE
echo "WireGuard forwarding and tenant isolation rules applied" echo "WireGuard forwarding and tenant isolation rules applied"
""") """)