feat: add opt-in anonymous setup telemetry

Instruments setup.py to send per-step diagnostic events to the TOD
telemetry collector when the user opts in. Uses a shared static token
with no registration flow — fully anonymous.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jason Staack
2026-03-16 20:04:26 -05:00
parent 21563132e1
commit ecbefae9ab

277
setup.py
View File

@@ -11,8 +11,10 @@ Usage:
import base64 import base64
import datetime import datetime
import getpass import getpass
import json
import os import os
import pathlib import pathlib
import platform
import re import re
import secrets import secrets
import shutil import shutil
@@ -21,6 +23,8 @@ import socket
import subprocess import subprocess
import sys import sys
import time import time
import urllib.error
import urllib.request
# ── Constants ──────────────────────────────────────────────────────────────── # ── Constants ────────────────────────────────────────────────────────────────
@@ -95,6 +99,148 @@ def info(text: str) -> None:
print(f" {dim('·')} {text}") print(f" {dim('·')} {text}")
# ── Setup Telemetry ─────────────────────────────────────────────────────────
_TELEMETRY_COLLECTOR = "https://telemetry.theotherdude.net"
_TELEMETRY_TOKEN = "75e320cbd48e20e3234ab4e734f86e124a903a7278e643cf6d383708a8a7fe4b"
def _collect_environment() -> dict:
"""Gather allowlisted environment info. No IPs, hostnames, or secrets."""
env = {
"os": platform.system(),
"os_version": platform.release(),
"arch": platform.machine(),
"python": platform.python_version(),
}
# Docker version
try:
r = subprocess.run(
["docker", "version", "--format", "{{.Server.Version}}"],
capture_output=True, text=True, timeout=5,
)
if r.returncode == 0:
env["docker"] = r.stdout.strip()
except Exception:
pass
# Compose version
try:
r = subprocess.run(
["docker", "compose", "version", "--short"],
capture_output=True, text=True, timeout=5,
)
if r.returncode == 0:
env["compose"] = r.stdout.strip()
except Exception:
pass
# RAM (rounded to nearest GB)
try:
if sys.platform == "darwin":
r = subprocess.run(
["sysctl", "-n", "hw.memsize"],
capture_output=True, text=True, timeout=5,
)
if r.returncode == 0:
env["ram_gb"] = round(int(r.stdout.strip()) / (1024 ** 3))
else:
with open("/proc/meminfo") as f:
for line in f:
if line.startswith("MemTotal:"):
env["ram_gb"] = round(int(line.split()[1]) * 1024 / (1024 ** 3))
break
except Exception:
pass
return env
def _get_app_version() -> tuple[str, str]:
"""Return (version, build_id) from git if available."""
version = "unknown"
build_id = "unknown"
try:
r = subprocess.run(
["git", "describe", "--tags", "--always"],
capture_output=True, text=True, timeout=5, cwd=PROJECT_ROOT,
)
if r.returncode == 0:
version = r.stdout.strip()
except Exception:
pass
try:
r = subprocess.run(
["git", "rev-parse", "--short", "HEAD"],
capture_output=True, text=True, timeout=5, cwd=PROJECT_ROOT,
)
if r.returncode == 0:
build_id = r.stdout.strip()
except Exception:
pass
return version, build_id
class SetupTelemetry:
"""Lightweight fire-and-forget telemetry for setup diagnostics.
When enabled, sends one event per setup step to the TOD telemetry collector.
All events use a shared anonymous token — no registration, no PII.
"""
def __init__(self) -> None:
self.enabled = False
self._environment: dict = {}
self._app_version = "unknown"
self._build_id = "unknown"
def enable(self) -> None:
self.enabled = True
self._environment = _collect_environment()
self._app_version, self._build_id = _get_app_version()
def step(self, step_name: str, result: str, duration_ms: int | None = None,
error_message: str | None = None, error_code: str | None = None,
metrics: dict | None = None) -> None:
"""Emit a single setup step event. No-op if disabled."""
if not self.enabled:
return
event: dict = {
"event_type": "setup",
"severity": "error" if result == "failure" else "info",
"phase": "setup",
"operation": step_name,
"result": result,
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"app_version": self._app_version,
"build_id": self._build_id,
"environment": self._environment,
}
if duration_ms is not None:
event["duration_ms"] = duration_ms
if error_message:
event["error"] = {"message": error_message[:500], "code": error_code or ""}
if metrics:
event["metrics"] = metrics
self._send([event])
def _send(self, events: list[dict]) -> None:
"""POST events to the collector. Fire-and-forget."""
try:
body = json.dumps({"events": events}).encode()
req = urllib.request.Request(
f"{_TELEMETRY_COLLECTOR}/api/v1/ingest",
data=body,
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {_TELEMETRY_TOKEN}",
},
method="POST",
)
urllib.request.urlopen(req, timeout=10)
except Exception:
pass # Fire-and-forget — never affect setup
# ── Input helpers ──────────────────────────────────────────────────────────── # ── Input helpers ────────────────────────────────────────────────────────────
def ask(prompt: str, default: str = "", required: bool = False, def ask(prompt: str, default: str = "", required: bool = False,
@@ -720,6 +866,27 @@ def wizard_reverse_proxy(config: dict) -> None:
info("Traefik watches for file changes — no reload needed.") info("Traefik watches for file changes — no reload needed.")
def wizard_telemetry(config: dict, telem: SetupTelemetry) -> None:
section("Anonymous Diagnostics")
info("TOD can send anonymous setup and runtime diagnostics to help")
info("identify common failures. No personal data, IPs, hostnames,")
info("or configuration values are ever sent.")
print()
info("What is collected: step pass/fail, duration, OS/arch/Python")
info("version, Docker version, RAM (rounded), and error types.")
info("You can disable this anytime by setting TELEMETRY_ENABLED=false")
info("in .env.prod.")
print()
if ask_yes_no("Send anonymous diagnostics?", default=False):
config["telemetry_enabled"] = True
telem.enable()
ok("Diagnostics enabled — thank you!")
else:
config["telemetry_enabled"] = False
info("No diagnostics will be sent.")
# ── Summary ────────────────────────────────────────────────────────────────── # ── Summary ──────────────────────────────────────────────────────────────────
def show_summary(config: dict) -> bool: def show_summary(config: dict) -> bool:
@@ -765,6 +932,13 @@ def show_summary(config: dict) -> bool:
print(f" {dim('(not configured)')}") print(f" {dim('(not configured)')}")
print() print()
print(f" {bold('Diagnostics')}")
if config.get("telemetry_enabled"):
print(f" TELEMETRY_ENABLED = {green('true')}")
else:
print(f" TELEMETRY_ENABLED = {dim('false')}")
print()
print(f" {bold('OpenBao')}") print(f" {bold('OpenBao')}")
print(f" {dim('(will be captured automatically during bootstrap)')}") print(f" {dim('(will be captured automatically during bootstrap)')}")
print() print()
@@ -868,6 +1042,11 @@ SSH_IDLE_TIMEOUT=900
# --- Config Backup --- # --- Config Backup ---
CONFIG_BACKUP_INTERVAL=21600 CONFIG_BACKUP_INTERVAL=21600
CONFIG_BACKUP_MAX_CONCURRENT=10 CONFIG_BACKUP_MAX_CONCURRENT=10
# --- Telemetry ---
# Opt-in anonymous diagnostics. Set to false to disable.
TELEMETRY_ENABLED={'true' if config.get('telemetry_enabled') else 'false'}
TELEMETRY_COLLECTOR_URL={_TELEMETRY_COLLECTOR}
""" """
ENV_PROD.write_text(content) ENV_PROD.write_text(content)
@@ -1216,12 +1395,34 @@ def health_check(config: dict) -> None:
# ── Main ───────────────────────────────────────────────────────────────────── # ── Main ─────────────────────────────────────────────────────────────────────
def _timed(telem: SetupTelemetry, step_name: str, func, *args, **kwargs):
"""Run func, emit a telemetry event with timing. Returns func's result."""
t0 = time.monotonic()
try:
result = func(*args, **kwargs)
duration_ms = int((time.monotonic() - t0) * 1000)
telem.step(step_name, "success", duration_ms=duration_ms)
return result
except Exception as e:
duration_ms = int((time.monotonic() - t0) * 1000)
telem.step(
step_name, "failure", duration_ms=duration_ms,
error_message=str(e), error_code=type(e).__name__,
)
raise
def main() -> int: def main() -> int:
# Graceful Ctrl+C # Graceful Ctrl+C
env_written = False env_written = False
telem = SetupTelemetry()
setup_start = time.monotonic()
def handle_sigint(sig, frame): def handle_sigint(sig, frame):
nonlocal env_written nonlocal env_written
telem.step("setup_total", "failure",
duration_ms=int((time.monotonic() - setup_start) * 1000),
error_message="User cancelled (SIGINT)")
print() print()
if not env_written: if not env_written:
info("Aborted before writing .env.prod — no files changed.") info("Aborted before writing .env.prod — no files changed.")
@@ -1236,47 +1437,99 @@ def main() -> int:
# Phase 1: Pre-flight # Phase 1: Pre-flight
if not preflight(): if not preflight():
telem.step("preflight", "failure")
return 1 return 1
telem.step("preflight", "success")
# Telemetry opt-in (right after preflight, before wizard)
config: dict = {}
wizard_telemetry(config, telem)
# Phase 2: Wizard # Phase 2: Wizard
config: dict = {} try:
wizard_database(config) wizard_database(config)
wizard_security(config) wizard_security(config)
wizard_admin(config) wizard_admin(config)
wizard_email(config) wizard_email(config)
wizard_domain(config) wizard_domain(config)
wizard_reverse_proxy(config) wizard_reverse_proxy(config)
telem.step("wizard", "success")
except Exception as e:
telem.step("wizard", "failure",
error_message=str(e), error_code=type(e).__name__)
raise
# Summary # Summary
if not show_summary(config): if not show_summary(config):
info("Setup cancelled.") info("Setup cancelled.")
telem.step("setup_total", "failure",
duration_ms=int((time.monotonic() - setup_start) * 1000),
error_message="User cancelled at summary")
return 1 return 1
# Phase 3: Write files and prepare directories # Phase 3: Write files and prepare directories
section("Writing Configuration") section("Writing Configuration")
write_env_prod(config) try:
write_init_sql_prod(config) write_env_prod(config)
env_written = True write_init_sql_prod(config)
prepare_data_dirs() env_written = True
prepare_data_dirs()
telem.step("write_config", "success")
except Exception as e:
telem.step("write_config", "failure",
error_message=str(e), error_code=type(e).__name__)
raise
# Phase 4: OpenBao # Phase 4: OpenBao
t0 = time.monotonic()
bao_ok = bootstrap_openbao(config) bao_ok = bootstrap_openbao(config)
if not bao_ok: duration_ms = int((time.monotonic() - t0) * 1000)
if bao_ok:
telem.step("openbao_bootstrap", "success", duration_ms=duration_ms)
else:
telem.step("openbao_bootstrap", "failure", duration_ms=duration_ms,
error_message="OpenBao did not become healthy or credentials not found")
if not ask_yes_no("Continue without OpenBao credentials? (stack will need manual fix)", default=False): if not ask_yes_no("Continue without OpenBao credentials? (stack will need manual fix)", default=False):
warn("Fix OpenBao credentials in .env.prod and re-run setup.py.") warn("Fix OpenBao credentials in .env.prod and re-run setup.py.")
telem.step("setup_total", "failure",
duration_ms=int((time.monotonic() - setup_start) * 1000),
error_message="Aborted after OpenBao failure")
return 1 return 1
# Phase 5: Build # Phase 5: Build
t0 = time.monotonic()
if not build_images(): if not build_images():
duration_ms = int((time.monotonic() - t0) * 1000)
telem.step("build_images", "failure", duration_ms=duration_ms)
warn("Fix the build error and re-run setup.py to continue.") warn("Fix the build error and re-run setup.py to continue.")
telem.step("setup_total", "failure",
duration_ms=int((time.monotonic() - setup_start) * 1000),
error_message="Docker build failed")
return 1 return 1
duration_ms = int((time.monotonic() - t0) * 1000)
telem.step("build_images", "success", duration_ms=duration_ms)
# Phase 6: Start # Phase 6: Start
t0 = time.monotonic()
if not start_stack(): if not start_stack():
duration_ms = int((time.monotonic() - t0) * 1000)
telem.step("start_stack", "failure", duration_ms=duration_ms)
telem.step("setup_total", "failure",
duration_ms=int((time.monotonic() - setup_start) * 1000),
error_message="Stack failed to start")
return 1 return 1
duration_ms = int((time.monotonic() - t0) * 1000)
telem.step("start_stack", "success", duration_ms=duration_ms)
# Phase 7: Health # Phase 7: Health
t0 = time.monotonic()
health_check(config) health_check(config)
duration_ms = int((time.monotonic() - t0) * 1000)
telem.step("health_check", "success", duration_ms=duration_ms)
# Done
total_ms = int((time.monotonic() - setup_start) * 1000)
telem.step("setup_total", "success", duration_ms=total_ms)
return 0 return 0