openova/core/controllers/sandbox/cmd/sandbox-controller/main.go
hatiyildiz 88453dc4c2 fix(sandbox-controller): emit canonical SANDBOX_* env vars for MCP plugin (Refs #1986)
TBD-P4 B4 — env-var name drift between the sandbox-controller and the
MCP plugin silently degraded every MCP tool family to "not configured"
at runtime. The controller emitted bare `ORG_ID` and `SOVEREIGN_FQDN`
on every rendered MCP Deployment while the MCP binary
(products/sandbox/mcp-server/internal/tools/env.go) reads the
namespaced canonical `SANDBOX_ORG_ID` / `SANDBOX_SOVEREIGN_FQDN`. Per
agent a99ea3aa's investigation, six additional env-var families the
MCP requires were never wired at all.

Surgical alignment across renderer + chart + controller wiring:

1. core/controllers/sandbox/internal/gitops/manifests.go — MCP
   Deployment template renamed the bare names AND grew env entries
   for the canonical set the MCP plugin reads:

   Rename (MCP Deployment only; pty-server StatefulSet keeps the bare
   names since they are inherited into the user's agent shell — that
   is a distinct contract):
     ORG_ID         -> SANDBOX_ORG_ID            (tool family: all)
     SOVEREIGN_FQDN -> SANDBOX_SOVEREIGN_FQDN    (tool family: all)

   Added (the MCP plugin was reading them; controller wasn't emitting):
     SANDBOX_ID                    -> identifies the Sandbox CR
     SANDBOX_NAMESPACE             -> rendered ns sandbox-<owner-uid>
     SANDBOX_TENANT_ID             -> scopes marketplace/byod handler
     SANDBOX_GITEA_BASE_URL        -> sandbox.deploy / gitea tool family
     SANDBOX_GITEA_TOKEN (secret)  -> ditto, via secretKeyRef optional
     SANDBOX_DOMAIN_API_URL        -> marketplace tool family
     SANDBOX_MARKETPLACE_API_URL   -> marketplace tool family
     SANDBOX_STORAGE_S3_ENDPOINT   -> sandbox.storage tool family
     SANDBOX_STORAGE_S3_REGION     -> ditto
     SANDBOX_STORAGE_S3_USE_TLS    -> ditto
     SANDBOX_STORAGE_S3_ACCESS_KEY -> ditto, via secretKeyRef optional
     SANDBOX_STORAGE_S3_SECRET_KEY -> ditto, via secretKeyRef optional
     KEYCLOAK_ADMIN_URL            -> sandbox.auth tool family
     KEYCLOAK_PARENT_REALM         -> ditto
     KEYCLOAK_ADMIN_TOKEN (secret) -> ditto, via secretKeyRef optional

2. platform/sandbox/chart — bp-sandbox HR surfaces the new wiring as
   chart-level values (mcp.giteaBaseURL, mcp.domainAPIURL,
   mcp.storage.*, mcp.keycloak.*) defaulting to the in-cluster Service
   DNS of a stock Sovereign install. Per-Sovereign overlays may
   override any value. Secrets are NEVER written from this chart —
   name+key references only with `optional: true` so a fresh-prov
   Sovereign with a credential source in flight does NOT crash the
   per-Sandbox MCP Pod; the affected tool family surfaces a clean
   "not configured" error at call time (matches the MCP plugin's
   existing per-tool guard pattern).

3. Chart.yaml + bootstrap-kit pin (19a-bp-sandbox.yaml) bumped to
   0.2.0 so the per-Sovereign overlay picks up the new env surface
   on the next reconcile.

4. sandbox_controller_test.go — extended deployment-mcp.yaml assertion
   block to assert the canonical SANDBOX_* env-var set + value
   plumbing AND added a negative assertion that the bare `ORG_ID` /
   `SOVEREIGN_FQDN` names MUST NOT appear on the MCP Deployment
   (they remain on the pty-server StatefulSet, distinct contract).
   Regression test against future re-introduction of the drift.

Validation:
 - go test ./sandbox/... — all green (controller / gitops / idlescaler
   / newapi / sandboxapi).
 - helm template platform/sandbox/chart --set enabled=true ... — clean
   render, 16 SANDBOX_MCP_* env vars emitted on the controller
   Deployment.

Hard rules honoured:
 - READ-ONLY against existing cluster (no kubectl writes).
 - No Secret writes — name+key references only, all `optional: true`.
 - emrah.baysal mailbox + Stalwart admin untouched.
 - Principle #12 fresh clone validation.

Refs #1986

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 22:00:57 +02:00

333 lines
13 KiB
Go

// sandbox-controller — Wave 1 + Wave 8 + Wave 9 of the Sandbox product
// (products/sandbox/docs/architecture.md §7).
//
// Production entry point. Reads configuration from environment vars,
// constructs the controller-runtime manager, and starts the Sandbox
// reconciler with leader election.
package main
import (
"context"
"flag"
"fmt"
"os"
"strconv"
"strings"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"github.com/openova-io/openova/core/controllers/pkg/gitea"
"github.com/openova-io/openova/core/controllers/pkg/natsbus"
"github.com/openova-io/openova/core/controllers/sandbox/internal/controller"
"github.com/openova-io/openova/core/controllers/sandbox/internal/idlescaler"
"github.com/openova-io/openova/core/controllers/sandbox/internal/newapi"
sandboxapi "github.com/openova-io/openova/core/controllers/sandbox/internal/sandboxapi"
)
// scheme is the runtime.Scheme passed to the controller-runtime manager
// in main. It starts empty and is filled in by init.
var scheme = runtime.NewScheme()

// init registers the client-go built-in types and the sandbox API types
// into scheme, in that order. Each registration error is passed to
// utilruntime.Must.
func init() {
	registrars := []func(*runtime.Scheme) error{
		clientgoscheme.AddToScheme,
		sandboxapi.AddToScheme,
	}
	for _, register := range registrars {
		utilruntime.Must(register(scheme))
	}
}
// main is the process entry point. It parses the command-line flags,
// reads the controller configuration from environment variables, builds
// the controller-runtime manager, registers the Sandbox reconciler, the
// idle scaler and (when NATS_URL is set) the NATS bridge runnable, and
// finally calls mgr.Start with the signal-handler context. Every setup
// failure handled here is logged and ends the process with exit status 1.
func main() {
var (
metricsAddr string
probeAddr string
enableLeaderElection bool
)
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", true,
"Enable leader election for controller manager. Defaults to true so HA replicas don't double-write.")
// The zap logger options register their own flags on the shared
// flag.CommandLine set, so they are bound before flag.Parse runs.
opts := zap.Options{Development: false}
opts.BindFlags(flag.CommandLine)
flag.Parse()
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
log := ctrl.Log.WithName("sandbox-controller")
// Required Gitea / cluster identity settings: mustEnv terminates the
// process when any of these is unset or blank. Branch and tenant repo
// name fall back to "main" and "catalyst-tenant".
giteaURL := mustEnv("CATALYST_GITEA_URL", log)
giteaToken := mustEnv("CATALYST_GITEA_TOKEN", log)
hostCluster := mustEnv("CATALYST_HOST_CLUSTER", log)
sovereignFQDN := mustEnv("CATALYST_SOVEREIGN_FQDN", log)
branch := envOr("CATALYST_GITEA_BRANCH", "main")
tenantRepo := envOr("CATALYST_TENANT_REPO_NAME", "catalyst-tenant")
// Wave 8 runtime env — per-Sandbox pty-server / MCP / NEWAPI for
// the rendered Pod manifests.
ptyServerImage := mustEnv("SANDBOX_PTY_SERVER_IMAGE", log)
mcpImage := mustEnv("SANDBOX_MCP_IMAGE", log)
sandboxNewapiURL := mustEnv("SANDBOX_NEWAPI_URL", log)
llmGatewayTokenSecret := envOr("SANDBOX_LLM_GATEWAY_TOKEN_SECRET", "sandbox-tokens")
byosSecretPrefix := envOr("SANDBOX_BYOS_SECRET_PREFIX", "sandbox-byos-claude-code")
// Falls back to 30 when the variable is unset, not an integer, or not
// positive (see envOrInt). The same value is given to both the
// reconciler and the idle scaler below.
idleTimeoutMinutes := envOrInt("SANDBOX_IDLE_TIMEOUT_MINUTES", 30)
// Wave 9 — NewAPI bridge wiring. Two env vars carry the bridge URL +
// admin bearer used by the controller to call POST
// /admin/tokens/sandbox (catalyst-api bridge handler, PR #1638).
// Both are REQUIRED in production — a sandbox-controller without
// the bridge wired silently ships Sandboxes without an LLM
// connection. Unset values are nevertheless permitted, for
// compatibility with smoke tests that exercise only the gitops path:
// unless BOTH are set the controller runs without the token-mint
// path and a log line announces it.
newapiBaseURL := strings.TrimSpace(os.Getenv("NEWAPI_BASE_URL"))
newapiAdmin := strings.TrimSpace(os.Getenv("NEWAPI_ADMIN_SECRET"))
// Comma-separated list; nil when the variable is unset or blank.
defaultChannels := splitAndTrim(envOr("NEWAPI_DEFAULT_CHANNELS", ""), ",")
// D31 active-hot-standby — Sovereign-level toggle + region pair the
// controller threads into every per-Sandbox MCP Pod. The MCP
// server's sandbox.db.provision handler reads these at call time
// and, when valid, materialises a primary + replica pair of
// Cluster.postgresql.cnpg.io resources instead of a single Cluster
// (DoD D31).
// Default-empty keeps every existing Sandbox on single-Cluster
// CNPG (zero regression). Bootstrap-kit slot 61 wires these from
// the per-Sovereign overlay's envsubst placeholders into the
// bp-sandbox HelmRelease values.
enableHotStandby := envOr("SOVEREIGN_ENABLE_HOT_STANDBY", "")
primaryRegion := envOr("SOVEREIGN_PRIMARY_REGION", "")
replicaRegion := envOr("SOVEREIGN_REPLICA_REGION", "")
// TBD-P4 B4 — canonical SANDBOX_* env wiring for the MCP plugin
// (products/sandbox/mcp-server/internal/tools/env.go). The URL,
// region, TLS and secret-key values below have defaults (the Gitea
// base URL defaults to CATALYST_GITEA_URL); the S3 credentials and
// Keycloak admin-token Secret NAMES default to empty. Per-Sovereign
// overlays may override any of them via bp-sandbox HR values. Empty
// leaves the MCP's per-tool guard to surface "not configured" at
// call time rather than crashing the controller at startup.
mcpGiteaBaseURL := envOr("SANDBOX_MCP_GITEA_BASE_URL", giteaURL)
mcpGiteaTokenSecretName := envOr("SANDBOX_MCP_GITEA_TOKEN_SECRET_NAME", "catalyst-gitea-token")
mcpGiteaTokenSecretKey := envOr("SANDBOX_MCP_GITEA_TOKEN_SECRET_KEY", "token")
mcpDomainAPIURL := envOr("SANDBOX_MCP_DOMAIN_API_URL", "http://domain.sme.svc.cluster.local:8086")
mcpMarketplaceAPIURL := envOr("SANDBOX_MCP_MARKETPLACE_API_URL", "http://marketplace-api.marketplace.svc.cluster.local:8082")
mcpStorageS3Endpoint := envOr("SANDBOX_MCP_STORAGE_S3_ENDPOINT", "http://seaweedfs.storage.svc.cluster.local:8333")
mcpStorageS3Region := envOr("SANDBOX_MCP_STORAGE_S3_REGION", "us-east-1")
mcpStorageS3UseTLS := envOr("SANDBOX_MCP_STORAGE_S3_USE_TLS", "false")
mcpStorageS3CredsSecret := envOr("SANDBOX_MCP_STORAGE_S3_CREDS_SECRET_NAME", "")
mcpStorageS3AccessKeyKey := envOr("SANDBOX_MCP_STORAGE_S3_ACCESS_KEY_KEY", "AWS_ACCESS_KEY_ID")
mcpStorageS3SecretKeyKey := envOr("SANDBOX_MCP_STORAGE_S3_SECRET_KEY_KEY", "AWS_SECRET_ACCESS_KEY")
mcpKeycloakAdminURL := envOr("SANDBOX_MCP_KEYCLOAK_ADMIN_URL", "http://keycloak.keycloak.svc.cluster.local:8080")
mcpKeycloakParentRealm := envOr("SANDBOX_MCP_KEYCLOAK_PARENT_REALM", "master")
mcpKeycloakAdminTokenSecret := envOr("SANDBOX_MCP_KEYCLOAK_ADMIN_TOKEN_SECRET_NAME", "")
mcpKeycloakAdminTokenSecretKey := envOr("SANDBOX_MCP_KEYCLOAK_ADMIN_TOKEN_SECRET_KEY", "token")
// Build the manager with the package-level scheme, the metrics and
// probe bind addresses from the flags, and a fixed leader-election ID.
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{BindAddress: metricsAddr},
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "sandbox-controller.sandbox.openova.io",
})
if err != nil {
log.Error(err, "manager init")
os.Exit(1)
}
// Both the liveness and the readiness endpoints use the trivial
// healthz.Ping checker.
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
log.Error(err, "healthz")
os.Exit(1)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
log.Error(err, "readyz")
os.Exit(1)
}
// newapiClient stays nil unless both the base URL and the admin
// secret are set; the log line records which of the two was present
// without printing either value.
var newapiClient newapi.Client
if newapiBaseURL != "" && newapiAdmin != "" {
c, err := newapi.New(newapiBaseURL, newapiAdmin, nil)
if err != nil {
log.Error(err, "newapi client init")
os.Exit(1)
}
newapiClient = c
} else {
log.Info("newapi bridge not wired — sandbox-controller running in gitops-only mode",
"newapi_base_url_set", newapiBaseURL != "",
"newapi_admin_secret_set", newapiAdmin != "",
)
}
// Hand every resolved setting to the reconciler.
r := &controller.Reconciler{
Client: mgr.GetClient(),
Log: log.WithName("reconciler"),
GiteaClient: gitea.New(giteaURL, giteaToken),
HostCluster: hostCluster,
SovereignFQDN: sovereignFQDN,
Branch: branch,
TenantRepoName: tenantRepo,
PtyServerImage: ptyServerImage,
MCPImage: mcpImage,
NewapiURL: sandboxNewapiURL,
LLMGatewayTokenSecret: llmGatewayTokenSecret,
BYOSSecretPrefix: byosSecretPrefix,
IdleTimeoutMinutes: idleTimeoutMinutes,
NewAPIClient: newapiClient,
DefaultChannels: defaultChannels,
EnableHotStandby: enableHotStandby,
PrimaryRegion: primaryRegion,
ReplicaRegion: replicaRegion,
// TBD-P4 B4 — canonical SANDBOX_* env-var wiring for MCP plugin.
GiteaBaseURL: mcpGiteaBaseURL,
GiteaTokenSecretName: mcpGiteaTokenSecretName,
GiteaTokenSecretKey: mcpGiteaTokenSecretKey,
DomainAPIURL: mcpDomainAPIURL,
MarketplaceAPIURL: mcpMarketplaceAPIURL,
StorageS3Endpoint: mcpStorageS3Endpoint,
StorageS3Region: mcpStorageS3Region,
StorageS3UseTLS: mcpStorageS3UseTLS,
StorageS3CredsSecretName: mcpStorageS3CredsSecret,
StorageS3AccessKeyKey: mcpStorageS3AccessKeyKey,
StorageS3SecretKeyKey: mcpStorageS3SecretKeyKey,
KeycloakAdminURL: mcpKeycloakAdminURL,
KeycloakParentRealm: mcpKeycloakParentRealm,
KeycloakAdminTokenSecret: mcpKeycloakAdminTokenSecret,
KeycloakAdminTokenSecretKey: mcpKeycloakAdminTokenSecretKey,
}
if err := r.SetupWithManager(mgr); err != nil {
log.Error(err, "setup reconciler")
os.Exit(1)
}
// Wave 10 (PR #1641 follow-up) — IdleScaler reads the
// `openova.io/sandbox-idle-timeout-minutes` annotation the
// renderer writes on every pty-server StatefulSet, polls each
// pty-server Service for live activity, and scales replicas to 0
// once the idle window has elapsed. Leader-elected so HA
// controller replicas don't race.
scaler := idlescaler.New(mgr.GetClient(),
log.WithName("idle-scaler"),
idlescaler.Options{
DefaultIdleTimeoutMinutes: idleTimeoutMinutes,
})
if err := mgr.Add(scaler); err != nil {
log.Error(err, "add idle-scaler to manager")
os.Exit(1)
}
// D35 consume-leg — subscribe to `catalyst.tenant.sandbox_requested`
// so the publish from tenant-service nudges the matching Sandbox CR
// into a fresh Reconcile within ~50ms. Same wiring shape as the
// organization-controller's NATS bridge. Best-effort: NATS_URL
// unset → log + continue (informer requeue fallback intact).
natsURL := strings.TrimSpace(os.Getenv("NATS_URL"))
sandboxNs := envOr("SANDBOX_NAMESPACE", "catalyst-system")
if natsURL != "" {
// The connection is made inside the runnable, i.e. only once the
// manager runs it, not during setup.
if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
sub, err := natsbus.Connect(natsURL)
if err != nil {
// Returning nil (not err) keeps the failure best-effort:
// the runnable reports no error to the manager.
log.Error(err, "natsbus: connect failed — D35 consume-leg disabled",
"nats_url", natsURL)
return nil
}
bridge := &controller.NATSBridge{
Client: mgr.GetClient(),
Log: log.WithName("natsbridge"),
Namespace: sandboxNs,
}
// A subscribe failure is only logged; the runnable still waits
// for ctx to end and then closes the connection.
if err := sub.Subscribe(ctx,
natsbus.SubjectTenantSandboxRequested,
"sandbox-controller-sandbox-requested",
bridge.HandleSandboxRequested,
natsbus.SubscribeOptions{},
); err != nil {
log.Error(err, "natsbus: subscribe tenant.sandbox_requested failed")
}
<-ctx.Done()
sub.Close()
return nil
})); err != nil {
log.Error(err, "natsbus: add runnable failed")
os.Exit(1)
}
log.Info("natsbus: D35 consume-leg wired",
"nats_url", natsURL,
"subjects", []string{natsbus.SubjectTenantSandboxRequested},
"sandbox_namespace", sandboxNs,
)
} else {
log.Info("natsbus: NATS_URL unset — D35 consume-leg disabled (informer-requeue fallback only)")
}
// Startup summary. The Gitea token and the NewAPI admin secret are
// not among the logged values; only whether the NewAPI client was
// built is reported.
log.Info("starting manager",
"host_cluster", hostCluster,
"sovereign_fqdn", sovereignFQDN,
"gitea_url", giteaURL,
"tenant_repo", tenantRepo,
"pty_server_image", ptyServerImage,
"mcp_image", mcpImage,
"newapi_url", sandboxNewapiURL,
"llm_gateway_token_secret", llmGatewayTokenSecret,
"byos_secret_prefix", byosSecretPrefix,
"idle_timeout_minutes", idleTimeoutMinutes,
"newapi_wired", newapiClient != nil,
"default_channels", defaultChannels,
)
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
log.Error(err, "manager start")
os.Exit(1)
}
}
// mustEnv returns the whitespace-trimmed value of the environment
// variable key. When the variable is unset or blank it reports the
// problem through log.Error (with the key attached as a key/value pair)
// and terminates the process with exit status 2.
//
// log only needs an Error(err, msg, kvs...) method, so any logger with
// that method can be passed.
func mustEnv(key string, log interface {
	Error(err error, msg string, kvs ...any)
},
) string {
	if value := strings.TrimSpace(os.Getenv(key)); value != "" {
		return value
	}
	log.Error(fmt.Errorf("missing env"), "required env var unset", "key", key)
	os.Exit(2)
	// Not reached: os.Exit does not return. Present to satisfy the compiler.
	return ""
}
// envOr returns the whitespace-trimmed value of the environment variable
// key, or fallback when the variable is unset or contains only
// whitespace.
func envOr(key, fallback string) string {
	if trimmed := strings.TrimSpace(os.Getenv(key)); trimmed != "" {
		return trimmed
	}
	return fallback
}
// envOrInt reads the environment variable key as a base-10 integer.
// It returns fallback when the variable is unset or blank, when it does
// not parse as an integer, or when the parsed value is zero or negative.
// Used for SANDBOX_IDLE_TIMEOUT_MINUTES — operator drift (a mistyped
// value) shouldn't crash the controller.
func envOrInt(key string, fallback int) int {
	raw := strings.TrimSpace(os.Getenv(key))
	// An empty string fails to parse, so the unset case needs no
	// separate branch.
	if parsed, err := strconv.Atoi(raw); err == nil && parsed > 0 {
		return parsed
	}
	return fallback
}
// splitAndTrim cuts s at every occurrence of sep, trims surrounding
// whitespace from each piece and drops the pieces that end up empty.
// For example "qwen,vllm , " with sep "," yields ["qwen","vllm"].
//
// A blank s (empty or whitespace only) yields nil. A non-blank s whose
// pieces are all empty (for example ",") yields an empty, non-nil slice;
// in both cases len() of the result is 0.
func splitAndTrim(s, sep string) []string {
	if strings.TrimSpace(s) == "" {
		return nil
	}
	pieces := strings.Split(s, sep)
	kept := make([]string, 0, len(pieces))
	for i := range pieces {
		if piece := strings.TrimSpace(pieces[i]); piece != "" {
			kept = append(kept, piece)
		}
	}
	return kept
}