From 2d0397e8c219069ea0ce532af6a88d9ee1842996 Mon Sep 17 00:00:00 2001 From: Alex Toker Date: Wed, 25 Feb 2026 11:44:14 +0000 Subject: [PATCH] [Spark] Support spark-operator on multi-namespace deployments Co-Authored-By: Claude Opus 4.6 --- charts/mlrun-ce/Chart.yaml | 2 +- .../mlrun-ce/admin_installation_values.yaml | 20 + ..._admin_cluster_ip_installation_values.yaml | 24 +- .../non_admin_installation_values.yaml | 24 +- .../templates/config/mlrun-spark-config.yaml | 2 +- .../spark-operator/spark-controller-rbac.yaml | 68 ++ charts/mlrun-ce/values.yaml | 8 + tests/kind-test.sh | 623 +++++++++++++++++- 8 files changed, 749 insertions(+), 22 deletions(-) create mode 100644 charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index eb248c10..6fd796cd 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.12 +version: 0.11.0-rc.13 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png diff --git a/charts/mlrun-ce/admin_installation_values.yaml b/charts/mlrun-ce/admin_installation_values.yaml index c9b2bf23..56bca94f 100644 --- a/charts/mlrun-ce/admin_installation_values.yaml +++ b/charts/mlrun-ce/admin_installation_values.yaml @@ -40,6 +40,26 @@ seaweedfs: enabled: false spark-operator: + enabled: true + fullnameOverride: spark-operator + controller: + replicas: 0 # No running pods in admin + rbac: + create: true # Creates ClusterRole (shared by all user namespaces) + serviceAccount: + create: true + webhook: + enable: true + replicas: 1 + spark: + jobNamespaces: + - "" # All namespaces (no namespaceSelector on webhook) + serviceAccount: + create: false # No sparkapp SA in admin + rbac: + create: false + +spark: enabled: false pipelines: diff --git a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml index a98463ad..3a487343 100644 --- a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml +++ b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml @@ -44,7 +44,29 @@ timescaledb: nodePort: "" spark-operator: - enabled: false + enabled: true + fullnameOverride: spark-operator + controller: + replicas: 1 + rbac: + create: false + serviceAccount: + create: true + leaderElection: + enable: true + webhook: + enable: false + spark: + jobNamespaces: + - mlrun + serviceAccount: + create: true + name: sparkapp + rbac: + create: true + +spark: + enabled: true pipelines: service: diff --git a/charts/mlrun-ce/non_admin_installation_values.yaml b/charts/mlrun-ce/non_admin_installation_values.yaml index d84f02ee..6f618721 100644 --- a/charts/mlrun-ce/non_admin_installation_values.yaml +++ b/charts/mlrun-ce/non_admin_installation_values.yaml @@ -38,7 +38,29 @@ seaweedfs: enabled: true spark-operator: - enabled: false + enabled: true + fullnameOverride: spark-operator + controller: + replicas: 1 # Controller runs in user namespace + rbac: + create: false # ClusterRole already exists from admin + serviceAccount: + create: true + leaderElection: + enable: true + webhook: + enable: false + spark: + jobNamespaces: + - mlrun # Override with actual namespace at install time + serviceAccount: + create: true + name: sparkapp + rbac: + create: true # Creates sparkapp Role + RoleBinding + +spark: + enabled: true pipelines: service: diff --git a/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml b/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml index 02054f18..000eda4b 100644 --- a/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml @@ -1,4 +1,4 @@ -{{- if index .Values "spark-operator" "enabled" -}} +{{- if .Values.spark.enabled -}} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml b/charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml new file mode 100644 index 00000000..d9f265e1 --- /dev/null +++ b/charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml @@ -0,0 +1,68 @@ +{{- $sparkOp := index .Values "spark-operator" -}} +{{- $rbacCreate := true -}} +{{- if hasKey $sparkOp "controller" -}} + {{- if hasKey $sparkOp.controller "rbac" -}} + {{- $rbacCreate = $sparkOp.controller.rbac.create -}} + {{- end -}} +{{- end -}} +{{- if and $sparkOp.enabled (not $rbacCreate) -}} +{{- /* + This template renders only in user multi-NS mode: + - spark-operator subchart is enabled (controller Deployment runs here) + - controller.rbac.create is false (ClusterRole already exists from admin namespace) + + It creates: + 1. RoleBinding: controller SA → shared ClusterRole (namespace-scoped access) + 2. Role + RoleBinding: leader election leases (coordination.k8s.io) +*/ -}} +--- +# RoleBinding: Grant controller SA access to the shared ClusterRole (namespace-scoped) +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-operator-controller + labels: + app.kubernetes.io/name: mlrun-ce + app.kubernetes.io/component: spark-controller-rbac + app.kubernetes.io/managed-by: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: spark-operator-controller + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: spark-operator-controller + apiGroup: rbac.authorization.k8s.io +--- +# Role: Leader election leases +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: spark-operator-controller-leases + labels: + app.kubernetes.io/name: mlrun-ce + app.kubernetes.io/component: spark-controller-rbac + app.kubernetes.io/managed-by: {{ .Release.Name }} +rules: + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create", "get", "update"] +--- +# RoleBinding: Grant controller SA access to leader election leases +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-operator-controller-leases + labels: + app.kubernetes.io/name: mlrun-ce + app.kubernetes.io/component: spark-controller-rbac + app.kubernetes.io/managed-by: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: spark-operator-controller + namespace: {{ .Release.Namespace }} +roleRef: + kind: Role + name: spark-operator-controller-leases + apiGroup: rbac.authorization.k8s.io +{{- end }} diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index bed3e5b0..5221c6c8 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -645,3 +645,11 @@ kafka: # Empty means "use the release namespace" # Example: "controller" if that's where you installed the operator operatorNamespace: "" + +# Spark configuration for multi-NS deployments +# Controls CE-level spark resources (mlrun-spark-config ConfigMap) +# In single-NS mode, both spark.enabled and spark-operator.enabled are true +# In multi-NS admin mode, spark.enabled is false (no ConfigMap needed) +# In multi-NS user mode, spark.enabled is true (ConfigMap needed for MLRun) +spark: + enabled: true diff --git a/tests/kind-test.sh b/tests/kind-test.sh index c99a182e..756f2300 100755 --- a/tests/kind-test.sh +++ b/tests/kind-test.sh @@ -23,6 +23,9 @@ set -o pipefail CLUSTER_NAME="${CLUSTER_NAME:-mlrun-ce-test}" NAMESPACE="${NAMESPACE:-mlrun}" RELEASE_NAME="${RELEASE_NAME:-mlrun}" +ADMIN_NAMESPACE="${ADMIN_NAMESPACE:-mlrun-admin}" +USER_NAMESPACE_1="${USER_NAMESPACE_1:-mlrun-user1}" +USER_NAMESPACE_2="${USER_NAMESPACE_2:-mlrun-user2}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CHART_DIR="${SCRIPT_DIR}/../charts/mlrun-ce" @@ -35,6 +38,75 @@ NC='\033[0m' # No Color log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } log_error() { echo -e "${RED}[ERROR]${NC} $1"; } +fail() { log_error "$1"; exit 1; } + +# Verify webhook mutation on a driver pod. Expects SparkApplication "spark-test" to exist. +# Args: $1 = namespace +# Returns: number of errors found +verify_spark_webhook_mutation() { + local ns="$1" + local errors=0 + + # Wait for driver pod to be created (up to 60s) + log_info "Waiting for driver pod spark-test-driver in ${ns}..." + local attempt + for attempt in $(seq 1 12); do + if kubectl get pod spark-test-driver -n "${ns}" &>/dev/null; then + log_info "Driver pod created" + break + fi + sleep 5 + done + if ! kubectl get pod spark-test-driver -n "${ns}" &>/dev/null; then + log_error "Driver pod spark-test-driver not created within 60s — skipping mutation checks" + return 1 + fi + + # Check 1: Owner reference points to SparkApplication + local owner_kind owner_name + owner_kind=$(kubectl get pod spark-test-driver -n "${ns}" \ + -o jsonpath='{.metadata.ownerReferences[0].kind}' 2>/dev/null || echo "") + owner_name=$(kubectl get pod spark-test-driver -n "${ns}" \ + -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || echo "") + if [[ "${owner_kind}" == "SparkApplication" && "${owner_name}" == "spark-test" ]]; then + log_info "Driver pod owner reference: ${owner_kind}/${owner_name} (correct)" + else + log_error "Driver pod owner reference: ${owner_kind}/${owner_name}, expected SparkApplication/spark-test" + errors=$((errors + 1)) + fi + + # Check 2: Webhook-injected labels exist + local app_name spark_role + app_name=$(kubectl get pod spark-test-driver -n "${ns}" \ + -o jsonpath='{.metadata.labels.sparkoperator\.k8s\.io/app-name}' 2>/dev/null || echo "") + spark_role=$(kubectl get pod spark-test-driver -n "${ns}" \ + -o jsonpath='{.metadata.labels.spark-role}' 2>/dev/null || echo "") + if [[ "${app_name}" == "spark-test" ]]; then + log_info "Webhook label sparkoperator.k8s.io/app-name=${app_name} (correct)" + else + log_error "Webhook label sparkoperator.k8s.io/app-name missing or wrong: '${app_name}'" + errors=$((errors + 1)) + fi + if [[ "${spark_role}" == "driver" ]]; then + log_info "Webhook label spark-role=${spark_role} (correct)" + else + log_error "Webhook label spark-role missing or wrong: '${spark_role}'" + errors=$((errors + 1)) + fi + + # Check 3: Correct service account + local pod_sa + pod_sa=$(kubectl get pod spark-test-driver -n "${ns}" \ + -o jsonpath='{.spec.serviceAccountName}' 2>/dev/null || echo "") + if [[ "${pod_sa}" == "sparkapp" ]]; then + log_info "Driver pod service account: ${pod_sa} (correct)" + else + log_error "Driver pod service account: '${pod_sa}', expected 'sparkapp'" + errors=$((errors + 1)) + fi + + return "${errors}" +} cleanup() { if [[ "${CLEANUP_ON_EXIT:-false}" == "true" ]]; then @@ -89,7 +161,6 @@ setup_helm_repos() { helm repo add nuclio https://nuclio.github.io/nuclio/charts 2>/dev/null || true helm repo add mlrun https://v3io.github.io/helm-charts/stable 2>/dev/null || true helm repo add mpi-operator https://v3io.github.io/helm-charts/stable 2>/dev/null || true - helm repo add minio https://charts.min.io/ 2>/dev/null || true helm repo add spark-operator https://kubeflow.github.io/spark-operator 2>/dev/null || true helm repo add kube-prometheus-stack https://prometheus-community.github.io/helm-charts 2>/dev/null || true helm repo add kafka https://charts.bitnami.com/bitnami 2>/dev/null || true @@ -139,17 +210,6 @@ nuclio: memory: "256Mi" cpu: "200m" -minio: - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "512Mi" - cpu: "500m" - replicas: 1 - mode: standalone - jupyter: resources: requests: @@ -166,7 +226,15 @@ kube-prometheus-stack: enabled: false spark-operator: - enabled: false + enabled: true + controller: + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" mpi-operator: enabled: false @@ -189,7 +257,7 @@ EOF --create-namespace \ --namespace "${NAMESPACE}" \ --values "${values_file}" \ - --timeout 10m \ + --timeout 20m \ --wait \ --debug } @@ -220,6 +288,512 @@ verify_installation() { else log_warn "TimescaleDB pod not found" fi + + # Verify spark-operator + echo "" + log_info "Verifying spark-operator..." + + # Controller pod should be running + kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=spark-operator \ + -n "${NAMESPACE}" --timeout=120s \ + && log_info "spark-operator controller pod is Ready" \ + || log_warn "spark-operator controller pod not ready" + + # Webhook pod should be running + kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/component=webhook \ + -n "${NAMESPACE}" --timeout=120s \ + && log_info "spark-operator webhook pod is Ready" \ + || log_warn "spark-operator webhook pod not ready" + + # CRDs should exist + kubectl get crd sparkapplications.sparkoperator.k8s.io > /dev/null 2>&1 \ + && log_info "SparkApplication CRD exists" \ + || log_warn "SparkApplication CRD not found" + + # sparkapp ServiceAccount should exist + kubectl get sa sparkapp -n "${NAMESPACE}" > /dev/null 2>&1 \ + && log_info "sparkapp ServiceAccount exists" \ + || log_warn "sparkapp ServiceAccount not found" + + # mlrun-spark-config ConfigMap should exist + kubectl get configmap mlrun-spark-config -n "${NAMESPACE}" > /dev/null 2>&1 \ + && log_info "mlrun-spark-config ConfigMap exists" \ + || log_warn "mlrun-spark-config ConfigMap not found" + + # Functional check: submit a SparkApplication and verify controller processes it + log_info "Submitting test SparkApplication..." + kubectl apply -n "${NAMESPACE}" -f - <<'SPARK_EOF' +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +metadata: + name: spark-test +spec: + type: Scala + mode: cluster + image: spark:3.5.0 + mainClass: org.apache.spark.examples.SparkPi + mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.0.jar + sparkVersion: "3.5.0" + driver: + serviceAccount: sparkapp + cores: 1 + memory: "512m" + executor: + cores: 1 + instances: 1 + memory: "512m" +SPARK_EOF + + local attempt status driver_pod + for attempt in $(seq 1 12); do + status=$(kubectl get sparkapplication spark-test -n "${NAMESPACE}" \ + -o jsonpath='{.status.applicationState.state}' 2>/dev/null || echo "") + driver_pod=$(kubectl get pod spark-test-driver -n "${NAMESPACE}" \ + -o jsonpath='{.metadata.name}' 2>/dev/null || echo "") + if [[ -n "${status}" || -n "${driver_pod}" ]]; then + log_info "SparkApplication picked up by controller (status=${status:-pending}, driver=${driver_pod:-not yet})" + break + fi + log_info "Waiting for controller to process SparkApplication... (attempt ${attempt}/12)" + sleep 5 + done + + if [[ -z "${status}" && -z "${driver_pod}" ]]; then + log_warn "SparkApplication not processed within 60s — controller may not be working" + fi + + # Verify webhook mutation on the driver pod + local mutation_errors=0 + verify_spark_webhook_mutation "${NAMESPACE}" || mutation_errors=$? + if [[ "${mutation_errors}" -gt 0 ]]; then + log_warn "Webhook mutation checks: ${mutation_errors} issue(s)" + fi + + kubectl delete sparkapplication spark-test -n "${NAMESPACE}" --ignore-not-found > /dev/null 2>&1 +} + +# --- Multi-NS test functions --- + +install_admin_chart() { + log_info "Installing admin release in namespace '${ADMIN_NAMESPACE}'..." + + local values_file + values_file=$(mktemp) + trap "rm -f '${values_file}'" RETURN + + cat > "${values_file}" < "${values_file}" </dev/null; then + log_info "SparkApplication CRD exists" + else + log_error "SparkApplication CRD not found" + errors=$((errors + 1)) + fi + + if kubectl get crd scheduledsparkapplications.sparkoperator.k8s.io &>/dev/null; then + log_info "ScheduledSparkApplication CRD exists" + else + log_error "ScheduledSparkApplication CRD not found" + errors=$((errors + 1)) + fi + + # ClusterRole exists + if kubectl get clusterrole spark-operator-controller &>/dev/null; then + log_info "ClusterRole spark-operator-controller exists" + else + log_error "ClusterRole spark-operator-controller not found" + errors=$((errors + 1)) + fi + + # No controller pods in admin (controller.replicas=0) + local controller_pod_count + controller_pod_count=$(kubectl get pods -n "${ADMIN_NAMESPACE}" -l app.kubernetes.io/component=controller -o name 2>/dev/null | wc -l) + if [[ "${controller_pod_count}" -eq 0 ]]; then + log_info "No spark-operator controller pods in admin namespace (expected)" + else + log_error "Found ${controller_pod_count} controller pods in admin namespace (expected 0)" + errors=$((errors + 1)) + fi + + # Webhook pod running in admin (webhook.replicas=1) + if kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/component=webhook \ + -n "${ADMIN_NAMESPACE}" --timeout=120s &>/dev/null; then + log_info "Webhook pod is Ready in admin namespace" + else + log_error "Webhook pod not ready in admin namespace" + errors=$((errors + 1)) + fi + + # MutatingWebhookConfiguration exists + if kubectl get mutatingwebhookconfiguration spark-operator-webhook &>/dev/null; then + log_info "MutatingWebhookConfiguration exists" + else + log_error "MutatingWebhookConfiguration not found" + errors=$((errors + 1)) + fi + + # No sparkapp SA in admin + if ! kubectl get sa sparkapp -n "${ADMIN_NAMESPACE}" &>/dev/null; then + log_info "No sparkapp SA in admin namespace (expected)" + else + log_error "sparkapp SA should not exist in admin namespace" + errors=$((errors + 1)) + fi + + # mlrun-spark-config NOT in admin + if ! kubectl get configmap mlrun-spark-config -n "${ADMIN_NAMESPACE}" &>/dev/null; then + log_info "mlrun-spark-config ConfigMap absent from admin namespace (expected)" + else + log_error "mlrun-spark-config ConfigMap should not exist in admin namespace" + errors=$((errors + 1)) + fi + + return "${errors}" +} + +# Verify a single user namespace. Args: $1 = namespace name +verify_user_ns() { + local user_ns="$1" + log_info "=== User namespace (${user_ns}) ===" + local errors=0 + + # spark-operator-controller pod is Running + if kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=spark-operator \ + -n "${user_ns}" --timeout=120s &>/dev/null; then + log_info "spark-operator-controller pod is Ready" + else + log_error "spark-operator-controller pod not ready" + kubectl get pods -n "${user_ns}" -l app.kubernetes.io/name=spark-operator + errors=$((errors + 1)) + fi + + # sparkapp SA exists + if kubectl get sa sparkapp -n "${user_ns}" &>/dev/null; then + log_info "sparkapp ServiceAccount exists" + else + log_error "sparkapp ServiceAccount not found" + errors=$((errors + 1)) + fi + + # CE-created RoleBinding → ClusterRole exists and is correct + if kubectl get rolebinding spark-operator-controller -n "${user_ns}" &>/dev/null; then + local rb_kind rb_name + rb_kind=$(kubectl get rolebinding spark-operator-controller -n "${user_ns}" -o jsonpath='{.roleRef.kind}') + rb_name=$(kubectl get rolebinding spark-operator-controller -n "${user_ns}" -o jsonpath='{.roleRef.name}') + if [[ "${rb_kind}" == "ClusterRole" && "${rb_name}" == "spark-operator-controller" ]]; then + log_info "RoleBinding spark-operator-controller -> ClusterRole spark-operator-controller (correct)" + else + log_error "RoleBinding references ${rb_kind}/${rb_name}, expected ClusterRole/spark-operator-controller" + errors=$((errors + 1)) + fi + else + log_error "RoleBinding spark-operator-controller not found" + errors=$((errors + 1)) + fi + + # Leader election Role + RoleBinding exist + if kubectl get role spark-operator-controller-leases -n "${user_ns}" &>/dev/null; then + log_info "Leader election Role exists" + else + log_error "Leader election Role spark-operator-controller-leases not found" + errors=$((errors + 1)) + fi + + if kubectl get rolebinding spark-operator-controller-leases -n "${user_ns}" &>/dev/null; then + log_info "Leader election RoleBinding exists" + else + log_error "Leader election RoleBinding spark-operator-controller-leases not found" + errors=$((errors + 1)) + fi + + # mlrun-spark-config ConfigMap exists + if kubectl get configmap mlrun-spark-config -n "${user_ns}" &>/dev/null; then + log_info "mlrun-spark-config ConfigMap exists" + else + log_error "mlrun-spark-config ConfigMap not found" + errors=$((errors + 1)) + fi + + # Functional check: submit a SparkApplication + log_info "Submitting SparkApplication in ${user_ns}..." + kubectl apply -n "${user_ns}" -f - <<'SPARK_EOF' +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +metadata: + name: spark-test +spec: + type: Scala + mode: cluster + image: spark:3.5.0 + mainClass: org.apache.spark.examples.SparkPi + mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.0.jar + sparkVersion: "3.5.0" + driver: + serviceAccount: sparkapp + cores: 1 + memory: "512m" + executor: + cores: 1 + instances: 1 + memory: "512m" +SPARK_EOF + + log_info "Waiting for controller to process SparkApplication..." + local status="" + local driver_pod="" + local attempt + for attempt in $(seq 1 12); do + sleep 5 + status=$(kubectl get sparkapplication spark-test -n "${user_ns}" -o jsonpath='{.status.applicationState.state}' 2>/dev/null || echo "") + driver_pod=$(kubectl get pod spark-test-driver -n "${user_ns}" -o name 2>/dev/null || echo "") + if [[ -n "${status}" ]]; then + log_info "SparkApplication status: ${status} (controller is processing)" + break + elif [[ -n "${driver_pod}" ]]; then + log_info "Driver pod created (controller is processing, status not yet set)" + break + fi + log_info "Attempt ${attempt}/12: waiting for controller to set status or create driver pod..." + done + if [[ -z "${status}" && -z "${driver_pod}" ]]; then + log_error "SparkApplication not processed after 60s — controller may not be working" + errors=$((errors + 1)) + fi + + # Verify webhook mutation on the driver pod + local mutation_errors=0 + verify_spark_webhook_mutation "${user_ns}" || mutation_errors=$? + errors=$((errors + mutation_errors)) + + # Cleanup + kubectl delete sparkapplication spark-test -n "${user_ns}" --ignore-not-found &>/dev/null + + return "${errors}" +} + +verify_multi_ns() { + log_info "Verifying multi-NS spark-operator split (1 admin + 2 user namespaces)..." + local total_errors=0 + local ns_errors=0 + + echo "" + ns_errors=0 + verify_admin_ns || ns_errors=$? + total_errors=$((total_errors + ns_errors)) + + echo "" + ns_errors=0 + verify_user_ns "${USER_NAMESPACE_1}" || ns_errors=$? + total_errors=$((total_errors + ns_errors)) + + echo "" + ns_errors=0 + verify_user_ns "${USER_NAMESPACE_2}" || ns_errors=$? + total_errors=$((total_errors + ns_errors)) + + # --- Summary --- + echo "" + if [[ "${total_errors}" -eq 0 ]]; then + log_info "All multi-NS checks passed! (admin + 2 user namespaces, no conflicts)" + else + log_error "${total_errors} check(s) failed" + exit 1 + fi } delete_cluster() { @@ -235,15 +809,19 @@ Commands: create Create Kind cluster only install Install MLRun CE chart (assumes cluster exists) full Create cluster and install chart (default) + multi-ns Multi-NS test: admin + 2 user namespaces with spark-operator split verify Verify installation delete Delete Kind cluster help Show this help message Environment variables: - CLUSTER_NAME Kind cluster name (default: mlrun-ce-test) - NAMESPACE Kubernetes namespace (default: mlrun) - RELEASE_NAME Helm release name (default: mlrun) - CLEANUP_ON_EXIT Delete cluster on script exit (default: false) + CLUSTER_NAME Kind cluster name (default: mlrun-ce-test) + NAMESPACE Kubernetes namespace (default: mlrun) + RELEASE_NAME Helm release name (default: mlrun) + ADMIN_NAMESPACE Admin namespace for multi-ns (default: mlrun-admin) + USER_NAMESPACE_1 First user namespace for multi-ns (default: mlrun-user1) + USER_NAMESPACE_2 Second user namespace for multi-ns (default: mlrun-user2) + CLEANUP_ON_EXIT Delete cluster on script exit (default: false) Examples: $0 full # Full test: create cluster + install @@ -275,6 +853,15 @@ main() { install_chart verify_installation ;; + multi-ns) + create_kind_cluster + setup_helm_repos + build_dependencies + install_admin_chart + install_user_chart "${USER_NAMESPACE_1}" "mlrun-user1" + install_user_chart "${USER_NAMESPACE_2}" "mlrun-user2" + verify_multi_ns + ;; verify) verify_installation ;;