Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions config/rbac/daemonsets_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Namespaced Role granting the GPU Operator full lifecycle control over
# DaemonSets. Scoping this to a Role (rather than a ClusterRole) restricts
# DaemonSet create/delete/update to the operator's own namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: gpu-operator-daemonsets-role
rules:
- apiGroups:
  - apps
  resources:
  - daemonsets
  verbs:
  - create
  - delete
  - get
  - list
  - patch
  - update
  - watch
12 changes: 12 additions & 0 deletions config/rbac/daemonsets_role_binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the namespaced DaemonSet Role to the operator's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: gpu-operator-daemonsets-rolebinding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: gpu-operator-daemonsets-role
subjects:
- kind: ServiceAccount
  name: gpu-operator
  # "system" is the kustomize placeholder; it is rewritten to the actual
  # operator namespace at build time.
  namespace: system
2 changes: 2 additions & 0 deletions config/rbac/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ resources:
- service_account.yaml
- role.yaml
- role_binding.yaml
- daemonsets_role.yaml
- daemonsets_role_binding.yaml
- leader_election_role.yaml
- leader_election_role_binding.yaml
# Comment the following 4 lines if you want to disable
Expand Down
25 changes: 21 additions & 4 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ rules:
- configmaps
- endpoints
- events
- namespaces
- nodes
- persistentvolumeclaims
- pods
- pods/eviction
Expand All @@ -27,6 +25,23 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- patch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- patch
- update
- watch
- apiGroups:
- apiextensions.k8s.io
resources:
Expand All @@ -39,14 +54,14 @@ rules:
- apps
resources:
- controllerrevisions
- daemonsets
verbs:
- get
- list
- watch
- apiGroups:
- apps
resources:
- daemonsets
- deployments
- replicasets
- statefulsets
Expand Down Expand Up @@ -163,7 +178,9 @@ rules:
- rolebindings
- roles
verbs:
- '*'
- create
- delete
- update
- apiGroups:
- route.openshift.io
resources:
Expand Down
11 changes: 7 additions & 4 deletions controllers/clusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,14 @@ type ClusterPolicyReconciler struct {
// +kubebuilder:rbac:groups=config.openshift.io,resources=clusterversions;proxies,verbs=get;list;watch
// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=privileged
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles;clusterrolebindings;roles;rolebindings,verbs=*
// +kubebuilder:rbac:groups="",resources=namespaces;serviceaccounts;pods;pods/eviction;services;services/finalizers;endpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims;events;configmaps;secrets;nodes,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles;clusterrolebindings;roles;rolebindings,verbs=create;update;delete
// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;patch
// +kubebuilder:rbac:groups="",resources=serviceaccounts;pods;pods/eviction;services;services/finalizers;endpoints,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=persistentvolumeclaims;events;configmaps;secrets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
// +kubebuilder:rbac:groups=apps,resources=deployments;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=controllerrevisions,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=scheduling.k8s.io,resources=priorityclasses,verbs=get;list;watch;create
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch
Expand Down
17 changes: 10 additions & 7 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -4065,13 +4065,14 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
}

func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error {
// Get all DaemonSets owned by ClusterPolicy
//
// (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector
// is allowed when specifying ListOptions or DeleteOptions.
// See GH issue: https://github.com/kubernetes-sigs/controller-runtime/issues/612
// Get all DaemonSets owned by ClusterPolicy in operator namespace
list := &appsv1.DaemonSetList{}
err := n.client.List(ctx, list, client.MatchingFields{clusterPolicyControllerIndexKey: n.singleton.Name})
err := n.client.List(
ctx,
list,
client.MatchingFields{clusterPolicyControllerIndexKey: n.singleton.Name},
client.InNamespace(n.operatorNamespace),
)
if err != nil {
return fmt.Errorf("failed to list all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
}
Expand Down Expand Up @@ -4099,6 +4100,7 @@ func (n ClusterPolicyController) cleanupStalePrecompiledDaemonsets(ctx context.C
client.MatchingLabels{
precompiledIdentificationLabelKey: precompiledIdentificationLabelValue,
},
client.InNamespace(n.operatorNamespace),
}
list := &appsv1.DaemonSetList{}
err := n.client.List(ctx, list, opts...)
Expand Down Expand Up @@ -4243,6 +4245,7 @@ func (n ClusterPolicyController) ocpCleanupStaleDriverToolkitDaemonSets(ctx cont
client.MatchingLabels{
ocpDriverToolkitIdentificationLabel: ocpDriverToolkitIdentificationValue,
},
client.InNamespace(n.operatorNamespace),
}

list := &appsv1.DaemonSetList{}
Expand Down Expand Up @@ -4426,7 +4429,7 @@ func (n ClusterPolicyController) cleanupUnusedDriverDaemonSets(ctx context.Conte
// pairs If no error happens, returns the number of Pods belonging to
// the DaemonSet.
func (n ClusterPolicyController) cleanupDriverDaemonsets(ctx context.Context, searchKey string, searchValue string, namePrefix string) (int, error) {
var opts = []client.ListOption{client.MatchingLabels{searchKey: searchValue}}
var opts = []client.ListOption{client.MatchingLabels{searchKey: searchValue}, client.InNamespace(n.operatorNamespace)}

dsList := &appsv1.DaemonSetList{}
if err := n.client.List(ctx, dsList, opts...); err != nil {
Expand Down
2 changes: 1 addition & 1 deletion controllers/upgrade_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ const (
// +kubebuilder:rbac:groups=mellanox.com,resources=*,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
// +kubebuilder:rbac:groups="",resources=pods,verbs=list
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=deployments;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=apps,resources=deployments/finalizers,verbs=update

// Reconcile is part of the main kubernetes reconciliation loop which aims to
Expand Down
7 changes: 0 additions & 7 deletions deployments/gpu-operator/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,7 @@ rules:
- clusterrolebindings
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- apiGroups:
- ""
Expand All @@ -65,9 +61,6 @@ rules:
- namespaces
verbs:
- get
- list
- watch
- update
- patch
- apiGroups:
- ""
Expand Down
17 changes: 12 additions & 5 deletions deployments/gpu-operator/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,7 @@ rules:
- rolebindings
verbs:
- create
- get
- list
- watch
- update
- patch
- delete
- apiGroups:
- apps
Expand Down Expand Up @@ -50,7 +46,6 @@ rules:
- secrets
- services
- services/finalizers
- serviceaccounts
verbs:
- create
- get
Expand All @@ -59,6 +54,18 @@ rules:
- update
- patch
- delete
- apiGroups:
- ""
resources:
- serviceaccounts
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- coordination.k8s.io
resources:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
{{- if .Capabilities.APIVersions.Has "admissionregistration.k8s.io/v1/ValidatingAdmissionPolicy" }}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be further restricted so the policies are installed only when ValidatingAdmissionPolicy guardrails are requested in values.yaml. I haven't added that yet; I'm looking for feedback on whether we should gate it that way or always install these when the Kubernetes cluster supports them.

# Guardrail: when the gpu-operator service account creates or updates a
# RuntimeClass, its handler must be one of the runtime classes configured by
# this chart (nvidia, nvidia-cdi, nvidia-legacy, the configured
# operator.runtimeClass, plus any kata-manager runtime class names).
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
  name: gpu-operator-runtimeclass-handler-policy
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
spec:
  failurePolicy: Fail
  matchConstraints:
    resourceRules:
    - apiGroups:
      - node.k8s.io
      apiVersions:
      - v1
      operations:
      - CREATE
      - UPDATE
      resources:
      - runtimeclasses
  matchConditions:
    # Only evaluate requests issued by the operator's own service account;
    # other users are unaffected by this policy.
    - name: gpu-operator-service-account
      expression: request.userInfo.username == 'system:serviceaccount:{{ .Release.Namespace }}:gpu-operator'
  validations:
    # The allowed-handler list is rendered at template time from chart values.
    - expression: >-
        object.handler in ['nvidia', 'nvidia-cdi', 'nvidia-legacy', '{{ .Values.operator.runtimeClass }}'{{- range $runtimeClass := (default (list) .Values.kataManager.config.runtimeClasses) }}{{- if $runtimeClass.name }}, '{{ $runtimeClass.name }}'{{- end }}{{- end }}]
      message: runtimeclass handler must be one of the allowed runtime classes configured by the chart
---
# Activates the runtimeclass-handler policy; violating requests are denied.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
  name: gpu-operator-runtimeclass-handler-policy-binding
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
spec:
  policyName: gpu-operator-runtimeclass-handler-policy
  validationActions:
  - Deny
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
name: gpu-operator-namespace-label-policy
labels:
{{- include "gpu-operator.labels" . | nindent 4 }}
app.kubernetes.io/component: "gpu-operator"
spec:
failurePolicy: Fail
matchConstraints:
resourceRules:
- apiGroups:
- ""
apiVersions:
- v1
operations:
- UPDATE
resources:
- namespaces
matchConditions:
- name: gpu-operator-service-account
expression: request.userInfo.username == 'system:serviceaccount:{{ .Release.Namespace }}:gpu-operator'
- name: target-namespace
expression: object.metadata.name in ['{{ .Release.Namespace }}', 'nvidia-gpu-operator']
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need input here: are these the only namespaces we label, or can there be others as well in the case of OpenShift?

validations:
- expression: >-
(!has(oldObject.metadata.labels) ||
oldObject.metadata.labels.all(k, v,
(has(object.metadata.labels) && k in object.metadata.labels && object.metadata.labels[k] == v) ||
k.startsWith('pod-security.kubernetes.io/') ||
k == 'openshift.io/cluster-monitoring'
)) &&
(!has(object.metadata.labels) ||
object.metadata.labels.all(k, v,
(has(oldObject.metadata.labels) && k in oldObject.metadata.labels && oldObject.metadata.labels[k] == v) ||
k.startsWith('pod-security.kubernetes.io/') ||
k == 'openshift.io/cluster-monitoring'
))
message: only pod-security.kubernetes.io/* and openshift.io/cluster-monitoring labels may be added or modified by gpu-operator
---
# Activates the namespace-label policy; violating requests are denied.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
  name: gpu-operator-namespace-label-policy-binding
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
spec:
  policyName: gpu-operator-namespace-label-policy
  validationActions:
  - Deny
---
# Guardrail: when the gpu-operator service account updates a Node, only labels
# under the nvidia.com/ prefix may be added, modified, or removed; any other
# label change is rejected.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
  name: gpu-operator-node-label-policy
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
spec:
  failurePolicy: Fail
  matchConstraints:
    resourceRules:
    - apiGroups:
      - ""
      apiVersions:
      - v1
      operations:
      - UPDATE
      resources:
      - nodes
  matchConditions:
    # Only evaluate requests issued by the operator's own service account.
    - name: gpu-operator-service-account
      expression: request.userInfo.username == 'system:serviceaccount:{{ .Release.Namespace }}:gpu-operator'
  validations:
    # Checked in both directions: every pre-existing label must survive with
    # the same value unless it is under nvidia.com/, and every label on the
    # updated object must have existed before with the same value unless it
    # is under nvidia.com/. Together this permits only nvidia.com/* changes.
    - expression: >-
        (!has(oldObject.metadata.labels) ||
        oldObject.metadata.labels.all(k, v,
        (has(object.metadata.labels) && k in object.metadata.labels && object.metadata.labels[k] == v) ||
        k.startsWith('nvidia.com/')
        )) &&
        (!has(object.metadata.labels) ||
        object.metadata.labels.all(k, v,
        (has(oldObject.metadata.labels) && k in oldObject.metadata.labels && oldObject.metadata.labels[k] == v) ||
        k.startsWith('nvidia.com/')
        ))
      message: only nvidia.com/* labels may be added, modified, or removed by gpu-operator on nodes
---
# Activates the node-label policy; violating requests are denied.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
  name: gpu-operator-node-label-policy-binding
  labels:
    {{- include "gpu-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: "gpu-operator"
spec:
  policyName: gpu-operator-node-label-policy
  validationActions:
  - Deny
{{- end }}