From 30b3c39f0ae712eb20d13f59d675b24bff5186e7 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 25 Feb 2026 13:39:46 -0500 Subject: [PATCH 1/4] Add docs for 26.3.0 release Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Co-authored-by: Rajath Agasthya --- gpu-operator/amazon-eks.rst | 2 +- gpu-operator/cdi.rst | 108 ++++++-- gpu-operator/conf.py | 226 ----------------- gpu-operator/getting-started.rst | 44 +++- gpu-operator/gpu-driver-configuration.rst | 32 +-- gpu-operator/gpu-operator-mig.rst | 230 ++++++++++++------ gpu-operator/index.rst | 4 +- .../install-gpu-operator-gov-ready.rst | 35 +-- gpu-operator/life-cycle-policy.rst | 58 ++--- .../manifests/input/mig-cm-values.yaml | 11 + gpu-operator/manifests/input/nvd-all.yaml | 31 ++- .../manifests/output/nri-get-pods-restart.txt | 12 + gpu-operator/platform-support.rst | 170 ++++++++----- gpu-operator/release-notes.rst | 123 +++++++++- gpu-operator/versions.json | 5 +- gpu-operator/versions1.json | 4 + openshift/openshift-virtualization.rst | 1 + 17 files changed, 609 insertions(+), 487 deletions(-) delete mode 100644 gpu-operator/conf.py create mode 100644 gpu-operator/manifests/output/nri-get-pods-restart.txt diff --git a/gpu-operator/amazon-eks.rst b/gpu-operator/amazon-eks.rst index c108ee586..4eb7b7606 100644 --- a/gpu-operator/amazon-eks.rst +++ b/gpu-operator/amazon-eks.rst @@ -110,7 +110,7 @@ without any limitations, you perform the following high-level actions: Make sure the instance type supports enough IP addresses for your workload. For example, the ``g4dn.xlarge`` instance type supports ``29`` IP addresses for pods on the node. -* Use an Amazon EKS optimized Amazon Machine Image (AMI) with Ubuntu 20.04, 22.04, or 24.04 on the nodes in the node group. +* Use an Amazon EKS optimized Amazon Machine Image (AMI) with a `supported operating system `_ on the nodes in the node group. 
AMIs support are specific to an AWS region and Kubernetes version. See https://cloud-images.ubuntu.com/aws-eks/ for the AMI values such as ``ami-00687acd80b7a620a``. diff --git a/gpu-operator/cdi.rst b/gpu-operator/cdi.rst index 6f70a5ccf..85e6c821a 100644 --- a/gpu-operator/cdi.rst +++ b/gpu-operator/cdi.rst @@ -16,13 +16,15 @@ .. headings # #, * *, =, -, ^, " -############################################################ -Container Device Interface (CDI) Support in the GPU Operator -############################################################ +################################################################################# +Container Device Interface (CDI) and Node Resource Interface (NRI) Plugin Support +################################################################################# -************************************ -About the Container Device Interface -************************************ +This page gives an overview of CDI and NRI Plugin support in the GPU Operator. + +************************************** +About Container Device Interface (CDI) +************************************** The `Container Device Interface (CDI) `_ is an open specification for container runtimes that abstracts what access to a device, such as an NVIDIA GPU, means, @@ -31,7 +33,7 @@ ensure that a device is available in a container. CDI simplifies adding support the specification is applicable to all container runtimes that support CDI. Starting with GPU Operator v25.10.0, CDI is used by default for enabling GPU support in containers running on Kubernetes. -Specifically, CDI support in container runtimes, e.g. containerd and cri-o, is used to inject GPU(s) into workload +Specifically, CDI support in container runtimes, like containerd and cri-o, is used to inject GPU(s) into workload containers. This differs from prior GPU Operator releases where CDI was used via a CDI-enabled ``nvidia`` runtime class. 
If you are upgrading from a version of the GPU Operator prior to v25.10.0, where CDI was disabled by default, and you are upgrading to v25.10.0 or later, where CDI is enabled by default, no configuration changes are required for standard workloads using GPU allocation through the Device Plugin. @@ -45,22 +47,21 @@ plugins. CDI and GPU Management Containers ********************************* -When CDI is enabled in GPU Operator versions v25.10.0 and later, GPU Management Containers that use the ``NVIDIA_VISIBLE_DEVICES`` environment variable to get GPU access, bypassing GPU allocation via the Device Plugin, must set ``runtimeClassName: nvidia`` in the pod specification. -A GPU Management Containers is a container that requires access to all GPUs without them being allocated by Kubernetes. +When CDI is enabled in GPU Operator versions v25.10.0 and later, GPU Management Containers that use the ``NVIDIA_VISIBLE_DEVICES`` environment variable to get GPU access, bypassing GPU allocation via the Device Plugin or DRA Driver for GPUs, must set ``runtimeClassName: nvidia`` in the pod specification. +A GPU Management Container is a container that requires access to all GPUs without them being allocated by Kubernetes. Examples of GPU Management Containers include monitoring agents and device plugins. -It is recommended that ``NVIDIA_VISIBLE_DEVICES`` only be used by management containers. +It is recommended that ``NVIDIA_VISIBLE_DEVICES`` only be used by GPU Management Containers. -******************************** -Enabling CDI During Installation -******************************** +************ +Enabling CDI +************ CDI is enabled by default during installation in GPU Operator v25.10.0 and later. Follow the instructions for installing the Operator with Helm on the :doc:`getting-started` page. CDI is also enabled by default during a Helm upgrade to GPU Operator v25.10.0 and later. 
-******************************* Enabling CDI After Installation ******************************* @@ -138,3 +139,82 @@ disable CDI and use the legacy NVIDIA Container Toolkit stack instead with the f nvidia.com/gpu.deploy.operator-validator=true \ nvidia.com/gpu.present=true \ --overwrite + + +.. _nri-plugin: + +********************************************** +About the Node Resource Interface (NRI) Plugin +********************************************** + +Node Resource Interface (NRI) is a standardized interface for plugging in extensions, called NRI Plugins, to OCI-compatible container runtimes like containerd. +NRI Plugins serve as hooks which intercept pod and container lifecycle events and perform functions including injecting devices into a container, topology-aware placement strategies, and more. +For more details on NRI, refer to the `NRI overview `_ in the containerd repository. + +When enabled in the GPU Operator, the NRI Plugin is managed by the NVIDIA Container Toolkit and provides an alternative to the ``nvidia`` runtime class to provision GPU workload pods. +It allows the GPU Operator to extend the container runtime behaviour without modifying the container runtime itself. +This feature also simplifies deployments on platforms like k3s, k0s, or RKE, because the GPU Operator no longer needs you to set values like ``CONTAINERD_CONFIG``, ``CONTAINERD_SOCKET``, or ``RUNTIME_CONFIG_SOURCE`` for the Container Toolkit. + +*********************** +Enabling the NRI Plugin +*********************** + +The NRI Plugin requires the following: + +- CDI to be enabled in the GPU Operator. + +- containerd v1.7.30, v2.1.x, or v2.2.x. + If you are not using the latest containerd version, check that both CDI and NRI are enabled in the containerd configuration file before deploying GPU Operator. + + .. note:: + Enabling the NRI plugin is not supported with cri-o. 
+ +To enable the NRI Plugin during installation, follow the instructions for installing the Operator with Helm on the :doc:`getting-started` page and include the ``--set cdi.nriPluginEnabled=true`` argument in your Helm command. + +Enabling the NRI Plugin After Installation +****************************************** + +#. Enable the NRI Plugin by modifying the cluster policy: + + .. code-block:: console + + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[{"op": "replace", "path": "/spec/cdi/nriPluginEnabled", "value":true}]' + + *Example Output* + + .. code-block:: output + + clusterpolicy.nvidia.com/cluster-policy patched + +#. (Optional) Confirm that the container toolkit and device plugin pods restart: + + .. code-block:: console + + $ kubectl get pods -n gpu-operator + + *Example Output* + + .. literalinclude:: ./manifests/output/nri-get-pods-restart.txt + :language: output + :emphasize-lines: 6,9 + + +************************ +Disabling the NRI Plugin +************************ + +Disable the NRI Plugin and use the ``nvidia`` runtime class instead with the following procedure: + +Disable the NRI Plugin by modifying the cluster policy: + +.. code-block:: console + + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[{"op": "replace", "path": "/spec/cdi/nriPluginEnabled", "value":false}]' + +*Example Output* + +.. 
code-block:: output + + clusterpolicy.nvidia.com/cluster-policy patched diff --git a/gpu-operator/conf.py b/gpu-operator/conf.py deleted file mode 100644 index 464c78557..000000000 --- a/gpu-operator/conf.py +++ /dev/null @@ -1,226 +0,0 @@ - -import sphinx -import os -import logging -import sys -from string import Template - -logger = logging.getLogger(__name__) - -sys.path += [ - "/work/_repo/deps/repo_docs/omni/repo/docs/include", -] - - -project = "NVIDIA GPU Operator" - -copyright = "2020-2026, NVIDIA Corporation" -author = "NVIDIA Corporation" - -release = "25.10" -root_doc = "index" - -extensions = [ - "sphinx.ext.autodoc", # include documentation from docstrings - "sphinx.ext.ifconfig", # conditional include of text - "sphinx.ext.napoleon", # support for NumPy and Google style docstrings - "sphinx.ext.intersphinx", # link to other projects' documentation - "sphinx.ext.extlinks", # add roles to shorten external links - "myst_parser", # markdown parsing - "sphinxcontrib.mermaid", # create diagrams using text and code - "sphinxcontrib.youtube", # adds youtube:: directive - "sphinxemoji.sphinxemoji", # adds emoji substitutions (e.g. 
|:fire:|) - "sphinx_design", - "repo_docs.ext.inline_only", - "repo_docs.ext.toctree", - "repo_docs.ext.mdinclude", - "repo_docs.ext.include_patch", - "repo_docs.ext.youtube", - "repo_docs.ext.ifconfig", - "repo_docs.ext.source_substitutions", - "repo_docs.ext.mermaid", - "repo_docs.ext.exhale_file_fix", - "repo_docs.ext.output_format_text", - "repo_docs.ext.output_format_latex", - "repo_docs.ext.include_licenses", - "repo_docs.ext.add_templates", - "repo_docs.ext.breadcrumbs", - "repo_docs.ext.metadata", - "repo_docs.ext.confval", - "repo_docs.ext.customize_layout", - "repo_docs.ext.cpp_xrefs", -] - -# automatically add section level labels, up to level 4 -myst_heading_anchors = 4 - - -# configure sphinxcontrib.mermaid as we inject mermaid manually on pages that need it -mermaid_init_js = "" -mermaid_version= "" - - -intersphinx_mapping = {} -exclude_patterns = [ - ".git", - "Thumbs.db", - ".DS_Store", - ".pytest_cache", - "_repo", - "README.md", - "life-cycle-policy.rst", - "_build/docs/secure-services-istio-keycloak", - "_build/docs/openshift", - "_build/docs/gpu-telemetry", - "_build/docs/container-toolkit", - "_build/docs/review", - "_build/docs/partner-validated", - "_build/docs/driver-containers", - "_build/docs/sphinx_warnings.txt", - "_build/docs/kubernetes", - "_build/docs/tmp", - "_build/docs/dra-driver", - "_build/docs/edge", - "_build/docs/gpu-operator/24.9.1", - "_build/docs/gpu-operator/24.12.0", - "_build/docs/gpu-operator/25.3.4", - "_build/docs/gpu-operator/25.3.1", - "_build/docs/gpu-operator/24.9.2", - "_build/docs/gpu-operator/version1.json", - "_build/docs/gpu-operator/24.9", - "_build/docs/gpu-operator/25.3.0", - "_build/docs/gpu-operator/25.3", - "_build/docs/gpu-operator/25.10", -] - -html_theme = "sphinx_rtd_theme" - -html_logo = "/work/assets/nvidia-logo-white.png" -html_favicon = "/work/assets/favicon.ico" - -# If true, links to the reST sources are added to the pages. 
-html_show_sourcelink = False - -html_additional_search_indices = [] - -# If true, the raw source is copied which might be a problem if content is removed with `ifconfig` -html_copy_source = False - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -html_show_sphinx = False - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [ - "/work/_repo/deps/repo_docs/media", -] - -html_last_updated_fmt = "" - -# https://sphinx-rtd-theme.readthedocs.io/en/stable/configuring.html -html_theme_options = { - "logo_only": True, - "prev_next_buttons_location": None, # our docs aren't a novel... - "navigation_depth": 10, -} - -html_extra_content_head = [' \n '] -html_extra_content_footer = [' \n '] -html_logo_target_url = "" - -html_breadcrumbs_home_url = "" -html_extra_breadcrumbs = [] - -html_css_files = [ - "omni-style.css", - "api-styles.css", -] - -html_js_files = [ - "version.js", - "social-media.js", -] - -# literal blocks default to c++ (useful for Doxygen \code blocks) -highlight_language = 'c++' - - -# add additional tags - - - -source_substitutions = {'minor_version': '25.10', 'version': 'v25.10.1', 'recommended': '580.105.08', 'dra_version': '25.12.0'} -source_substitutions.update({ - 'repo_docs_config': 'debug', - 'repo_docs_platform_target': 'linux-x86_64', - 'repo_docs_platform': 'linux-x86_64', - 'repo_docs_dash_build': '', - 'repo_docs_project': 'gpu-operator', - 'repo_docs_version': '25.10', - 'repo_docs_copyright': '2020-2026, NVIDIA Corporation', - # note: the leading '/' means this is relative to the docs_root (the source directory) - 'repo_docs_api_path': '/../_build/docs/gpu-operator/latest', -}) - -# add global metadata for all built pages -metadata_global = {} - -sphinx_event_handlers = [] -myst_enable_extensions = [ - 
"colon_fence", "dollarmath", -] -templates_path = ['/work/templates'] -extensions.extend([ - "linuxdoc.rstFlatTable", - "sphinx.ext.autosectionlabel", - "sphinx_copybutton", - "sphinx_design", -]) -suppress_warnings = [ 'autosectionlabel.*' ] -pygments_style = 'sphinx' -copybutton_exclude = '.linenos, .gp' - -html_theme = "nvidia_sphinx_theme" -html_copy_source = False -html_show_sourcelink = False -html_show_sphinx = False - -html_domain_indices = False -html_use_index = False -html_extra_path = ["versions1.json"] -html_static_path = ["/work/css"] -html_css_files = ["custom.css"] - -html_theme_options = { - "icon_links": [], - "switcher": { - "json_url": "../versions1.json", - "version_match": release, - }, -} - -highlight_language = 'console' - -intersphinx_mapping = { - "dcgm": ("https://docs.nvidia.com/datacenter/dcgm/latest/", "../work/dcgm-offline.inv"), - "gpuop": ("https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/", - ("_build/docs/gpu-operator/latest/objects.inv", None)), - "ctk": ("https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/", - ("_build/docs/container-toolkit/latest/objects.inv", None)), - "drv": ("https://docs.nvidia.com/datacenter/cloud-native/driver-containers/latest/", - ("_build/docs/driver-containers/latest/objects.inv", None)), - "ocp": ("https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/", - ("_build/docs/openshift/latest/objects.inv", None)), - "edge": ("https://docs.nvidia.com/datacenter/cloud-native/edge/latest/", - ("_build/docs/edge/latest/objects.inv", None)), -} -rst_epilog = ".. |gitlab_mr_url| replace:: Sorry Charlie...not a merge request." -if os.environ.get("CI_MERGE_REQUEST_IID") is not None: - rst_epilog = ".. 
|gitlab_mr_url| replace:: {}/-/merge_requests/{}".format( - os.environ["CI_MERGE_REQUEST_PROJECT_URL"], os.environ["CI_MERGE_REQUEST_IID"]) - -def setup(app): - app.add_config_value('build_name', 'public', 'env') - for (event, handler) in sphinx_event_handlers: - app.connect(event, handler) diff --git a/gpu-operator/getting-started.rst b/gpu-operator/getting-started.rst index d9b932a08..56935ecd9 100644 --- a/gpu-operator/getting-started.rst +++ b/gpu-operator/getting-started.rst @@ -50,12 +50,14 @@ Prerequisites && chmod 700 get_helm.sh \ && ./get_helm.sh -#. All worker nodes or node groups to run GPU workloads in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. +#. If you are planning to use ClusterPolicy for driver configuration, all worker nodes or node groups to run GPU workloads in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems. For worker nodes or node groups that run CPU workloads only, the nodes can run any operating system because the GPU Operator does not perform any configuration or management of nodes for CPU-only workloads. + If you are planning to use NVIDIA GPU Driver Custom Resource Definition, you can use a mix of operating system versions on CPU and GPU nodes. Refer to the :doc:`NVIDIA GPU Driver Custom Resource Definition ` page for more information. + #. Nodes must be configured with a container engine such as CRI-O or containerd. #. If your cluster uses Pod Security Admission (PSA) to restrict the behavior of pods, @@ -147,13 +149,19 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. * - ``cdi.enabled`` - When set to ``true`` (default), the Container Device Interface (CDI) will be used for - injecting GPUs into workload containers. 
The Operator will no longer configure the `nvidia` - runtime class as the default runtime handler. Instead, native-CDI support in container runtimes - like containerd or cri-o will be leveraged for injecting GPUs into workload containers. - Using CDI aligns the Operator with the recent efforts to standardize how complex devices like GPUs - are exposed to containerized environments. + injecting GPUs into workload containers. + The Operator will no longer configure the ``nvidia`` runtime class as the default runtime handler. + Instead, native-CDI support in container runtimes like containerd or cri-o will be leveraged for injecting GPUs into workload containers. + Refer to the :doc:`cdi` page for more information. - ``true`` + * - ``cdi.nriPluginEnabled`` + - When set to ``true``, the Node Resource Interface (NRI) Plugin will be used for injecting GPUs into workload containers. + In NRI Plugin mode, the NVIDIA Container Toolkit will no longer modify the runtime config. + This feature requires CRI-O v1.34.0 or later or containerd v1.7.30, v2.1.x, or v2.2.x. + Refer to the :doc:`cdi` page for more information. + - ``false`` + * - ``cdi.default`` Deprecated. - This field is deprecated as of v25.10.0 and will be ignored. The ``cdi.enabled`` field is set to ``true`` by default in versions 25.10.0 and later. @@ -179,6 +187,10 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. Available values are ``Cluster`` (default) or ``Local``. - ``Cluster`` + * - ``dcgmExporter.hostNetwork`` + - When set to ``true``, the DCGM Exporter will expose a metric port on the host's network namespace. + - ``false`` + * - ``devicePlugin.config`` - Specifies the configuration for the NVIDIA Device Plugin as a config map. @@ -210,6 +222,11 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. ``Proprietary`` means the proprietary module is used. 
- ``auto`` + * - ``driver.nvidiaDriverCRD.enabled`` + - When set to ``true``, the Operator deploys NVIDIA GPU Driver Custom Resource Definition. + Refer to the :doc:`NVIDIA GPU Driver Custom Resource Definition ` page for more information. + - ``false`` + * - ``driver.repository`` - The images are downloaded from NGC. Specify another image repository when using custom driver images. @@ -300,9 +317,13 @@ To view all the options, run ``helm show values nvidia/gpu-operator``. - Specifies the default type of workload for the cluster, one of ``container``, ``vm-passthrough``, or ``vm-vgpu``. Setting ``vm-passthrough`` or ``vm-vgpu`` can be helpful if you plan to run all or mostly virtual machines in your cluster. - Refer to :doc:`KubeVirt `. - ``container`` - + + * - ``sandboxWorkloads.mode`` + - Specifies the sandbox mode to use when deploying sandbox workloads. + Accepted values are ``kubevirt`` (default) and ``kata``. + Refer to the :doc:`KubeVirt ` page for more information on using KubeVirt based workloads. + - ``kubevirt`` * - ``toolkit.enabled`` - By default, the Operator deploys the NVIDIA Container Toolkit (``nvidia-docker2`` stack) as a container on the system. Set this value to ``false`` when using the Operator on systems @@ -484,6 +505,12 @@ support for such custom configurations. Specifying Configuration Options for containerd *********************************************** +.. note:: + + It's recommended that you enable the NRI Plugin to configure the container runtime by setting ``cdi.nriPluginEnabled=true``. + When enabled, you do not need to specify the ``toolkit.env`` options and injecting GPUs into workload containers is handled by the NRI Plugin. + Refer to the :ref:`NRI Plugin ` documentation, for more information. + When you use containerd as the container runtime, the following configuration options are used with the container-toolkit deployed with GPU Operator: @@ -559,6 +586,7 @@ Refer to the :ref:`v24.9.0-known-limitations`. 
MicroK8s ======== + For MicroK8s, set the following in the ``ClusterPolicy``. .. code-block:: yaml diff --git a/gpu-operator/gpu-driver-configuration.rst b/gpu-operator/gpu-driver-configuration.rst index 4467051df..7b99cda17 100644 --- a/gpu-operator/gpu-driver-configuration.rst +++ b/gpu-operator/gpu-driver-configuration.rst @@ -26,21 +26,18 @@ NVIDIA GPU Driver Custom Resource Definition Overview of the GPU Driver Custom Resource Definition ***************************************************** -.. note:: +You can create one or more instances of an NVIDIA driver (``NVIDIADriver``) custom resource +to specify the NVIDIA GPU driver type and driver version to configure on specific nodes. +You can specify labels in the node selector field to control which NVIDIA driver configuration is applied to specific nodes. - Technology Preview features are not supported in production environments - and are not functionally complete. - Technology Preview features provide early access to upcoming product features, - enabling customers to test functionality and provide feedback during the development process. - These releases may not have any documentation, and testing is limited. - This feature does not support an upgrade from an earlier version of the NVIDIA GPU Operator. - You must uninstall an existing installation and then install the Operator again. - Uninstalling the Operator interrupts services and applications that require access to NVIDIA GPUs. +Limitations +=========== -As a technology preview feature, you can create one or more instances of an NVIDIA driver custom resource -to specify the NVIDIA GPU driver type and driver version to configure on specific nodes. -You can specify labels in the node selector field to control which NVIDIA driver configuration is applied to specific nodes. +* This feature is recommended for new cluster installations only. + Seamless upgrades from ClusterPolicy managed drivers to NVIDIADriver CR managed drivers are not supported. 
+ Switching from ClusterPolicy to NVIDIADriver will cause all existing driver pods to be terminated immediately and redeployed using the new NVIDIADriver configuration. +* Users are required to either use the default NVIDIADriver rendered by the Helm chart or create and manage their own custom NVIDIADriver. Comparison: Managing the Driver with CRD versus the Cluster Policy ================================================================== @@ -72,10 +69,10 @@ Driver Daemon Sets The NVIDIA GPU Operator starts a driver daemon set for each NVIDIA driver custom resource and each operating system version. -For example, if your cluster has one NVIDIA driver custom resource that specifies a 535 branch GPU driver and some +For example, if your cluster has one NVIDIA driver custom resource that specifies a 580 branch GPU driver and some worker nodes run Ubuntu 20.04 and other worker nodes run Ubuntu 22.04, the Operator starts two driver daemon sets. One daemon set configures the GPU driver on the Ubuntu 20.04 nodes and the other configures the driver on the Ubuntu 22.04 nodes. -All the nodes run the same 535 branch GPU driver. +All the nodes run the same 580 branch GPU driver. .. image:: graphics/nvd-basics.svg @@ -258,7 +255,7 @@ Perform the following steps to install the GPU Operator and use the NVIDIA drive .. code-block:: console - $ kubectl label node --overwrite driver.version=525.125.06 + $ kubectl label node --overwrite driver.version=580.126.20 - To use a mix of driver types, such as vGPU, label nodes for the driver type. - To use a mix of driver versions, label the nodes for the different versions. @@ -304,11 +301,6 @@ One Driver Type and Version on All Nodes .. literalinclude:: ./manifests/input/nvd-all.yaml :language: yaml - .. tip:: - - Because the manifest does not include a ``nodeSelector`` field, the driver custom - resource selects all nodes in the cluster that have an NVIDIA GPU. - #. Apply the manfiest: .. 
code-block:: console diff --git a/gpu-operator/gpu-operator-mig.rst b/gpu-operator/gpu-operator-mig.rst index 365a8f25c..db4fae916 100644 --- a/gpu-operator/gpu-operator-mig.rst +++ b/gpu-operator/gpu-operator-mig.rst @@ -34,16 +34,16 @@ Multi-Instance GPU (MIG) enables GPUs based on the NVIDIA Ampere and later archi Refer to the `MIG User Guide `__ for more information about MIG. GPU Operator deploys MIG Manager to manage MIG configuration on nodes in your Kubernetes cluster. +You must enable MIG during installation by choosing a MIG strategy before you can configure MIG. + +Refer to the :ref:`architecture section ` for more information about how MIG is implemented in the GPU Operator. ******************************** Enabling MIG During Installation ******************************** +Use the following steps to enable MIG and deploy MIG Manager. -The following steps use the ``single`` MIG strategy. -Alternatively, you can specify the ``mixed`` strategy. - -Perform the following steps to install the Operator and configure MIG: #. Install the Operator: @@ -55,17 +55,28 @@ Perform the following steps to install the Operator and configure MIG: --version=${version} \ --set mig.strategy=single - Set ``mig.strategy`` to ``mixed`` when MIG mode is not enabled on all GPUs on a node. - In a CSP environment such as Google Cloud, also specify + This example sets ``single`` as the MIG strategy. + Available MIG strategy options: + + * ``single``: MIG mode is enabled on all GPUs on a node. + * ``mixed``: MIG mode is not enabled on all GPUs on a node. + + In a cloud service provider (CSP) environment such as Google Cloud, also specify ``--set migManager.env[0].name=WITH_REBOOT --set-string migManager.env[0].value=true`` to ensure that the node reboots and can apply the MIG configuration. - MIG Manager supports preinstalled drivers. + MIG Manager supports preinstalled drivers, meaning drivers that are not managed by the GPU Operator and you installed directly on the host. 
If drivers are preinstalled, also specify ``--set driver.enabled=false``. Refer to :ref:`mig-with-preinstalled-drivers` for more details. - After several minutes, all the pods, including the ``nvidia-mig-manager`` are deployed on nodes that have MIG capable GPUs. + After several minutes, all GPU Operator pods, including the ``nvidia-mig-manager`` are deployed on nodes that have MIG capable GPUs. + + .. note:: + + MIG Manager requires that no user workloads are running on the GPUs being configured. + In some cases, the node might need to be rebooted, such as a CSP, so the node might need to be cordoned + before changing the MIG mode or the MIG geometry on the GPUs. #. Optional: Display the pods in the Operator namespace: @@ -91,29 +102,68 @@ Perform the following steps to install the Operator and configure MIG: :language: json :start-after: nvidia.com/gpu.memory - .. important:: - - MIG Manager requires that no user workloads are running on the GPUs being configured. - In some cases, the node may need to be rebooted, such as a CSP, so the node might need to be cordoned - before changing the MIG mode or the MIG geometry on the GPUs. - - .. note:: - - Known Issue: For drivers 570.124.06, 570.133.20, 570.148.08, and 570.158.01, - GPU workloads cannot be scheduled on nodes that have a mix of MIG slices and full GPUs. - This manifests as GPU pods getting stuck indefinitely in the ``Pending`` state. - NVIDIA recommends that you downgrade the driver to version 570.86.15 to work around this issue. - For more detailed information, see GitHub issue https://github.com/NVIDIA/gpu-operator/issues/1361. ************************ Configuring MIG Profiles ************************ -By default, nodes are labeled with ``nvidia.com/mig.config: all-disabled`` and you must specify the MIG configuration to apply. +When MIG is enabled, nodes are labeled with ``nvidia.com/mig.config: all-disabled`` by default. 
+To use a profile on a node, update the label value with the desired profile, for example, ``nvidia.com/mig.config=all-1g.10gb``. + +Introduced in GPU Operator v26.3.0, MIG Manager generates the MIG configuration for a node at runtime from the available hardware. +The configuration is generated on startup, discovering MIG profiles for each MIG-capable GPU on a node using `NVIDIA Management Library (NVML) `_, then writing it to a ConfigMap for each MIG-capable node in your cluster. +The ConfigMap is named ``-mig-config``, where ```` is the name of each MIG-capable node. +Each ConfigMap contains a complete mig-parted config, including ``all-disabled``, ``all-enabled``, per-profile configs such as ``all-1g.10gb``, and ``all-balanced`` with device-filter support for mixed GPU types. +When a new MIG-capable GPU is added to a node, the new GPU is automatically added to the ConfigMap. +For full details, see :ref:`mig-autogenerated-config`. + +If you need custom profiles, you can use a custom MIG configuration instead of the generated one. +You can use the Helm chart to create a ConfigMap from values at install time, or create and reference your own ConfigMap. +For an example, refer to :ref:`dynamically-creating-the-mig-configuration-configmap`. + +.. note:: + Generated MIG configuration might not be available on older drivers, such as 535 branch GPU drivers, as they do not support querying MIG profiles when MIG mode is disabled. In those cases, the GPU Operator will use a static ConfigMap, ``default-mig-parted-config``, for MIG profiles. + + +.. _mig-autogenerated-config: + +Autogenerated MIG Configuration +================================ + +When you do not specify a custom MIG configuration, MIG Manager generates the MIG configuration at runtime for each MIG-capable node and stores it in a ConfigMap. This section describes that autogenerated configuration. 
+ +**ConfigMap name and location** + +* One ConfigMap per MIG-capable node, named ``-mig-config``, where ```` is the Kubernetes node name (for example, ``worker-0-mig-config``). +* ConfigMaps are created in the same namespace as the GPU Operator (typically ``gpu-operator``). + +**When it is created** + +* MIG Manager generates the configuration on startup. It discovers MIG-capable GPUs on the node using NVIDIA Management Library (NVML), enumerates the supported MIG profiles for each GPU, then writes a complete mig-parted config to the ConfigMap. +* When a new MIG-capable GPU is added to a node, MIG Manager updates the ConfigMap to include profiles for the new GPU. + +**ConfigMap contents** + +* Each ConfigMap contains a single key, ``config.yaml``, with a complete mig-parted configuration. The format is the same as the static default config: a ``version`` field and a ``mig-configs`` map. Included entries typically include: + * ``all-disabled`` and ``all-enabled`` + * Per-profile configs such as ``all-1g.10gb``, ``all-2g.20gb`` + * ``all-balanced`` with device-filter support for mixed GPU types on the node + +**Fallback when autogeneration is not available** + +* On older drivers (for example, 535 branch) that cannot query MIG profiles while MIG mode is disabled, dynamic discovery fails. The operator then uses the static ConfigMap ``default-mig-parted-config``. That ConfigMap is mounted into MIG Manager as ``config-default.yaml`` and used as a fallback. No per-node ``-mig-config`` ConfigMap is created in that case. + +**Inspecting the autogenerated config** + +* To view the generated configuration for a node: + + .. code-block:: console + + $ kubectl get configmap -n gpu-operator -mig-config -o yaml + +* Replace ```` with the actual node name (for example, ``worker-0-mig-config``). The ``data.config.yaml`` field contains the full mig-parted YAML. 
-MIG Manager uses the ``default-mig-parted-config`` config map in the GPU Operator namespace to identify supported MIG profiles. -Refer to the config map when you label the node or customize the config map. Example: Single MIG Strategy ============================ @@ -157,7 +207,7 @@ The following steps show how to use the single MIG strategy and configure the `` "nvidia.com/mig.strategy": "single" } - As described above, if the ``WITH_REBOOT`` option is set then MIG Manager sets the label to ``nvidia.com/mig.config.state: rebooting``. + When the ``WITH_REBOOT`` option is set, MIG Manager sets the label to ``nvidia.com/mig.config.state: rebooting``. #. Confirm that MIG Manager completed the configuration by checking the node labels: @@ -167,7 +217,7 @@ The following steps show how to use the single MIG strategy and configure the `` Check for the following labels: - * ``nvidia.com/gpu.count: 7``, this value differs according to the GPU model. + * ``nvidia.com/gpu.count: 7`` (the value differs according to the GPU model) * ``nvidia.com/gpu.slices.ci: 1`` * ``nvidia.com/gpu.slices.gi: 1`` * ``nvidia.com/mig.config.state: success`` @@ -186,7 +236,7 @@ The following steps show how to use the single MIG strategy and configure the `` "nvidia.com/mig.config.state": "success", "nvidia.com/mig.strategy": "single" -#. Optional: Run the ``nvidia-smi`` command in the driver container to verify that the MIG configuration: +#. Optional: Run the ``nvidia-smi`` command in the driver container to verify that the MIG configuration has been applied. .. code-block:: console @@ -240,7 +290,7 @@ The following steps show how to use the ``mixed`` MIG strategy and configure the :language: json :start-after: nvidia.com/gpu.memory -#. Optional: Run the ``nvidia-smi`` command in the driver container to verify that the GPU has been configured: +#. Optional: Run the ``nvidia-smi`` command in the driver container to verify that the GPU has been configured. .. 
code-block:: console @@ -327,18 +377,34 @@ The following steps show how to update a GPU on a node to the ``3g.40gb`` profil } +.. _dynamically-creating-the-mig-configuration-configmap: + + Example: Custom MIG Configuration During Installation ===================================================== -By default, the Operator creates the ``default-mig-parted-config`` config map and MIG Manager is configured to read profiles from that config map. +If you need to use custom profiles, you can create a custom ConfigMap during installation by passing in a name and data for the ConfigMap with the Helm command. + +The MIG Manager daemonset is configured to use this ConfigMap instead of the auto-generated one. -You can use the ``values.yaml`` file when you install or upgrade the Operator to create a config map with a custom configuration. +In your values.yaml file, set ``migManager.config.create`` to ``true``, set ``migManager.config.name``, and add the ConfigMap data under ``migManager.config.data``, for example: -#. In your ``values.yaml`` file, add the data for the config map, like the following example: +#. In your ``values.yaml`` file, add the data for the ConfigMap, like the following example: .. literalinclude:: manifests/input/mig-cm-values.yaml :language: yaml +.. note:: + Custom ConfigMaps must contain a key named "config.yaml" + +#. Install or upgrade the GPU Operator with this values file so the chart creates the ConfigMap: + + .. code-block:: console + + $ helm upgrade --install gpu-operator -n gpu-operator --create-namespace \ + nvidia/gpu-operator --version=${version} \ + -f values.yaml + #. If the custom configuration specifies more than one instance profile, set the strategy to ``mixed``: .. code-block:: console @@ -353,19 +419,56 @@ You can use the ``values.yaml`` file when you install or upgrade the Operator to $ kubectl label nodes nvidia.com/mig.config=custom-mig --overwrite +#. 
Optional: Monitor the MIG Manager logs to confirm the new MIG geometry is applied: + + .. code-block:: console + + $ kubectl logs -n gpu-operator -l app=nvidia-mig-manager -c nvidia-mig-manager + + *Example Output* + + .. code-block:: console + + Applying the selected MIG config to the node + time="2024-05-15T13:40:08Z" level=debug msg="Parsing config file..." + time="2024-05-15T13:40:08Z" level=debug msg="Selecting specific MIG config..." + time="2024-05-15T13:40:08Z" level=debug msg="Running apply-start hook" + time="2024-05-15T13:40:08Z" level=debug msg="Checking current MIG mode..." + time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-15T13:40:08Z" level=debug msg=" Asserting MIG mode: Enabled" + time="2024-05-15T13:40:08Z" level=debug msg=" MIG capable: true\n" + time="2024-05-15T13:40:08Z" level=debug msg=" Current MIG mode: Enabled" + time="2024-05-15T13:40:08Z" level=debug msg="Checking current MIG device configuration..." + time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-15T13:40:08Z" level=debug msg=" Asserting MIG config: map[1g.10gb:5 2g.20gb:1]" + time="2024-05-15T13:40:08Z" level=debug msg="Running pre-apply-config hook" + time="2024-05-15T13:40:08Z" level=debug msg="Applying MIG device configuration..." + time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-15T13:40:08Z" level=debug msg=" MIG capable: true\n" + time="2024-05-15T13:40:08Z" level=debug msg=" Updating MIG config: map[1g.10gb:5 2g.20gb:1]" + time="2024-05-15T13:40:09Z" level=debug msg="Running apply-exit hook" + MIG configuration applied successfully + + +.. 
_example-custom-mig-configuration: Example: Custom MIG Configuration ================================= -By default, the Operator creates the ``default-mig-parted-config`` config map and MIG Manager is configured to read profiles from that config map. - -You can create a config map with a custom configuration if the default profiles do not meet your business needs. +You can create and apply a ConfigMap yourself if the default profiles do not meet your needs. #. Create a file, such as ``custom-mig-config.yaml``, with contents like the following example: .. literalinclude:: manifests/input/custom-mig-config.yaml :language: yaml + +.. note:: + Custom ConfigMaps must contain a key named "config.yaml" + #. Apply the manifest: .. code-block:: console @@ -380,7 +483,7 @@ You can create a config map with a custom configuration if the default profiles --type='json' \ -p='[{"op":"replace", "path":"/spec/mig/strategy", "value":"mixed"}]' -#. Patch the cluster policy so MIG Manager uses the custom config map: +#. Patch the cluster policy so MIG Manager uses the custom ConfigMap: .. code-block:: console @@ -394,38 +497,6 @@ You can create a config map with a custom configuration if the default profiles $ kubectl label nodes nvidia.com/mig.config=five-1g-one-2g --overwrite -#. Optional: Monitor the MIG Manager logs to confirm the new MIG geometry is applied: - - .. code-block:: console - - $ kubectl logs -n gpu-operator -l app=nvidia-mig-manager -c nvidia-mig-manager - - *Example Output* - - .. code-block:: console - - Applying the selected MIG config to the node - time="2024-05-15T13:40:08Z" level=debug msg="Parsing config file..." - time="2024-05-15T13:40:08Z" level=debug msg="Selecting specific MIG config..." - time="2024-05-15T13:40:08Z" level=debug msg="Running apply-start hook" - time="2024-05-15T13:40:08Z" level=debug msg="Checking current MIG mode..." 
- time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" - time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" - time="2024-05-15T13:40:08Z" level=debug msg=" Asserting MIG mode: Enabled" - time="2024-05-15T13:40:08Z" level=debug msg=" MIG capable: true\n" - time="2024-05-15T13:40:08Z" level=debug msg=" Current MIG mode: Enabled" - time="2024-05-15T13:40:08Z" level=debug msg="Checking current MIG device configuration..." - time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" - time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" - time="2024-05-15T13:40:08Z" level=debug msg=" Asserting MIG config: map[1g.10gb:5 2g.20gb:1]" - time="2024-05-15T13:40:08Z" level=debug msg="Running pre-apply-config hook" - time="2024-05-15T13:40:08Z" level=debug msg="Applying MIG device configuration..." - time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" - time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" - time="2024-05-15T13:40:08Z" level=debug msg=" MIG capable: true\n" - time="2024-05-15T13:40:08Z" level=debug msg=" Updating MIG config: map[1g.10gb:5 2g.20gb:1]" - time="2024-05-15T13:40:09Z" level=debug msg="Running apply-exit hook" - MIG configuration applied successfully ******************************************* @@ -440,7 +511,7 @@ Verification: Running Sample CUDA Workloads Disabling MIG ************* -You can disable MIG on a node by setting the ``nvidia.con/mig.config`` label to ``all-disabled``: +You can disable MIG on a node by setting the ``nvidia.com/mig.config`` label to ``all-disabled``: .. code-block:: console @@ -476,12 +547,12 @@ Managing Host GPU Clients ========================= MIG Manager stops all operator-managed pods that have access to GPUs when applying a MIG reconfiguration. -When drivers are preinstalled, there may be GPU clients on the host that also need to be stopped. 
+When drivers are preinstalled, there can be GPU clients on the host that also need to be stopped. When drivers are preinstalled, MIG Manager attempts to stop and restart a list of systemd services on the host across a MIG reconfiguration. -The list of services are specified in the ``default-gpu-clients`` config map. +The list of services is specified in the ``default-gpu-clients`` ConfigMap. -The following sample GPU clients file, ``clients.yaml``, is used to create the ``default-gpu-clients`` config map: +The following sample GPU clients file, ``clients.yaml``, is used to create the ``default-gpu-clients`` ConfigMap: .. code-block:: yaml @@ -497,8 +568,8 @@ The following sample GPU clients file, ``clients.yaml``, is used to create the ` - dcgm.service - dcgm-exporter.service -You can modify the list by editing the config map after installation. -Alternatively, you can create a custom config map for use by MIG Manager by performing the following steps: +You can modify the list by editing the ConfigMap after installation. +Alternatively, you can create a custom ConfigMap for use by MIG Manager by performing the following steps: #. Create the ``gpu-operator`` namespace: @@ -520,22 +591,27 @@ Alternatively, you can create a custom config map for use by MIG Manager by perf -n gpu-operator --create-namespace \ nvidia/gpu-operator \ --version=${version} \ - --set migManager.gpuClientsConfig.name=gpu-clients + --set migManager.gpuClientsConfig.name=gpu-clients \ --set driver.enabled=false +.. _mig-architecture: + ***************** Architecture ***************** MIG Manager is designed as a controller within Kubernetes. It watches for changes to the -``nvidia.com/mig.config`` label on the node and then applies the user-requested MIG configuration +``nvidia.com/mig.config`` label on the node and then applies the user-requested MIG configuration. When the label changes, MIG Manager first stops all GPU pods, including device plugin, GPU feature discovery, and DCGM exporter. 
-MIG Manager then stops all host GPU clients listed in the ``clients.yaml`` config map if drivers are preinstalled. +MIG Manager then stops all host GPU clients listed in the ``clients.yaml`` ConfigMap if drivers are preinstalled. Finally, it applies the MIG reconfiguration and restarts the GPU pods and possibly, host GPU clients. The MIG reconfiguration can also involve rebooting a node if a reboot is required to enable MIG mode. -The default MIG profiles are specified in the ``default-mig-parted-config`` config map. +The default MIG profiles are specified in the ``<node-name>-mig-config`` ConfigMap. +This ConfigMap is auto-generated by the MIG Manager for each MIG-capable node and contains the standard MIG profiles for the available GPUs on the node. +You can also configure the Operator to use a custom ConfigMap instead of the auto-generated one. + You can specify one of these profiles to apply to the ``mig.config`` label to trigger a reconfiguration of the MIG geometry. MIG Manager uses the `mig-parted <https://github.com/NVIDIA/mig-parted>`__ tool to apply the configuration diff --git a/gpu-operator/index.rst b/gpu-operator/index.rst index a202584c8..8d2dff342 100644 --- a/gpu-operator/index.rst +++ b/gpu-operator/index.rst @@ -41,14 +41,14 @@ :hidden: NVIDIA DRA Driver for GPUs - Multi-Instance GPU + Multi-Instance GPU (MIG) Time-Slicing GPUs gpu-operator-rdma.rst Outdated Kernels Custom GPU Driver Parameters precompiled-drivers.rst GPU Driver CRD - Container Device Interface (CDI) Support + CDI and NRI Support ..
toctree:: :caption: Sandboxed Workloads diff --git a/gpu-operator/install-gpu-operator-gov-ready.rst b/gpu-operator/install-gpu-operator-gov-ready.rst index 875db7cb0..a033832e4 100644 --- a/gpu-operator/install-gpu-operator-gov-ready.rst +++ b/gpu-operator/install-gpu-operator-gov-ready.rst @@ -30,40 +30,15 @@ For more information on NVIDIA's government-ready support, refer to the white pa Supported GPU Operator Components ================================== -The government-ready NVIDIA GPU Operator includes the following components: - -.. _fn1: #base-image -.. |fn1| replace:: :sup:`1` - -.. list-table:: - :header-rows: 1 - - * - Component - - Version - * - NVIDIA GPU Operator - - v25.10.0 - * - NVIDIA GPU Feature Discovery - - 0.18.0 - * - NVIDIA Container Toolkit - - 1.18.0 - * - NVIDIA Device Plugin - - 0.18.0 - * - NVIDIA DCGM-exporter - - 4.4.1-4.6.0 - * - NVIDIA MIG Manager - - 0.13.0 - * - NVIDIA Driver - - 580.95.05 |fn1|_ - -:sup:`1` -Hardened for STIG/FIPS compliance + +Refer to the :ref:`operator-component-matrix` for a full list of supported government-ready GPU Operator components. Artifacts for these components are available from the `NVIDIA NGC Catalog `_. .. note:: - Not all GPU Operator components and features are available as government-ready containers in the v25.10.0 release. - For example, GPUDirect Storage and KubeVirt are not yet supported. + Not all GPU Operator components and features are available as government-ready containers in this release. + For example, NVIDIA GDS Driver and NVIDIA GDRCopy Driver are not yet supported. 
Validated Kubernetes Distributions @@ -73,6 +48,8 @@ The government-ready NVIDIA GPU Operator has been validated on the following Kub - Canonical Kubernetes 1.34 with Ubuntu Pro 24.04 and FIPS-compliant kernel - Red Hat OpenShift 4.19 in FIPS mode +- Rancher Kubernetes Engine 2 with Ubuntu 24.04 +- VMware VKS with Ubuntu 24.04 Install Government-Ready NVIDIA GPU Operator ============================================= diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index 69f71f832..c13890565 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -35,7 +35,6 @@ Patch releases typically include critical bug and CVE fixes, but can include min .. _operator_life_cycle_policy: -****************************** NVIDIA GPU Operator Life Cycle ****************************** @@ -54,13 +53,13 @@ The product life cycle and versioning are subject to change in the future. * - GPU Operator Version - Status - * - 25.10.x + * - 26.3.x - Supported - * - 25.3.x + * - 25.10.x - Deprecated - * - 24.9.x and lower + * - 25.3.x and lower - End of Support @@ -80,70 +79,52 @@ The following table shows the operands and default operand versions that corresp When post-release testing confirms support for newer versions of operands, these updates are identified as *recommended updates* to a GPU Operator version. Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. +.. note:: + All the following components are supported as :ref:`government-ready ` in the NVIDIA GPU Operator v26.3.0, except for NVIDIA GDS Driver and NVIDIA GDRCopy Driver. + **D** = Default driver, **R** = Recommended driver .. 
flat-table:: :header-rows: 2 * - :rspan:`1` Component - - :cspan:`2` GPU Operator Version + - GPU Operator Version - * - v25.10.0 - - v25.10.1 + * - v26.3 * - NVIDIA GPU Driver |ki|_ - - | `580.95.05 `_ (**D**, **R**) - | `580.82.07 `_ - | `575.57.08 `_ - | `570.195.03 `_ - | `550.163.01 `_ - | `535.274.02 `_ - | `590.48.01 `_ - | `580.126.16 `_ (**R**) - | `580.126.09 `_ - | `580.105.08 `_ (**D**) - | `580.95.05 `_ - | `580.82.07 `_ - | `575.57.08 `_ - | `570.211.01 `_ - | `570.195.03 `_ - | `550.163.01 `_ + | `580.126.20 `_ (**D**, **R**) + | `575.57.08 `_ | `535.288.01 `_ - | `535.274.02 `_ - * - NVIDIA Driver Manager for Kubernetes - - `v0.9.0 `__ - `v0.9.1 `__ * - NVIDIA Container Toolkit - - `1.18.0 `__ + - `1.19 `__ * - NVIDIA Kubernetes Device Plugin - - `0.18.0 `__ - - `0.18.1 `__ + - `0.18.2 `__ * - DCGM Exporter - - `v4.4.1-4.6.0 `__ - - `v4.4.2-4.7.0 `__ + - `v4.5.1-4.8.0 `__ * - Node Feature Discovery - `v0.18.2 `__ * - | NVIDIA GPU Feature Discovery | for Kubernetes - - `0.18.1 `__ + - `0.18.2 `__ * - NVIDIA MIG Manager for Kubernetes - - `0.13.0 `__ - - `0.13.1 `__ + - `0.14.0 `__ * - DCGM - - `4.4.1 `__ - - `4.4.2-1 `__ + - `4.5.2-1 `__ * - Validator for NVIDIA GPU Operator - - v25.10.0 + - v26.3 * - NVIDIA KubeVirt GPU Device Plugin - `v1.4.0 `__ @@ -152,10 +133,7 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information. 
- `v0.4.1 `__ * - NVIDIA GDS Driver |gds|_ - - `2.26.6 `__ - - * - NVIDIA Kata Manager for Kubernetes - - `v0.2.3 `__ + - `2.27.3 `__ * - | NVIDIA Confidential Computing | Manager for Kubernetes diff --git a/gpu-operator/manifests/input/mig-cm-values.yaml b/gpu-operator/manifests/input/mig-cm-values.yaml index 550f19a4b..b3b618b22 100644 --- a/gpu-operator/manifests/input/mig-cm-values.yaml +++ b/gpu-operator/manifests/input/mig-cm-values.yaml @@ -11,7 +11,18 @@ migManager: mig-enabled: false custom-mig: - devices: [0] + mig-enabled: false + - devices: [1] mig-enabled: true mig-devices: "1g.10gb": 2 + - devices: [2] + mig-enabled: true + mig-devices: "2g.20gb": 2 + "3g.40gb": 1 + - devices: [3] + mig-enabled: true + mig-devices: + "3g.40gb": 1 + "4g.40gb": 1 diff --git a/gpu-operator/manifests/input/nvd-all.yaml b/gpu-operator/manifests/input/nvd-all.yaml index 6f2fceb52..993d0c2fc 100644 --- a/gpu-operator/manifests/input/nvd-all.yaml +++ b/gpu-operator/manifests/input/nvd-all.yaml @@ -1,23 +1,36 @@ apiVersion: nvidia.com/v1alpha1 kind: NVIDIADriver metadata: - name: demo-all + name: nvidiadriver-sample spec: + # use pre-compiled packages for NVIDIA driver installation. 
+ usePrecompiled: false driverType: gpu + repository: nvcr.io/nvidia image: driver + version: "580.126.20" imagePullPolicy: IfNotPresent imagePullSecrets: [] + nodeSelector: {} manager: {} rdma: enabled: false useHostMofed: false gds: enabled: false - repository: nvcr.io/nvidia - startupProbe: - failureThreshold: 120 - initialDelaySeconds: 60 - periodSeconds: 10 - timeoutSeconds: 60 - usePrecompiled: false - version: 535.104.12 + # Private mirror repository configuration + repoConfig: + name: "" + # custom ssl key/certificate configuration + certConfig: + name: "" + # vGPU licensing configuration + licensingConfig: + secretName: "" + nlsEnabled: true + # vGPU topology daemon configuration + virtualTopologyConfig: + name: "" + # kernel module configuration for NVIDIA driver + kernelModuleConfig: + name: "" diff --git a/gpu-operator/manifests/output/nri-get-pods-restart.txt b/gpu-operator/manifests/output/nri-get-pods-restart.txt new file mode 100644 index 000000000..7f00e5cc5 --- /dev/null +++ b/gpu-operator/manifests/output/nri-get-pods-restart.txt @@ -0,0 +1,12 @@ +NAME READY STATUS RESTARTS AGE +gpu-feature-discovery-qnw2q 1/1 Running 0 47h +gpu-operator-6d59774ff-hznmr 1/1 Running 0 2d +gpu-operator-node-feature-discovery-master-6d6649d597-7l8bj 1/1 Running 0 2d +gpu-operator-node-feature-discovery-worker-v86vj 1/1 Running 0 2d +nvidia-container-toolkit-daemonset-2768s 1/1 Running 0 2m11s +nvidia-cuda-validator-ls4vc 0/1 Completed 0 47h +nvidia-dcgm-exporter-fxp9h 1/1 Running 0 47h +nvidia-device-plugin-daemonset-dvp4v 1/1 Running 0 2m26s +nvidia-device-plugin-validator-kvxbs 0/1 Completed 0 47h +nvidia-driver-daemonset-m86r7 1/1 Running 0 2d +nvidia-operator-validator-xg98r 1/1 Running 0 47h diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index d854388bd..17eea1fc1 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -31,8 +31,9 @@ Platform Support .. 
_supported nvidia gpus and systems: +********************************************* Supported NVIDIA Data Center GPUs and Systems ---------------------------------------------- +********************************************* The following NVIDIA data center GPUs are supported on x86 based platforms: @@ -188,6 +189,8 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: +-------------------------+------------------------+-------+ | Product | Architecture | Notes | +=========================+========================+=======+ + | NVIDIA DGX B300 | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ | NVIDIA DGX B200 | NVIDIA Blackwell | | +-------------------------+------------------------+-------+ | NVIDIA DGX Spark | NVIDIA Blackwell | | @@ -198,8 +201,12 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: +-------------------------+------------------------+-------+ | NVIDIA HGX GB200 NVL72 | NVIDIA Blackwell | | +-------------------------+------------------------+-------+ + | NVIDIA HGX GB200 NVL4 | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ | NVIDIA HGX GB300 NVL72 | NVIDIA Blackwell | | +-------------------------+------------------------+-------+ + | NVIDIA DGX Station | NVIDIA Blackwell | | + +-------------------------+------------------------+-------+ .. note:: @@ -208,8 +215,9 @@ The following NVIDIA data center GPUs are supported on x86 based platforms: .. _gpu-operator-arm-platforms: +***************************** Supported ARM Based Platforms ------------------------------ +***************************** The following NVIDIA data center GPUs are supported: @@ -247,8 +255,9 @@ system that meets the following requirements is supported: .. 
_Supported Deployment Options, Hypervisors, and NVIDIA vGPU Based Products: +**************************** Supported Deployment Options ----------------------------- +**************************** The GPU Operator has been validated in the following scenarios: @@ -268,8 +277,9 @@ The GPU Operator has been validated in the following scenarios: .. _container-platforms: +**************************************************** Supported Operating Systems and Kubernetes Platforms ----------------------------------------------------- +**************************************************** .. _fn1: #kubernetes-version .. |fn1| replace:: :sup:`1` @@ -277,14 +287,16 @@ Supported Operating Systems and Kubernetes Platforms .. |fn2| replace:: :sup:`2` .. _fn3: #rhel-9 .. |fn3| replace:: :sup:`3` -.. _fn4: #k8s-version -.. |fn4| replace:: :sup:`4` +.. _fn5: #azure-linux-3 +.. |fn5| replace:: :sup:`5` The GPU Operator has been validated in the following scenarios: .. tab-set:: + :sync-group: container-platforms .. tab-item:: Bare Metal / Virtual Machines with GPU Passthrough and NVIDIA vGPU + :sync: bare-metal .. 
list-table:: :header-rows: 1 @@ -292,62 +304,80 @@ The GPU Operator has been validated in the following scenarios: * - | Operating | System - - Kubernetes |fn1|_, |fn4|_ + - Kubernetes |fn1|_ - | Red Hat | OpenShift - | VMware vSphere | Kubernetes Service (VKS) - | Rancher Kubernetes - | Engine 2 |fn4|_ - - | Mirantis k0s |fn4|_ + | Engine 2 + - | K3s + - | Mirantis k0s - | Canonical - | MicroK8s |fn4|_ + | MicroK8s - | Nutanix | NKP * - Ubuntu 20.04 LTS |fn2|_ - - 1.30---1.35 + - 1.32---1.35 + - + - 1.32---1.35 + - 1.32---1.35 - - - 1.30---1.35 - - 1.30---1.35 - - - 2.12, 2.13, 2.14 * - Ubuntu 22.04 LTS |fn2|_ - - 1.30---1.35 + - 1.32---1.35 - - - 1.30---1.35 - - 1.30---1.35 - - 1.30---1.35 + - 1.32---1.35 + - 1.32---1.35 + - 1.32---1.35 + - 1.32---1.35 - 1.33---1.35 - 2.12, 2.13, 2.14, 2.15 * - Ubuntu 24.04 LTS - - 1.30---1.35 + - 1.32---1.35 - - - - 1.30---1.35 - - 1.30---1.35 + - 1.32---1.35 + - 1.32---1.35 + - 1.32---1.35 - 1.33---1.35 - * - Red Hat Core OS - - - | 4.14---4.21 + - | 4.17---4.21 + - + - + - + - + - + - + + * - | Red Hat + | Enterprise + | Linux 10.0, 10.1 + - 1.32---1.35 - - + - 1.32---1.35 + - - - - * - | Red Hat | Enterprise - | Linux 9.2, 9.4, 9.6 |fn3|_ - - 1.30---1.35 + | Linux 9.2, 9.4, 9.6, 9.7 |fn3|_ + - 1.32---1.35 + - - + - 1.32---1.35 - - - 1.30---1.35 - - - @@ -356,14 +386,25 @@ The GPU Operator has been validated in the following scenarios: | Enterprise | Linux 8.8, | 8.10 - - 1.30---1.35 + - 1.32---1.35 - - - - 1.30---1.35 + - 1.32---1.35 + - - - - 2.12, 2.13, 2.14, 2.15 + * - Rocky Linux 9.7 + - 1.32---1.35 + - + - + - + - + - + - + - + .. _kubernetes-version: :sup:`1` @@ -387,16 +428,12 @@ The GPU Operator has been validated in the following scenarios: Non-precompiled driver containers for Red Hat Enterprise Linux 9.2, 9.4, and 9.6 versions are available for x86 based platforms only. They are not available for ARM based systems. - .. _k8s-version: - - :sup:`4` - Kubernetes v1.35 support was added in v25.10.1 and later. - .. 
note:: |ocp_csp_support| .. tab-item:: Cloud Service Providers + :sync: cloud-service-providers .. list-table:: :header-rows: 1 @@ -412,28 +449,37 @@ | Kubernetes Service * - Ubuntu 20.04 LTS - - 1.30---1.35 - - 1.30---1.35 - - 1.30---1.35 + - 1.32---1.35 + - 1.32---1.35 + - 1.32---1.35 * - Ubuntu 22.04 LTS - - 1.30---1.35 - - 1.30---1.35 - - 1.30---1.35 + - 1.32---1.35 + - 1.32---1.35 + - 1.32---1.35 * - Ubuntu 24.04 LTS - - 1.30---1.35 - - 1.30---1.35 - - 1.30---1.35 + - 1.32---1.35 + - 1.32---1.35 + - 1.32---1.35 + + * - Azure Linux 3 (Local Program) |fn5|_ + - + - + - 1.32---1.35 + + .. _azure-linux-3: + + :sup:`5` + Azure Linux 3 drivers are available as precompiled drivers and a signed vGPU Guest Driver. - - Kubernetes v1.35 support was added in v25.10.1 and later. .. _supported-precompiled-drivers: +***************************** Supported Precompiled Drivers ------------------------------ +***************************** The GPU Operator has been validated with the following precompiled drivers. See the :doc:`precompiled-drivers` page for more information about using precompiled drivers.
@@ -452,14 +498,14 @@ See the :doc:`precompiled-drivers` page for more information about using precomp +----------------------------+------------------------+----------------+---------------------+ - +**************************** Supported Container Runtimes ----------------------------- +**************************** The GPU Operator has been validated for the following container runtimes: +----------------------------+------------------------+----------------+ -| Operating System | Containerd 1.7 - 2.1 | CRI-O | +| Operating System | Containerd 1.7 - 2.2 | CRI-O | +============================+========================+================+ | Ubuntu 20.04 LTS | Yes | Yes | +----------------------------+------------------------+----------------+ @@ -474,9 +520,14 @@ The GPU Operator has been validated for the following container runtimes: | Red Hat Enterprise Linux 9 | Yes | Yes | +----------------------------+------------------------+----------------+ +.. note:: + + If you are planning to use the NRI Plugin, you must use at least CRI-O version v1.34.0 or containerd version v1.7.30, v2.1.x and v2.2.x. + If you are not using the latest containerd version, check that both CDI and NRI are enabled in the containerd configuration file before deploying GPU Operator. +************************************************* Support for KubeVirt and OpenShift Virtualization -------------------------------------------------- +************************************************* Red Hat OpenShift Virtualization is based on KubeVirt. 
@@ -487,13 +538,12 @@ Operating System Kubernetes KubeVirt OpenShift Virtual \ \ | GPU vGPU | GPU vGPU | Passthrough | Passthrough ================ =========== ============= ========= ============= =========== -Ubuntu 20.04 LTS 1.30---1.35 0.36+ 0.59.1+ -Ubuntu 22.04 LTS 1.30---1.35 0.36+ 0.59.1+ -Red Hat Core OS 4.14---4.21 4.14---4.21 +Ubuntu 24.04 LTS 1.32---1.35 0.36+ +Ubuntu 22.04 LTS 1.32---1.35 0.36+ 0.59.1+ +Ubuntu 20.04 LTS 1.32---1.35 0.36+ 0.59.1+ +Red Hat Core OS 4.17---4.21 4.17---4.21 ================ =========== ============= ========= ============= =========== -Kubernetes v1.35 support was added in v25.10.1 and later. - You can run GPU passthrough and NVIDIA vGPU in the same cluster as long as you use a software version that meets both requirements. @@ -519,14 +569,15 @@ KubeVirt and OpenShift Virtualization with NVIDIA vGPU is supported on the follo The L40G GPU is excluded. -Note that HGX platforms are not supported. +- NVIDIA HGX GB200 NVL72, GB300 NVL72 on Ubuntu 24.04 LTS. .. note:: KubeVirt with NVIDIA vGPU is supported on ``nodes`` with Linux kernel < 6.0, such as Ubuntu 22.04 ``LTS``. +************************** Support for GPUDirect RDMA --------------------------- +************************** Supported operating systems and NVIDIA GPU Drivers with GPUDirect RDMA. @@ -534,19 +585,19 @@ Supported operating systems and NVIDIA GPU Drivers with GPUDirect RDMA. - Ubuntu 24.04 LTS with Network Operator 25.7.0. - Ubuntu 20.04 and 22.04 LTS with Network Operator 25.7.0. - Red Hat Enterprise Linux 9.2, 9.4, and 9.6 with Network Operator 25.7.0. -- Red Hat OpenShift 4.14 and higher with Network Operator 25.7.0. +- Red Hat OpenShift 4.17 and higher with Network Operator 25.7.0. For information about configuring GPUDirect RDMA, refer to :doc:`gpu-operator-rdma`. 
-
+*****************************
 Support for GPUDirect Storage
-----------------------------
+*****************************
 
 Supported operating systems and NVIDIA GPU Drivers with GPUDirect Storage.
 
- Ubuntu 24.04 LTS with Network Operator 25.7.0.
- Ubuntu 20.04 and 22.04 LTS with Network Operator 25.7.0.
-- Red Hat OpenShift Container Platform 4.14 and higher.
+- Red Hat OpenShift Container Platform 4.17 and higher.
 
 .. note::
 
@@ -560,8 +611,9 @@ Supported operating systems and NVIDIA GPU Drivers with GPUDirect Storage.
    Not supported with secure boot.
    Supported storage types are local NVMe and remote NFS.
 
+*******************************************
 Additional Supported Tools and Integrations
---------------------------------------------
+*******************************************
 
 Container management tools:
 
diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst
index d58ded54e..6d4260a66 100644
--- a/gpu-operator/release-notes.rst
+++ b/gpu-operator/release-notes.rst
@@ -33,6 +33,127 @@ Refer to the :ref:`GPU Operator Component Matrix` for a list of software compone
 
 ----
 
+.. _v26.3.0:
+
+26.3.0
+=======
+
+New Features
+------------
+
+* Updated software component versions:
+
+  - NVIDIA Driver Manager for Kubernetes v0.10.0
+  - NVIDIA Container Toolkit v1.19.0
+  - NVIDIA DCGM v4.5.2-1
+  - NVIDIA DCGM Exporter v4.5.1-4.8.0
+  - NVIDIA GDS Driver v2.27.3
+  - NVIDIA Kubernetes Device Plugin v0.19.0
+  - NVIDIA MIG Manager for Kubernetes v0.14.0
+  - NVIDIA GPU Feature Discovery for Kubernetes v0.19.0
+
+* Added support for these NVIDIA Data Center GPU Driver versions:
+
+  - 580.126.20 (default)
+
+* Added support for Node Resource Interface (NRI) Plugin.
+  This is a new way of injecting GPU management CDI devices into operands, replacing the ``nvidia`` runtime class.
+  Enable it by setting the ``cdi.nriPluginEnabled`` field to ``true`` in the ClusterPolicy custom resource or by setting the ``cdi.nriPluginEnabled`` flag in the Helm chart.
+ + When enabled, the Container Toolkit no longer modifies the container runtime configuration, such as the ``containerd.toml`` file, to inject GPUs into workload containers. Instead, the NRI Plugin injects GPUs into workload containers. + Additionally, the GPU Operator no longer requires setting values like ``CONTAINERD_CONFIG``, ``CONTAINERD_SOCKET``, or ``RUNTIME_CONFIG_SOURCE`` for the Container Toolkit on platforms such as K3s, k0s, and RKE. + + This feature requires containerd v1.7.30, v2.1.x, or v2.2.x. + If you are not using the latest containerd version, check that both CDI and NRI are enabled in the containerd configuration file before deploying GPU Operator. + + .. note:: + Enabling the NRI plugin is not supported with cri-o. + +* Added support for dynamic MIG config generation. + By default, the MIG Manager will automatically generate a per-node ConfigMap with the default MIG profiles for the available GPUs on the node. + This replaces the previous static ConfigMap. + You are still able to use a custom MIG configuration if you have specific requirements. + Refer to the :doc:`MIG Manager documentation ` for more information. + +* Added support for the NVIDIA Driver Custom Resource Definition (CRD). + Use this feature to configure multiple driver types and versions on different nodes or multiple operating system versions on nodes. + Refer to the :doc:`NVIDIA Driver Custom Resource Definition documentation ` for more information. + + .. note:: + This feature does not support an upgrade from an earlier version of the NVIDIA GPU Operator or switching from ClusterPolicy to the NVIDIA Driver CRD. + It is recommended that you only use this feature from new installations. + +* Added support for KubeVirt with GPU passthrough on Ubuntu 24.04 LTS + +* Added support for vGPU precompiled driver container for Azure Linux. + +* Added support for K3s. + +* Added support for containerd 2.2. + +* Added support for new MIG profiles with NVIDIA HGX GB300 NVL72. 
+
+* Added support for new operating systems:
+
+  - Rocky Linux 9.7
+  - Red Hat Enterprise Linux 10.0, 10.1
+  - Red Hat Enterprise Linux 9.7
+
+* Added support for including extra manifests with the Helm chart in the ``extraObjects`` field.
+
+* Added the ``sandboxWorkloads.mode`` field to help manage sandbox workloads, with ``kubevirt`` and ``kata`` as valid values.
+
+* Added support for the DCGM Exporter to expose a metric port on the host network namespace.
+  Enabled by setting ``hostNetwork: true`` in the ClusterPolicy custom resource, or passing ``--set dcgmExporter.hostNetwork=true`` to the Helm chart. (`PR #1962 `_)
+
+* Added PodSecurityContext support for DaemonSets (`PR #2120 `_).
+  In ClusterPolicy, set ``spec.daemonsets.podSecurityContext``; in NVIDIADriver, set ``spec.podSecurityContext``.
+
+* See `PR #2014 `_ for related changes.
+
+* Validated Operator government-ready component support with Rancher Kubernetes Engine 2 using Ubuntu 24.04 and VMware VSphere with Kubernetes Service using Ubuntu 24.04.
+
+Improvements
+------------
+
+* Improved NVIDIA Driver resiliency when the driver container is removed.
+  In previous versions, the NVIDIA Driver would unload the kernel modules and perform the driver compilation process, which could take several minutes to complete, delaying the driver container from restarting.
+  In v26.3.0, if there is no change to the CUDA driver version in the ClusterPolicy, the NVIDIA Driver will reuse the kernel modules that are available on the node.
+  This reduces the time to recover from the driver container removal from minutes to seconds.
+
+* Improved the NVIDIA Kubernetes Device Plugin to avoid unnecessary GPU unbind/rebind operations during rolling updates of the vfio-manager DaemonSet.
+  This improves the stability of GPU passthrough workloads (KubeVirt, Kata Containers).
+* Improved the Upgrade Controller to decrease unnecessary reconciliation in environments with Node Feature Discovery (NFD) enabled.
+* Reduced unnecessary API calls and sped up reconciliation on large GPU clusters by improving node label logic (`PR #2113 `_).
+
+* Improved the GPU Operator to now use operating system version labels from GPU worker nodes (added by NFD) when determining OS-specific paths for repository configuration files. (`PR #562 `_, `PR #2138 `_)
+
+
+Fixed Issues
+------------
+
+* Fixed an issue where driver installations can fail because cached packages were incorrectly referenced. (`PR #592 `_)
+
+* Fixed a shared state issue that caused incorrect driver images in multi-node-pool clusters. (`PR #1952 `_)
+
+* Fixed an issue where the GPU Operator was applying driver upgrade annotations when the driver is disabled. (`PR #1968 `_)
+
+* Fixed an issue where an empty value in the Helm chart for ``device.plugin`` was incorrectly causing an error. (`PR #1999 `_)
+
+* Fixed an issue on OpenShift clusters where the ``dcgm-exporter`` pod gets bound to another Security Context Constraint (SCC) object instead of the ``nvidia-dcgm-exporter`` SCC that the GPU Operator creates. (`PR #2122 `_)
+
+* Fixed an issue where the GPU Operator was not correctly cleaning up DaemonSets (`PR #2081 `_).
+
+* Fixed an issue where the GPU Operator was not adding a namespace to ServiceAccount objects. (`PR #2039 `_)
+
+
+Removals and Deprecations
+-------------------------
+
+* Marked unused field ``defaultRuntime`` as optional in the ClusterPolicy. (`PR #2000 `_)
+* The NVIDIA Kata Manager for Kubernetes is now deprecated.
+  To enable Kata Containers for GPUs, install the upstream kata-deploy Helm chart, which deploys all Kata runtime classes, including the NVIDIA-specific runtime classes.
+
 .. _v25.10.1:
 
@@ -137,7 +258,7 @@ New Features
 
    This differs from prior releases where CDI support in container runtimes was not used, and instead, an ``nvidia`` runtime class configured in CDI mode was used.
- When CDI is enabled, no configuration changes are required for standard workloads using GPU allocation through the Device Plugin. Setting ``runtimeClassName`` is not required for standard workloads. For workloads that already have ``runtimeClassName: nvidia`` set in their pod spec YAML, no change is necessary. - - GPU Management Containers that use the ``NVIDIA_VISIBLE_DEVICES`` environment variable to get GPU access, bypassing GPU allocation via the Device Plugin, must set ``runtimeClassName: nvidia`` in the pod specification. It's recommended that ``NVIDIA_VISIBLE_DEVICES`` only be used by GPU Management Containers. A GPU Management Container is a container that requires access to all GPUs without them being allocated by Kubernetes. Examples include monitoring agents and device plugins. + - GPU Management Containers that use the ``NVIDIA_VISIBLE_DEVICES`` environment variable to get GPU access, bypassing GPU allocation via the Device Plugin or DRA Driver for GPUs, must set ``runtimeClassName: nvidia`` in the pod specification. It's recommended that ``NVIDIA_VISIBLE_DEVICES`` only be used by GPU Management Containers. A GPU Management Container is a container that requires access to all GPUs without them being allocated by Kubernetes. Examples include monitoring agents and device plugins. - For OpenShift users upgrading to v25.10.0, we recommend updating the ``cdi.enabled`` field in ClusterPolicy to ``true`` post-upgrade. 
This field will not automatically be updated to ``true`` since the Operator Lifecycle Manager (OLM) does not mutate custom
diff --git a/gpu-operator/versions.json b/gpu-operator/versions.json
index c893de42f..d8e53c86c 100644
--- a/gpu-operator/versions.json
+++ b/gpu-operator/versions.json
@@ -1,7 +1,10 @@
 {
-  "latest": "25.10",
+  "latest": "26.3",
   "versions": [
+    {
+      "version": "26.3"
+    },
     {
       "version": "25.10"
     },
diff --git a/gpu-operator/versions1.json b/gpu-operator/versions1.json
index 3557db032..f29832ee9 100644
--- a/gpu-operator/versions1.json
+++ b/gpu-operator/versions1.json
@@ -1,6 +1,10 @@
 [
   {
     "preferred": "true",
+    "url": "../26.3",
+    "version": "26.3"
+  },
+  {
     "url": "../25.10",
     "version": "25.10"
   },
diff --git a/openshift/openshift-virtualization.rst b/openshift/openshift-virtualization.rst
index 3b125c36c..b1c966a19 100644
--- a/openshift/openshift-virtualization.rst
+++ b/openshift/openshift-virtualization.rst
@@ -660,6 +660,7 @@ If the node is not labeled, then the ``default`` configuration will be used.
 For more information on this component and how it is configured, refer to the project `README `_.
 
 By default, the GPU Operator deploys a ConfigMap for the vGPU Device Manager, containing named configurations for all `vGPU types `_ supported by NVIDIA vGPU.
+The GPU Operator only adds Q and C profiles in the default ConfigMap.
 Users can select a specific configuration for a worker node by applying the ``nvidia.com/vgpu.config`` node label.
 For example, labeling a node with ``nvidia.com/vgpu.config=A10-8Q`` would create 3 vGPU devices of type **A10-8Q** on all **A10** GPUs on the node (note: 3 is the maximum number of **A10-8Q** devices that can be created per GPU).
From 5548b1c2da9200ffaf8cbf11172957d1473a1a94 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 18 Mar 2026 10:19:46 -0400 Subject: [PATCH 2/4] Update release notes Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/release-notes.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 6d4260a66..6d764d06a 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -109,8 +109,6 @@ New Features * Added PodSecurityContext support for DaemonSets (`PR #2120 `_). In ClusterPolicy, set ``spec.daemonsets.podSecurityContext``; in NVIDIADriver, set ``spec.podSecurityContext``. -* See `PR #2014 `_ for related changes. - * Validated Operator government-ready component support with Rancher Kubernetes Engine 2 using Ubuntu 24.04 and VMware VSphere with Kubernetes Service using Ubuntu 24.04. Improvements @@ -121,13 +119,13 @@ Improvements In v26.3.0, if there is no change to the CUDA driver version in the ClusterPolicy, the NVIDIA Driver will reuse the kernel modules that are available on the node. This reduces the time to recover from the driver container removal from minteus to seconds. -* Improved the NVIDIA Kubernetes Device Plugin to avoid unnecessary GPU unbind/rebind operations during rolling updates of the vfio-manager DaemonSet. - This improves the stability of GPU passthrough workloads (KubeVirt, Kata Containers). * Improved the Upgrade Controller to decrease unnecessary reconciliation in environments with Node Feature Discovery (NFD) enabled. * Reduced unnecessary API calls and speeding reconciliation on large GPU clusters by improving node label logic (`PR #2113 `_). * Improved the GPU Operator to now use operating system version labels from GPU worker nodes (added by NFD) when determining OS-specific paths for repository configuration files. 
(`PR #562 `_, `PR #2138 `_) +* Driver validation now waits for all enabled additional drivers (such as GDS and GDRCopy) to be installed before proceeding, and each node records a node-local view of enabled features when using multiple NVIDIADriver CRs or optional components. (`PR #2014 `_) + Fixed Issues ------------ From 976da628ba1b11652a107d9f0b1e77d57805e47a Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 18 Mar 2026 10:34:07 -0400 Subject: [PATCH 3/4] Update version, release notes, nvidiadriver Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/gpu-driver-configuration.rst | 8 ++++---- gpu-operator/release-notes.rst | 2 +- repo.toml | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gpu-operator/gpu-driver-configuration.rst b/gpu-operator/gpu-driver-configuration.rst index 7b99cda17..87516131b 100644 --- a/gpu-operator/gpu-driver-configuration.rst +++ b/gpu-operator/gpu-driver-configuration.rst @@ -237,8 +237,8 @@ The following table describes some of the fields in the custom resource. * - ``version`` - Specifies the GPU driver version to install. - For a data-center driver, specify a value like ``535.104.12``. - If you set ``usePrecompiled`` to ``true``, specify the driver branch, such as ``535``. + For a data-center driver, specify a value like ``580.126.20``. + If you set ``usePrecompiled`` to ``true``, specify the driver branch, such as ``580``. - Refer to the :ref:`operator-component-matrix`. @@ -385,7 +385,7 @@ Precompiled Driver Container on Some Nodes .. code-block:: console $ kubectl label node --overwrite driver.precompiled="true" - $ kubectl label node --overwrite driver.version="535" + $ kubectl label node --overwrite driver.version="580" #. 
Create a file, such as ``nvd-precomiled-some.yaml``, with contents like the following: @@ -476,7 +476,7 @@ If the driver daemon sets and pods are not running as you expect, perform the fo Name: demo-precomp ... - Version: 535.104.05 + Version: 580.126.20 Status: Conditions: Last Transition Time: 2023-10-13T14:33:30Z diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index 6d764d06a..61058a2a7 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -76,7 +76,7 @@ New Features Refer to the :doc:`MIG Manager documentation ` for more information. * Added support for the NVIDIA Driver Custom Resource Definition (CRD). - Use this feature to configure multiple driver types and versions on different nodes or multiple operating system versions on nodes. + Use this feature on new cluster installations to configure multiple driver types and versions on different nodes or multiple operating system versions on nodes. Refer to the :doc:`NVIDIA Driver Custom Resource Definition documentation ` for more information. .. 
note:: diff --git a/repo.toml b/repo.toml index c0ef9ffd8..8e601f22b 100644 --- a/repo.toml +++ b/repo.toml @@ -169,8 +169,8 @@ output_format = "linkcheck" docs_root = "${root}/gpu-operator" project = "gpu-operator" name = "NVIDIA GPU Operator" -version = "25.10" # Update repo_docs.projects.openshift.version to match latest patch version maj.min.patch -source_substitutions = { minor_version = "25.10", version = "v25.10.1", recommended = "580.105.08", dra_version = "25.12.0" } +version = "26.3" # Update repo_docs.projects.openshift.version to match latest patch version maj.min.patch +source_substitutions = { minor_version = "26.3", version = "v26.3.0", recommended = "580.126.20", dra_version = "25.12.0" } copyright_start = 2020 sphinx_exclude_patterns = [ "life-cycle-policy.rst", From f9c178585ff5dc873e8e0c8311e1734f9b76b797 Mon Sep 17 00:00:00 2001 From: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> Date: Wed, 18 Mar 2026 10:48:56 -0400 Subject: [PATCH 4/4] Update nvd manifest Signed-off-by: Abigail McCarthy <20771501+a-mccarthy@users.noreply.github.com> --- gpu-operator/manifests/input/nvd-demo-gold.yaml | 2 +- gpu-operator/manifests/input/nvd-driver-multiple.yaml | 2 +- gpu-operator/manifests/input/nvd-precompiled-all.yaml | 2 +- gpu-operator/manifests/input/nvd-precompiled-some.yaml | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gpu-operator/manifests/input/nvd-demo-gold.yaml b/gpu-operator/manifests/input/nvd-demo-gold.yaml index 383e49932..267b3a4ed 100644 --- a/gpu-operator/manifests/input/nvd-demo-gold.yaml +++ b/gpu-operator/manifests/input/nvd-demo-gold.yaml @@ -32,4 +32,4 @@ spec: periodSeconds: 10 timeoutSeconds: 60 usePrecompiled: false - version: 535.104.12 + version: 580.126.20 diff --git a/gpu-operator/manifests/input/nvd-driver-multiple.yaml b/gpu-operator/manifests/input/nvd-driver-multiple.yaml index 94940296a..65635209a 100644 --- a/gpu-operator/manifests/input/nvd-driver-multiple.yaml +++ 
b/gpu-operator/manifests/input/nvd-driver-multiple.yaml @@ -12,7 +12,7 @@ spec: nodeSelector: driver.config: "gold" repository: nvcr.io/nvidia - version: "535.104.12" + version: "580.126.20" --- apiVersion: nvidia.com/v1alpha1 kind: NVIDIADriver diff --git a/gpu-operator/manifests/input/nvd-precompiled-all.yaml b/gpu-operator/manifests/input/nvd-precompiled-all.yaml index cd2760c5f..25efb9e7c 100644 --- a/gpu-operator/manifests/input/nvd-precompiled-all.yaml +++ b/gpu-operator/manifests/input/nvd-precompiled-all.yaml @@ -13,4 +13,4 @@ spec: repository: nvcr.io/nvidia resources: {} usePrecompiled: true - version: "535" + version: "580" diff --git a/gpu-operator/manifests/input/nvd-precompiled-some.yaml b/gpu-operator/manifests/input/nvd-precompiled-some.yaml index aa384f343..3089aa99c 100644 --- a/gpu-operator/manifests/input/nvd-precompiled-some.yaml +++ b/gpu-operator/manifests/input/nvd-precompiled-some.yaml @@ -11,8 +11,8 @@ spec: manager: {} nodeSelector: driver.precompiled: "true" - driver.version: "535" + driver.version: "580" repository: nvcr.io/nvidia resources: {} usePrecompiled: true - version: "535" + version: "580"