From c9f814232518c59d0d09d33cdabede448fa8e687 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 17 Mar 2026 19:37:52 +0100
Subject: [PATCH 1/9] cleanup nixl mounts

---
 src/cloudai/workloads/common/nixl.py          | 36 +++++++++++++++++++
 .../nixl_bench/slurm_command_gen_strategy.py  |  3 ++
 .../slurm_command_gen_strategy.py             |  3 ++
 .../test_command_gen_strategy_slurm.py        | 19 ++++++++++
 .../nixl_kvbench/test_command_gen_slurm.py    | 24 +++++++++++--
 5 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py
index fc35d6dab..be977e421 100644
--- a/src/cloudai/workloads/common/nixl.py
+++ b/src/cloudai/workloads/common/nixl.py
@@ -17,6 +17,7 @@
 
 import logging
 import re
+import shlex
 from functools import cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Final, Generic, TypeVar, cast
@@ -231,6 +232,41 @@ def _unique_file_name(self, file_name: str, used_filenames: set[str]) -> str:
         used_filenames.add(candidate)
         return candidate
 
+    def gen_cleanup_srun_command(self) -> list[str]:
+        cleanup_cmds = self._container_cleanup_commands()
+        if not cleanup_cmds:
+            return []
+
+        return [
+            *self.gen_srun_prefix(with_num_nodes=False),
+            "--overlap",
+            "--nodelist=$SLURM_JOB_MASTER_NODE",
+            "--ntasks-per-node=1",
+            "--ntasks=1",
+            "-N1",
+            "bash",
+            "-c",
+            f'"{"; ".join(cleanup_cmds)}"',
+        ]
+
+    def _container_cleanup_commands(self) -> list[str]:
+        cleanup_cmds: list[str] = []
+
+        filepath_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("filepath"))
+        if filepath_raw:
+            filepath = Path(filepath_raw)
+            if filepath == Path("/"):
+                logging.warning("Skipping filepath cleanup for '/': refusing to delete container root contents.")
+            else:
+                cleanup_cmds.append(f"rm -rf {shlex.quote(str(filepath))}")
+
+        device_list_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("device_list"))
+        if device_list_raw:
+            for device_path in get_files_from_device_list(device_list_raw):
+                cleanup_cmds.append(f"rm -rf {shlex.quote(str(device_path))}")
+
+        return cleanup_cmds
+
     @property
     def final_env_vars(self) -> dict[str, str | list[str]]:
         env_vars = super().final_env_vars
diff --git a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
index 7d0995e6f..19f4f4abf 100644
--- a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
@@ -36,6 +36,7 @@ def _gen_srun_command(self) -> str:
         nixl_commands = self.gen_nixlbench_srun_commands(
             self.gen_nixlbench_command(), str(self.tdef.cmd_args_dict.get("backend", "unset"))
         )
+        cleanup_command = self.gen_cleanup_srun_command()
         self._current_image_url = None
 
         commands: list[str] = [
@@ -46,6 +47,8 @@ def _gen_srun_command(self) -> str:
             " ".join(nixl_commands[-1]),
             " ".join(self.gen_kill_and_wait_cmd("etcd_pid")),
         ]
+        if cleanup_command:
+            commands.insert(-1, " ".join(cleanup_command))
         return "\n".join(commands)
 
     def gen_nixlbench_command(self) -> list[str]:
diff --git a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
index f695c76f0..86d30e47d 100644
--- a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
@@ -39,6 +39,7 @@ def _gen_srun_command(self) -> str:
         kvbench_commands = self.gen_nixlbench_srun_commands(
             self.gen_kvbench_command(), str(self.tdef.cmd_args.backend or "unset")
         )
+        cleanup_command = self.gen_cleanup_srun_command()
         self._current_image_url = None
 
         self.create_env_vars_file()
@@ -51,6 +52,8 @@ def _gen_srun_command(self) -> str:
             " ".join(kvbench_commands[-1]),
             " ".join(self.gen_kill_and_wait_cmd("etcd_pid")),
         ]
+        if cleanup_command:
+            final_cmd.insert(-1, " ".join(cleanup_command))
         return "\n".join(final_cmd)
 
     def gen_kvbench_command(self) -> list[str]:
diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
index f63802e6e..57a6ee50b 100644
--- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
@@ -105,6 +105,25 @@ def test_container_mounts(self, nixl_bench_tr: TestRun, slurm_system: SlurmSyste
             assert (nixl_bench_tr.output_path / "device_list_mounts" / local_device_filename).is_file()
             assert (nixl_bench_tr.output_path / "device_list_mounts" / local_device_filename).stat().st_size == 1024
 
+    def test_cleanup_srun_command(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+        nixl_bench_tr.test.cmd_args = NIXLBenchCmdArgs.model_validate(
+            {
+                "docker_image_url": "docker.io/library/ubuntu:22.04",
+                "path_to_benchmark": "/nixlbench",
+                "backend": "GUSLI",
+                "device_list": "11:K:/dev/nvme0n1,12:F:/p1/store0.bin,13:F:/p2/store0.bin",
+                "filepath": "/data",
+            }
+        )
+        strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
+        strategy._current_image_url = str(cast(NIXLBenchTestDefinition, nixl_bench_tr.test).docker_image.installed_path)
+
+        cleanup_cmd = " ".join(strategy.gen_cleanup_srun_command())
+
+        assert "rm -rf /data" in cleanup_cmd
+        assert "rm -rf /p1/store0.bin" in cleanup_cmd
+        assert "rm -rf /p2/store0.bin" in cleanup_cmd
+
     @pytest.mark.parametrize(
         ("override", "expected_error_match", "expected_total_buffer_size"),
         (
diff --git a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
index e9c595828..354cb19fd 100644
--- a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
+++ b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
@@ -38,8 +38,10 @@ def kvbench() -> NIXLKVBenchTestDefinition:
 
 
 @pytest.fixture
-def kvbench_tr(kvbench: NIXLKVBenchTestDefinition) -> TestRun:
-    return TestRun(name="nixl-bench", num_nodes=2, nodes=[], test=kvbench)
+def kvbench_tr(kvbench: NIXLKVBenchTestDefinition, tmp_path) -> TestRun:
+    output_path = tmp_path / "nixl-kvbench"
+    output_path.mkdir(parents=True, exist_ok=True)
+    return TestRun(name="nixl-bench", num_nodes=2, nodes=[], test=kvbench, output_path=output_path)
 
 
 def test_gen_kvbench_ucx(kvbench_tr: TestRun, slurm_system: SlurmSystem):
@@ -124,3 +126,21 @@ def test_get_etcd_srun_command_with_etcd_image(kvbench_tr: TestRun, slurm_system
     cmd = " ".join(strategy.gen_etcd_srun_command(tdef.cmd_args.etcd_path))
     assert tdef.etcd_image is not None
     assert f"--container-image={tdef.etcd_image.installed_path}" in cmd
+
+
+def test_kvbench_cleanup_srun_command_uses_container_paths(kvbench_tr: TestRun, slurm_system: SlurmSystem):
+    kvbench_tr.test.cmd_args = NIXLKVBenchCmdArgs.model_validate(
+        {
+            "docker_image_url": "docker://image/url",
+            "backend": "GUSLI",
+            "filepath": "/data",
+            "device_list": "11:F:/store0.bin",
+        }
+    )
+    strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
+    strategy._current_image_url = str(cast(NIXLKVBenchTestDefinition, kvbench_tr.test).docker_image.installed_path)
+
+    cmd = " ".join(strategy.gen_cleanup_srun_command())
+
+    assert "rm -rf /data" in cmd
+    assert "rm -rf /store0.bin" in cmd

From 2a501dc79a1f9643d8f8a46e0b06750cb8379e66 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Thu, 19 Mar 2026 17:20:55 +0100
Subject: [PATCH 2/9] using host os filepaths for cleanup

---
 src/cloudai/workloads/common/nixl.py          | 35 ++++++-------------
 .../nixl_bench/slurm_command_gen_strategy.py  |  2 +-
 .../slurm_command_gen_strategy.py             |  2 +-
 .../test_command_gen_strategy_slurm.py        | 33 +++++++++++++----
 .../nixl_kvbench/test_command_gen_slurm.py    | 32 ++++++++++++++---
 5 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py
index be977e421..4a81c470f 100644
--- a/src/cloudai/workloads/common/nixl.py
+++ b/src/cloudai/workloads/common/nixl.py
@@ -232,40 +232,25 @@ def _unique_file_name(self, file_name: str, used_filenames: set[str]) -> str:
         used_filenames.add(candidate)
         return candidate
 
-    def gen_cleanup_srun_command(self) -> list[str]:
-        cleanup_cmds = self._container_cleanup_commands()
-        if not cleanup_cmds:
+    def gen_cleanup_command(self) -> list[str]:
+        cleanup_targets = self._cleanup_targets()
+        if not cleanup_targets:
             return []
 
-        return [
-            *self.gen_srun_prefix(with_num_nodes=False),
-            "--overlap",
-            "--nodelist=$SLURM_JOB_MASTER_NODE",
-            "--ntasks-per-node=1",
-            "--ntasks=1",
-            "-N1",
-            "bash",
-            "-c",
-            f'"{"; ".join(cleanup_cmds)}"',
-        ]
+        return ["rm", "-rf", *(shlex.quote(path) for path in cleanup_targets)]
 
-    def _container_cleanup_commands(self) -> list[str]:
-        cleanup_cmds: list[str] = []
+    def _cleanup_targets(self) -> list[str]:
+        cleanup_targets: list[str] = []
 
         filepath_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("filepath"))
         if filepath_raw:
-            filepath = Path(filepath_raw)
-            if filepath == Path("/"):
-                logging.warning("Skipping filepath cleanup for '/': refusing to delete container root contents.")
-            else:
-                cleanup_cmds.append(f"rm -rf {shlex.quote(str(filepath))}")
+            cleanup_targets.append(str((self.test_run.output_path / "filepath_mount").resolve()))
 
         device_list_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("device_list"))
-        if device_list_raw:
-            for device_path in get_files_from_device_list(device_list_raw):
-                cleanup_cmds.append(f"rm -rf {shlex.quote(str(device_path))}")
+        if device_list_raw and get_files_from_device_list(device_list_raw):
+            cleanup_targets.append(str((self.test_run.output_path / "device_list_mounts").resolve()))
 
-        return cleanup_cmds
+        return cleanup_targets
 
     @property
     def final_env_vars(self) -> dict[str, str | list[str]]:
diff --git a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
index 19f4f4abf..8cb0f14bc 100644
--- a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
@@ -36,7 +36,7 @@ def _gen_srun_command(self) -> str:
         nixl_commands = self.gen_nixlbench_srun_commands(
             self.gen_nixlbench_command(), str(self.tdef.cmd_args_dict.get("backend", "unset"))
         )
-        cleanup_command = self.gen_cleanup_srun_command()
+        cleanup_command = self.gen_cleanup_command()
         self._current_image_url = None
 
         commands: list[str] = [
diff --git a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
index 86d30e47d..73be5b9c7 100644
--- a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
@@ -39,7 +39,7 @@ def _gen_srun_command(self) -> str:
         kvbench_commands = self.gen_nixlbench_srun_commands(
             self.gen_kvbench_command(), str(self.tdef.cmd_args.backend or "unset")
         )
-        cleanup_command = self.gen_cleanup_srun_command()
+        cleanup_command = self.gen_cleanup_command()
         self._current_image_url = None
 
         self.create_env_vars_file()
diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
index 57a6ee50b..5497b382a 100644
--- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
@@ -105,7 +105,7 @@ def test_container_mounts(self, nixl_bench_tr: TestRun, slurm_system: SlurmSyste
             assert (nixl_bench_tr.output_path / "device_list_mounts" / local_device_filename).is_file()
             assert (nixl_bench_tr.output_path / "device_list_mounts" / local_device_filename).stat().st_size == 1024
 
-    def test_cleanup_srun_command(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+    def test_cleanup_command_uses_host_paths(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
         nixl_bench_tr.test.cmd_args = NIXLBenchCmdArgs.model_validate(
             {
                 "docker_image_url": "docker.io/library/ubuntu:22.04",
@@ -116,13 +116,32 @@ def test_cleanup_srun_command(self, nixl_bench_tr: TestRun, slurm_system: SlurmS
             }
         )
         strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
-        strategy._current_image_url = str(cast(NIXLBenchTestDefinition, nixl_bench_tr.test).docker_image.installed_path)
 
-        cleanup_cmd = " ".join(strategy.gen_cleanup_srun_command())
+        cleanup_cmd = " ".join(strategy.gen_cleanup_command())
+        filepath_dir = nixl_bench_tr.output_path / "filepath_mount"
+        device_list_dir = nixl_bench_tr.output_path / "device_list_mounts"
+        assert cleanup_cmd == f"rm -rf {filepath_dir} {device_list_dir}"
 
-        assert "rm -rf /data" in cleanup_cmd
-        assert "rm -rf /p1/store0.bin" in cleanup_cmd
-        assert "rm -rf /p2/store0.bin" in cleanup_cmd
+    def test_gen_cleanup_command_empty_without_storage_args(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+        strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
+        assert strategy.gen_cleanup_command() == []
+
+    def test_gen_srun_command_includes_host_cleanup(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+        nixl_bench_tr.test.cmd_args = NIXLBenchCmdArgs.model_validate(
+            {
+                "docker_image_url": "docker.io/library/ubuntu:22.04",
+                "path_to_benchmark": "/nixlbench",
+                "backend": "GUSLI",
+                "device_list": "11:F:/store0.bin",
+                "filepath": "/data",
+            }
+        )
+        strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
+
+        cleanup_cmd = " ".join(strategy.gen_cleanup_command())
+        cmd = strategy._gen_srun_command()
+
+        assert cleanup_cmd in cmd
 
     @pytest.mark.parametrize(
         ("override", "expected_error_match", "expected_total_buffer_size"),
@@ -240,7 +259,7 @@ def test_gen_nixl_srun_command(
                 assert "--nodelist=$SLURM_JOB_MASTER_NODE" in cmd
 
 
-def test_gen_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+def test_gen_wait_for_etcd_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
     strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
     cmd = strategy.gen_wait_for_etcd_command()
     assert cmd == [
diff --git a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
index 354cb19fd..6c3b744dd 100644
--- a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
+++ b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
@@ -128,7 +128,7 @@ def test_get_etcd_srun_command_with_etcd_image(kvbench_tr: TestRun, slurm_system
     assert f"--container-image={tdef.etcd_image.installed_path}" in cmd
 
 
-def test_kvbench_cleanup_srun_command_uses_container_paths(kvbench_tr: TestRun, slurm_system: SlurmSystem):
+def test_kvbench_cleanup_command_uses_host_paths(kvbench_tr: TestRun, slurm_system: SlurmSystem):
     kvbench_tr.test.cmd_args = NIXLKVBenchCmdArgs.model_validate(
         {
             "docker_image_url": "docker://image/url",
@@ -138,9 +138,31 @@ def test_kvbench_cleanup_srun_command_uses_container_paths(kvbench_tr: TestRun,
         }
     )
     strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
-    strategy._current_image_url = str(cast(NIXLKVBenchTestDefinition, kvbench_tr.test).docker_image.installed_path)
 
-    cmd = " ".join(strategy.gen_cleanup_srun_command())
+    cmd = " ".join(strategy.gen_cleanup_command())
+    filepath_dir = kvbench_tr.output_path / "filepath_mount"
+    device_list_dir = kvbench_tr.output_path / "device_list_mounts"
+    assert cmd == f"rm -rf {filepath_dir} {device_list_dir}"
 
-    assert "rm -rf /data" in cmd
-    assert "rm -rf /store0.bin" in cmd
+
+def test_kvbench_gen_cleanup_command_empty_without_storage_args(kvbench_tr: TestRun, slurm_system: SlurmSystem):
+    strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
+
+    assert strategy.gen_cleanup_command() == []
+
+
+def test_kvbench_gen_srun_command_includes_host_cleanup(kvbench_tr: TestRun, slurm_system: SlurmSystem):
+    kvbench_tr.test.cmd_args = NIXLKVBenchCmdArgs.model_validate(
+        {
+            "docker_image_url": "docker://image/url",
+            "backend": "GUSLI",
+            "filepath": "/data",
+            "device_list": "11:F:/store0.bin",
+        }
+    )
+    strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
+
+    cleanup_cmd = " ".join(strategy.gen_cleanup_command())
+    cmd = strategy._gen_srun_command()
+
+    assert cleanup_cmd in cmd

From 271cc885594e58d0577ecf7d6e4ba6fb8c8cc327 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Thu, 19 Mar 2026 18:48:32 +0100
Subject: [PATCH 3/9] change commands order

---
 src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py  | 2 +-
 .../workloads/nixl_kvbench/slurm_command_gen_strategy.py        | 2 +-
 tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py   | 1 +
 tests/workloads/nixl_kvbench/test_command_gen_slurm.py          | 1 +
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
index 8cb0f14bc..218023d12 100644
--- a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
@@ -48,7 +48,7 @@ def _gen_srun_command(self) -> str:
             " ".join(self.gen_kill_and_wait_cmd("etcd_pid")),
         ]
         if cleanup_command:
-            commands.insert(-1, " ".join(cleanup_command))
+            commands.append(" ".join(cleanup_command))
         return "\n".join(commands)
 
     def gen_nixlbench_command(self) -> list[str]:
diff --git a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
index 73be5b9c7..9285b0fe6 100644
--- a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
@@ -53,7 +53,7 @@ def _gen_srun_command(self) -> str:
             " ".join(self.gen_kill_and_wait_cmd("etcd_pid")),
         ]
         if cleanup_command:
-            final_cmd.insert(-1, " ".join(cleanup_command))
+            final_cmd.append(" ".join(cleanup_command))
         return "\n".join(final_cmd)
 
     def gen_kvbench_command(self) -> list[str]:
diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
index 5497b382a..624a31132 100644
--- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
@@ -142,6 +142,7 @@ def test_gen_srun_command_includes_host_cleanup(self, nixl_bench_tr: TestRun, sl
         cmd = strategy._gen_srun_command()
 
         assert cleanup_cmd in cmd
+        assert cmd.rfind("kill -TERM $etcd_pid") < cmd.rfind(cleanup_cmd)
 
     @pytest.mark.parametrize(
         ("override", "expected_error_match", "expected_total_buffer_size"),
diff --git a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
index 6c3b744dd..dd537f7ba 100644
--- a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
+++ b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
@@ -166,3 +166,4 @@ def test_kvbench_gen_srun_command_includes_host_cleanup(kvbench_tr: TestRun, slu
     cmd = strategy._gen_srun_command()
 
     assert cleanup_cmd in cmd
+    assert cmd.rfind("kill -TERM $etcd_pid") < cmd.rfind(cleanup_cmd)

From 386edb2a372e4bbb378e34638e0b84c34216bec2 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Fri, 20 Mar 2026 11:15:22 +0100
Subject: [PATCH 4/9] using python for cleanup

---
 src/cloudai/_core/command_gen_strategy.py     |  4 +++
 .../systems/slurm/single_sbatch_runner.py     |  5 +++-
 src/cloudai/systems/slurm/slurm_runner.py     |  8 ++++++
 src/cloudai/workloads/common/nixl.py          | 20 +++++++-------
 .../nixl_bench/slurm_command_gen_strategy.py  |  3 ---
 .../slurm_command_gen_strategy.py             |  3 ---
 tests/test_get_job_id.py                      | 19 +++++++++++--
 tests/test_single_sbatch_runner.py            | 27 ++++++++++++++++++-
 .../test_command_gen_strategy_slurm.py        | 26 +++++++++---------
 .../nixl_kvbench/test_command_gen_slurm.py    | 26 +++++++++---------
 10 files changed, 95 insertions(+), 46 deletions(-)

diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py
index 5238bb675..0268fbca5 100644
--- a/src/cloudai/_core/command_gen_strategy.py
+++ b/src/cloudai/_core/command_gen_strategy.py
@@ -49,6 +49,10 @@ def store_test_run(self) -> None:
         """
         pass
 
+    def cleanup_job_artifacts(self) -> None:
+        """Best-effort cleanup hook run after the job has fully completed."""
+        return
+
     @property
     def final_env_vars(self) -> dict[str, str | list[str]]:
         if not self._final_env_vars:
diff --git a/src/cloudai/systems/slurm/single_sbatch_runner.py b/src/cloudai/systems/slurm/single_sbatch_runner.py
index 2ea28d554..31865b433 100644
--- a/src/cloudai/systems/slurm/single_sbatch_runner.py
+++ b/src/cloudai/systems/slurm/single_sbatch_runner.py
@@ -22,7 +22,7 @@
 from typing import Generator, Optional, cast
 
 from cloudai.configurator.cloudai_gym import CloudAIGymEnv
-from cloudai.core import JobIdRetrievalError, System, TestRun, TestScenario
+from cloudai.core import BaseJob, JobIdRetrievalError, System, TestRun, TestScenario
 from cloudai.util import CommandShell, format_time_limit, parse_time_limit
 
 from .slurm_command_gen_strategy import SlurmCommandGenStrategy
@@ -214,6 +214,9 @@ def handle_dse(self):
                 reward = gym.compute_reward(observation)
                 gym.write_trajectory(idx, combination, reward, observation)
 
+    def completed_test_runs(self, job: BaseJob) -> list[TestRun]:
+        return list(self.all_trs)
+
     def _submit_test(self, tr: TestRun) -> SlurmJob:
         with open(self.scenario_root / "cloudai_sbatch_script.sh", "w") as f:
             f.write(self.gen_sbatch_content())
diff --git a/src/cloudai/systems/slurm/slurm_runner.py b/src/cloudai/systems/slurm/slurm_runner.py
index 50a70082d..fd8f0902e 100644
--- a/src/cloudai/systems/slurm/slurm_runner.py
+++ b/src/cloudai/systems/slurm/slurm_runner.py
@@ -77,10 +77,18 @@ def on_job_submit(self, tr: TestRun) -> None:
         cmd_gen = self.get_cmd_gen_strategy(self.system, tr)
         cmd_gen.store_test_run()
 
+    def completed_test_runs(self, job: BaseJob) -> list[TestRun]:
+        return [cast(SlurmJob, job).test_run]
+
     def on_job_completion(self, job: BaseJob) -> None:
         logging.debug(f"Job completion callback for job {job.id}")
         self.system.complete_job(cast(SlurmJob, job))
         self.store_job_metadata(cast(SlurmJob, job))
+        for tr in self.completed_test_runs(job):
+            try:
+                self.get_cmd_gen_strategy(self.system, tr).cleanup_job_artifacts()
+            except Exception:
+                logging.warning(f"Cleanup failed for test run at {tr.output_path}", exc_info=True)
 
     def _mock_job_metadata(self) -> SlurmStepMetadata:
         return SlurmStepMetadata(
diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py
index 4a81c470f..950f91707 100644
--- a/src/cloudai/workloads/common/nixl.py
+++ b/src/cloudai/workloads/common/nixl.py
@@ -17,7 +17,7 @@
 
 import logging
 import re
-import shlex
+import shutil
 from functools import cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Final, Generic, TypeVar, cast
@@ -232,23 +232,21 @@ def _unique_file_name(self, file_name: str, used_filenames: set[str]) -> str:
         used_filenames.add(candidate)
         return candidate
 
-    def gen_cleanup_command(self) -> list[str]:
-        cleanup_targets = self._cleanup_targets()
-        if not cleanup_targets:
-            return []
-
-        return ["rm", "-rf", *(shlex.quote(path) for path in cleanup_targets)]
+    def cleanup_job_artifacts(self) -> None:
+        for cleanup_target in self._cleanup_targets():
+            if cleanup_target.exists():
+                shutil.rmtree(cleanup_target)
 
-    def _cleanup_targets(self) -> list[str]:
-        cleanup_targets: list[str] = []
+    def _cleanup_targets(self) -> list[Path]:
+        cleanup_targets: list[Path] = []
 
         filepath_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("filepath"))
         if filepath_raw:
-            cleanup_targets.append(str((self.test_run.output_path / "filepath_mount").resolve()))
+            cleanup_targets.append((self.test_run.output_path / "filepath_mount").resolve())
 
         device_list_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("device_list"))
         if device_list_raw and get_files_from_device_list(device_list_raw):
-            cleanup_targets.append(str((self.test_run.output_path / "device_list_mounts").resolve()))
+            cleanup_targets.append((self.test_run.output_path / "device_list_mounts").resolve())
 
         return cleanup_targets
 
diff --git a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
index 218023d12..7d0995e6f 100644
--- a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py
@@ -36,7 +36,6 @@ def _gen_srun_command(self) -> str:
         nixl_commands = self.gen_nixlbench_srun_commands(
             self.gen_nixlbench_command(), str(self.tdef.cmd_args_dict.get("backend", "unset"))
         )
-        cleanup_command = self.gen_cleanup_command()
         self._current_image_url = None
 
         commands: list[str] = [
@@ -47,8 +46,6 @@ def _gen_srun_command(self) -> str:
             " ".join(nixl_commands[-1]),
             " ".join(self.gen_kill_and_wait_cmd("etcd_pid")),
         ]
-        if cleanup_command:
-            commands.append(" ".join(cleanup_command))
         return "\n".join(commands)
 
     def gen_nixlbench_command(self) -> list[str]:
diff --git a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
index 9285b0fe6..f695c76f0 100644
--- a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py
@@ -39,7 +39,6 @@ def _gen_srun_command(self) -> str:
         kvbench_commands = self.gen_nixlbench_srun_commands(
             self.gen_kvbench_command(), str(self.tdef.cmd_args.backend or "unset")
         )
-        cleanup_command = self.gen_cleanup_command()
         self._current_image_url = None
 
         self.create_env_vars_file()
@@ -52,8 +51,6 @@ def _gen_srun_command(self) -> str:
             " ".join(kvbench_commands[-1]),
             " ".join(self.gen_kill_and_wait_cmd("etcd_pid")),
         ]
-        if cleanup_command:
-            final_cmd.append(" ".join(cleanup_command))
         return "\n".join(final_cmd)
 
     def gen_kvbench_command(self) -> list[str]:
diff --git a/tests/test_get_job_id.py b/tests/test_get_job_id.py
index 260593dea..ecdf6ced3 100644
--- a/tests/test_get_job_id.py
+++ b/tests/test_get_job_id.py
@@ -16,14 +16,14 @@
 
 import subprocess
 from pathlib import Path
-from unittest.mock import Mock
+from unittest.mock import Mock, patch
 
 import pytest
 
 from cloudai.core import JobIdRetrievalError, TestRun, TestScenario
 from cloudai.systems.lsf.lsf_runner import LSFRunner
 from cloudai.systems.lsf.lsf_system import LSFSystem
-from cloudai.systems.slurm import SlurmRunner, SlurmSystem
+from cloudai.systems.slurm import SlurmJob, SlurmRunner, SlurmSystem
 from cloudai.util import CommandShell
 from cloudai.workloads.sleep.sleep import SleepCmdArgs, SleepTestDefinition
 
@@ -88,6 +88,21 @@ def test_slurm_get_job_id(slurm_runner: SlurmRunner, stdout: str, stderr: str, e
     assert res == expected_job_id
 
 
+def test_slurm_runner_on_job_completion_calls_cleanup(slurm_runner: SlurmRunner):
+    tr = slurm_runner.test_scenario.test_runs[0]
+    job = SlurmJob(tr, id=1)
+    slurm_runner.store_job_metadata = Mock()
+    cleanup = Mock()
+    slurm_runner.get_cmd_gen_strategy = Mock(return_value=Mock(cleanup_job_artifacts=cleanup))
+
+    with patch.object(SlurmSystem, "complete_job") as complete_job:
+        slurm_runner.on_job_completion(job)
+
+    complete_job.assert_called_once_with(job)
+    slurm_runner.store_job_metadata.assert_called_once_with(job)
+    cleanup.assert_called_once()
+
+
 @pytest.mark.parametrize(
     "stdout, stderr, expected_job_id",
     [
diff --git a/tests/test_single_sbatch_runner.py b/tests/test_single_sbatch_runner.py
index 72ad93f79..91d3cdf27 100644
--- a/tests/test_single_sbatch_runner.py
+++ b/tests/test_single_sbatch_runner.py
@@ -16,8 +16,9 @@
 
 import copy
 import re
+from pathlib import Path
 from typing import Generator, Optional, cast
-from unittest.mock import Mock
+from unittest.mock import Mock, patch
 
 import pandas as pd
 import pytest
@@ -506,6 +507,30 @@ def test_store_job_metadata(nccl_tr: TestRun, slurm_system: SlurmSystem) -> None
     assert sjm == SlurmJobMetadata.model_validate(toml.loads(toml.dumps(sjm.model_dump())))
 
 
+def test_on_job_completion_cleans_all_effective_test_runs(
+    dse_tr: TestRun, nccl_tr: TestRun, slurm_system: SlurmSystem
+) -> None:
+    tc = TestScenario(name="tc", test_runs=[dse_tr, nccl_tr])
+    runner = SingleSbatchRunner(mode="run", system=slurm_system, test_scenario=tc, output_path=slurm_system.output_path)
+    runner.mode = "dry-run"
+    runner.store_job_metadata = Mock()
+
+    cleanup_calls: list[Path] = []
+
+    def _cmd_gen(_, tr: TestRun):
+        return Mock(cleanup_job_artifacts=Mock(side_effect=lambda: cleanup_calls.append(tr.output_path)))
+
+    runner.get_cmd_gen_strategy = Mock(side_effect=_cmd_gen)
+
+    expected_paths = [tr.output_path for tr in runner.all_trs]
+    job = SlurmJob(nccl_tr, id=1)
+
+    with patch.object(SlurmSystem, "complete_job"):
+        runner.on_job_completion(job)
+
+    assert cleanup_calls == expected_paths
+
+
 def test_pre_test(nccl_tr: TestRun, sleep_tr: TestRun, slurm_system: SlurmSystem) -> None:
     nccl_tr.pre_test = TestScenario(name="pre_test", test_runs=[sleep_tr])
     tc = TestScenario(name="tc", test_runs=[nccl_tr])
diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
index 624a31132..ce2e26a83 100644
--- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
@@ -105,7 +105,7 @@ def test_container_mounts(self, nixl_bench_tr: TestRun, slurm_system: SlurmSyste
             assert (nixl_bench_tr.output_path / "device_list_mounts" / local_device_filename).is_file()
             assert (nixl_bench_tr.output_path / "device_list_mounts" / local_device_filename).stat().st_size == 1024
 
-    def test_cleanup_command_uses_host_paths(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+    def test_cleanup_job_artifacts(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
         nixl_bench_tr.test.cmd_args = NIXLBenchCmdArgs.model_validate(
             {
                 "docker_image_url": "docker.io/library/ubuntu:22.04",
@@ -116,17 +116,22 @@ def test_cleanup_command_uses_host_paths(self, nixl_bench_tr: TestRun, slurm_sys
             }
         )
         strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
-
-        cleanup_cmd = " ".join(strategy.gen_cleanup_command())
         filepath_dir = nixl_bench_tr.output_path / "filepath_mount"
         device_list_dir = nixl_bench_tr.output_path / "device_list_mounts"
-        assert cleanup_cmd == f"rm -rf {filepath_dir} {device_list_dir}"
+        other_file = nixl_bench_tr.output_path / "keep.txt"
+        filepath_dir.mkdir(parents=True, exist_ok=True)
+        device_list_dir.mkdir(parents=True, exist_ok=True)
+        (filepath_dir / "a.txt").write_text("x")
+        (device_list_dir / "b.txt").write_text("x")
+        other_file.write_text("keep")
 
-    def test_gen_cleanup_command_empty_without_storage_args(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
-        strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
-        assert strategy.gen_cleanup_command() == []
+        strategy.cleanup_job_artifacts()
 
-    def test_gen_srun_command_includes_host_cleanup(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+        assert not filepath_dir.exists()
+        assert not device_list_dir.exists()
+        assert other_file.exists()
+
+    def test_gen_srun_command_excludes_cleanup(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
         nixl_bench_tr.test.cmd_args = NIXLBenchCmdArgs.model_validate(
             {
                 "docker_image_url": "docker.io/library/ubuntu:22.04",
@@ -137,12 +142,9 @@ def test_gen_srun_command_includes_host_cleanup(self, nixl_bench_tr: TestRun, sl
             }
         )
         strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
-
-        cleanup_cmd = " ".join(strategy.gen_cleanup_command())
         cmd = strategy._gen_srun_command()
 
-        assert cleanup_cmd in cmd
-        assert cmd.rfind("kill -TERM $etcd_pid") < cmd.rfind(cleanup_cmd)
+        assert "rm -rf " not in cmd
 
     @pytest.mark.parametrize(
         ("override", "expected_error_match", "expected_total_buffer_size"),
diff --git a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
index dd537f7ba..e09e3f78c 100644
--- a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
+++ b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
@@ -128,7 +128,7 @@ def test_get_etcd_srun_command_with_etcd_image(kvbench_tr: TestRun, slurm_system
     assert f"--container-image={tdef.etcd_image.installed_path}" in cmd
 
 
-def test_kvbench_cleanup_command_uses_host_paths(kvbench_tr: TestRun, slurm_system: SlurmSystem):
+def test_kvbench_cleanup_job_artifacts(kvbench_tr: TestRun, slurm_system: SlurmSystem):
     kvbench_tr.test.cmd_args = NIXLKVBenchCmdArgs.model_validate(
         {
             "docker_image_url": "docker://image/url",
@@ -138,20 +138,23 @@ def test_kvbench_cleanup_command_uses_host_paths(kvbench_tr: TestRun, slurm_syst
         }
     )
     strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
-
-    cmd = " ".join(strategy.gen_cleanup_command())
     filepath_dir = kvbench_tr.output_path / "filepath_mount"
     device_list_dir = kvbench_tr.output_path / "device_list_mounts"
-    assert cmd == f"rm -rf {filepath_dir} {device_list_dir}"
-
+    other_file = kvbench_tr.output_path / "keep.txt"
+    filepath_dir.mkdir(parents=True, exist_ok=True)
+    device_list_dir.mkdir(parents=True, exist_ok=True)
+    (filepath_dir / "a.txt").write_text("x")
+    (device_list_dir / "b.txt").write_text("x")
+    other_file.write_text("keep")
 
-def test_kvbench_gen_cleanup_command_empty_without_storage_args(kvbench_tr: TestRun, slurm_system: SlurmSystem):
-    strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
+    strategy.cleanup_job_artifacts()
 
-    assert strategy.gen_cleanup_command() == []
+    assert not filepath_dir.exists()
+    assert not device_list_dir.exists()
+    assert other_file.exists()
 
 
-def test_kvbench_gen_srun_command_includes_host_cleanup(kvbench_tr: TestRun, slurm_system: SlurmSystem):
+def test_kvbench_gen_srun_command_excludes_cleanup(kvbench_tr: TestRun, slurm_system: SlurmSystem):
     kvbench_tr.test.cmd_args = NIXLKVBenchCmdArgs.model_validate(
         {
             "docker_image_url": "docker://image/url",
@@ -161,9 +164,6 @@ def test_kvbench_gen_srun_command_includes_host_cleanup(kvbench_tr: TestRun, slu
         }
     )
     strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
-
-    cleanup_cmd = " ".join(strategy.gen_cleanup_command())
     cmd = strategy._gen_srun_command()
 
-    assert cleanup_cmd in cmd
-    assert cmd.rfind("kill -TERM $etcd_pid") < cmd.rfind(cleanup_cmd)
+    assert "rm -rf " not in cmd

From 409854622df79cfc2c5c6cd8139515fc3b3de963 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Fri, 20 Mar 2026 15:00:26 +0100
Subject: [PATCH 5/9] remove redundant tests

---
 .../nixl_bench/test_command_gen_strategy_slurm.py | 15 ---------------
 .../nixl_kvbench/test_command_gen_slurm.py        | 15 ---------------
 2 files changed, 30 deletions(-)

diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
index ce2e26a83..e984855c0 100644
--- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
@@ -131,21 +131,6 @@ def test_cleanup_job_artifacts(self, nixl_bench_tr: TestRun, slurm_system: Slurm
         assert not device_list_dir.exists()
         assert other_file.exists()
 
-    def test_gen_srun_command_excludes_cleanup(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
-        nixl_bench_tr.test.cmd_args = NIXLBenchCmdArgs.model_validate(
-            {
-                "docker_image_url": "docker.io/library/ubuntu:22.04",
-                "path_to_benchmark": "/nixlbench",
-                "backend": "GUSLI",
-                "device_list": "11:F:/store0.bin",
-                "filepath": "/data",
-            }
-        )
-        strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
-        cmd = strategy._gen_srun_command()
-
-        assert "rm -rf " not in cmd
-
     @pytest.mark.parametrize(
         ("override", "expected_error_match", "expected_total_buffer_size"),
         (
diff --git a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
index e09e3f78c..fecf1d371 100644
--- a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
+++ b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py
@@ -152,18 +152,3 @@ def test_kvbench_cleanup_job_artifacts(kvbench_tr: TestRun, slurm_system: SlurmS
     assert not filepath_dir.exists()
     assert not device_list_dir.exists()
     assert other_file.exists()
-
-
-def test_kvbench_gen_srun_command_excludes_cleanup(kvbench_tr: TestRun, slurm_system: SlurmSystem):
-    kvbench_tr.test.cmd_args = NIXLKVBenchCmdArgs.model_validate(
-        {
-            "docker_image_url": "docker://image/url",
-            "backend": "GUSLI",
-            "filepath": "/data",
-            "device_list": "11:F:/store0.bin",
-        }
-    )
-    strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr)
-    cmd = strategy._gen_srun_command()
-
-    assert "rm -rf " not in cmd

From 87514ce6bd5b46e9505ed41d3b50cf9906732663 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Fri, 20 Mar 2026 15:01:42 +0100
Subject: [PATCH 6/9] revert redundant line change

---
 tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
index e984855c0..814b20e7b 100644
--- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py
@@ -247,7 +247,7 @@ def test_gen_nixl_srun_command(
                 assert "--nodelist=$SLURM_JOB_MASTER_NODE" in cmd
 
 
-def test_gen_wait_for_etcd_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
+def test_gen_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
     strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr)
     cmd = strategy.gen_wait_for_etcd_command()
     assert cmd == [

From 6f2338a6389e8df5d6f447e89e97df8623669ccc Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Fri, 20 Mar 2026 18:57:26 +0100
Subject: [PATCH 7/9] update copyright

---
 src/cloudai/_core/command_gen_strategy.py | 2 +-
 src/cloudai/systems/slurm/slurm_runner.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py
index 0268fbca5..56a17f0ee 100644
--- a/src/cloudai/_core/command_gen_strategy.py
+++ b/src/cloudai/_core/command_gen_strategy.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/cloudai/systems/slurm/slurm_runner.py b/src/cloudai/systems/slurm/slurm_runner.py
index fd8f0902e..dae0cdb29 100644
--- a/src/cloudai/systems/slurm/slurm_runner.py
+++ b/src/cloudai/systems/slurm/slurm_runner.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

From 1a88b1cae56445198c9d437c1c230870b529557f Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Fri, 20 Mar 2026 19:10:15 +0100
Subject: [PATCH 8/9] Update src/cloudai/workloads/common/nixl.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/cloudai/workloads/common/nixl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py
index 950f91707..89ee72ba9 100644
--- a/src/cloudai/workloads/common/nixl.py
+++ b/src/cloudai/workloads/common/nixl.py
@@ -236,6 +236,7 @@ def cleanup_job_artifacts(self) -> None:
         for cleanup_target in self._cleanup_targets():
             if cleanup_target.exists():
                 shutil.rmtree(cleanup_target)
+                logging.debug(f"Cleaned up job artifact: {cleanup_target}")
 
     def _cleanup_targets(self) -> list[Path]:
         cleanup_targets: list[Path] = []

From a2d607048dfd2c4d0305892901d76bbf9637b47e Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Fri, 20 Mar 2026 19:26:32 +0100
Subject: [PATCH 9/9] safer nixl artifacts deletion

---
 src/cloudai/workloads/common/nixl.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py
index 89ee72ba9..430a63951 100644
--- a/src/cloudai/workloads/common/nixl.py
+++ b/src/cloudai/workloads/common/nixl.py
@@ -234,7 +234,7 @@ def _unique_file_name(self, file_name: str, used_filenames: set[str]) -> str:
 
     def cleanup_job_artifacts(self) -> None:
         for cleanup_target in self._cleanup_targets():
-            if cleanup_target.exists():
+            if cleanup_target.is_dir():
                 shutil.rmtree(cleanup_target)
                 logging.debug(f"Cleaned up job artifact: {cleanup_target}")
 
@@ -243,11 +243,11 @@ def _cleanup_targets(self) -> list[Path]:
 
         filepath_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("filepath"))
         if filepath_raw:
-            cleanup_targets.append((self.test_run.output_path / "filepath_mount").resolve())
+            cleanup_targets.append(self.test_run.output_path / "filepath_mount")
 
         device_list_raw: str | None = cast(str | None, self.test_run.test.cmd_args_dict.get("device_list"))
         if device_list_raw and get_files_from_device_list(device_list_raw):
-            cleanup_targets.append((self.test_run.output_path / "device_list_mounts").resolve())
+            cleanup_targets.append(self.test_run.output_path / "device_list_mounts")
 
         return cleanup_targets