From 0fea789afc687248cf57477d49eed83cdecd3226 Mon Sep 17 00:00:00 2001
From: Masaki Kozuki
Date: Thu, 20 Jul 2023 11:14:24 -0700
Subject: [PATCH 1/3] make friendly for keyword filter in test

Signed-off-by: Masaki Kozuki
---
 .../optimizers/test_distributed_fused_lamb.py | 46 ++++++++++++-------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
index f38f371b7..7d28d1724 100644
--- a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
+++ b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
@@ -26,7 +26,7 @@ def forward(self, input_tensor, gt):
         return loss
 
 # A test for distributed fused Lamb optimizer: run several iterations and see if loss decreases
-# There are two instances of the same test because based on `world_size` the optimizer decides what collectives operation to use. 
+# There are two instances of the same test because based on `world_size` the optimizer decides what collectives operation to use.
 # If torch.distributed.get_world_size() == torch.cuda.device_count() it uses only `all_gather`.
 # If torch.distributed.get_world_size() < torch.cuda.device_count() it uses both `all_gather` and `reduce_scatter`.
 class NcclDistributedFusedLAMB(NcclDistributedTestBase):
@@ -35,14 +35,24 @@ def world_size(self) -> int:
         return torch.cuda.device_count()
     @common_utils.parametrize("no_copy", [False, True])
-    @common_utils.parametrize("opt_kwargs", [
-        dict(overlap_reductions=True, dwu_num_blocks=2, dwu_num_chunks=2,
-             fused_norm=False, fuse_scale=False, clip_after_ar=True,
-             full_ar=False),
-        dict(overlap_reductions=False, dwu_num_blocks=1, dwu_num_chunks=1,
-             fused_norm=True, fuse_scale=True, clip_after_ar=False),
-    ])
-    def test_distributed_fused_lamb(self, no_copy, opt_kwargs):
+    @common_utils.parametrize(
+        "overlap_reductions,dwu_num_blocks,dwu_num_chunks,fused_norm,fuse_scale,clip_after_ar,full_ar",
+        (
+            (True, 2, 2, False, False, True, False),
+            (False, 1, 1, True, True, False, False),
+        ),
+    )
+    def test_distributed_fused_lamb(
+        self,
+        no_copy,
+        overlap_reductions,
+        dwu_num_blocks,
+        dwu_num_chunks,
+        fused_norm,
+        fuse_scale,
+        clip_after_ar,
+        full_ar,
+    ):
         if no_copy and 'no_copy' not in inspect.getfullargspec(torch.distributed.reduce_scatter).args:
             self.skipTest("does not support no_copy")
         if no_copy and 'no_copy' not in inspect.getfullargspec(torch.distributed.all_gather).args:
             self.skipTest("does not support no_copy")
@@ -66,13 +76,10 @@ def test_distributed_fused_lamb(self, no_copy, opt_kwargs):
             {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
 
-        if 'full_ar' not in opt_kwargs:
-            opt_kwargs['full_ar'] = gpu_count == torch.cuda.device_count()
-
-        # Aidyn-A: not sure what parameters are the best for testing purposes, 
-        # setting up whatever I think appropriate. 
+        # Aidyn-A: not sure what parameters are the best for testing purposes,
+        # setting up whatever I think appropriate.
         optimizer = DistributedFusedLAMB(
-                    optimizer_grouped_parameters, 
+                    optimizer_grouped_parameters,
                     lr=0.1,
                     betas=(0.9, 0.9),
                     eps=1e-6,
@@ -84,7 +91,14 @@ def test_distributed_fused_lamb(self, no_copy, opt_kwargs):
                     use_nvlamb=False,
                     set_param_views_to_flat_buffer=False,
                     e5m2_allgather=False,
-                    **opt_kwargs
+                    no_copy=no_copy,
+                    overlap_reductions=overlap_reductions,
+                    dwu_num_blocks=dwu_num_blocks,
+                    dwu_num_chunks=dwu_num_chunks,
+                    fused_norm=fused_norm,
+                    fuse_scale=fuse_scale,
+                    clip_after_ar=clip_after_ar,
+                    full_ar=full_ar,
         )
 
         optimizer.set_global_scale(init_scale)

From 9be5eae6b91c2295b286e57cc4257c1175efae66 Mon Sep 17 00:00:00 2001
From: Masaki Kozuki
Date: Mon, 31 Jul 2023 14:00:40 -0700
Subject: [PATCH 2/3] cosmetic

Signed-off-by: Masaki Kozuki
---
 .../optimizers/test_distributed_fused_lamb.py | 45 ++++++++++---------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
index 7d28d1724..66c321132 100644
--- a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
+++ b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
@@ -53,9 +53,10 @@ def test_distributed_fused_lamb(
         clip_after_ar,
         full_ar,
     ):
-        if no_copy and 'no_copy' not in inspect.getfullargspec(torch.distributed.reduce_scatter).args:
+        supports_no_copy = 'no_copy' in inspect.getfullargspec(torch.distributed.reduce_scatter).args
+        if no_copy and not supports_no_copy:
             self.skipTest("does not support no_copy")
-        if no_copy and 'no_copy' not in inspect.getfullargspec(torch.distributed.all_gather).args:
+        if no_copy and not supports_no_copy:
             self.skipTest("does not support no_copy")
 
         assert torch.distributed.is_initialized()
@@ -79,26 +80,26 @@ def test_distributed_fused_lamb(
         # Aidyn-A: not sure what parameters are the best for testing purposes,
         # setting up whatever I think appropriate.
         optimizer = DistributedFusedLAMB(
-                    optimizer_grouped_parameters,
-                    lr=0.1,
-                    betas=(0.9, 0.9),
-                    eps=1e-6,
-                    max_grad_norm=1.0,
-                    dwu_group_size=gpu_count,
-                    dwu_num_rs_pg=1,
-                    dwu_num_ar_pg=1,
-                    dwu_num_ag_pg=1,
-                    use_nvlamb=False,
-                    set_param_views_to_flat_buffer=False,
-                    e5m2_allgather=False,
-                    no_copy=no_copy,
-                    overlap_reductions=overlap_reductions,
-                    dwu_num_blocks=dwu_num_blocks,
-                    dwu_num_chunks=dwu_num_chunks,
-                    fused_norm=fused_norm,
-                    fuse_scale=fuse_scale,
-                    clip_after_ar=clip_after_ar,
-                    full_ar=full_ar,
+            optimizer_grouped_parameters,
+            lr=0.1,
+            betas=(0.9, 0.9),
+            eps=1e-6,
+            max_grad_norm=1.0,
+            dwu_group_size=gpu_count,
+            dwu_num_rs_pg=1,
+            dwu_num_ar_pg=1,
+            dwu_num_ag_pg=1,
+            use_nvlamb=False,
+            set_param_views_to_flat_buffer=False,
+            e5m2_allgather=False,
+            overlap_reductions=overlap_reductions,
+            dwu_num_blocks=dwu_num_blocks,
+            dwu_num_chunks=dwu_num_chunks,
+            fused_norm=fused_norm,
+            fuse_scale=fuse_scale,
+            clip_after_ar=clip_after_ar,
+            full_ar=full_ar,
+            **{'no_copy': no_copy} if supports_no_copy else {}
         )
 
         optimizer.set_global_scale(init_scale)

From a56008998a25f18494e2b685dd1c7e58b707862f Mon Sep 17 00:00:00 2001
From: Masaki Kozuki
Date: Mon, 31 Jul 2023 14:04:53 -0700
Subject: [PATCH 3/3] parenthesis

Signed-off-by: Masaki Kozuki
---
 apex/contrib/test/optimizers/test_distributed_fused_lamb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
index 66c321132..e969e5d53 100644
--- a/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
+++ b/apex/contrib/test/optimizers/test_distributed_fused_lamb.py
@@ -99,7 +99,7 @@ def test_distributed_fused_lamb(
             fuse_scale=fuse_scale,
             clip_after_ar=clip_after_ar,
             full_ar=full_ar,
-            **{'no_copy': no_copy} if supports_no_copy else {}
+            **({'no_copy': no_copy} if supports_no_copy else {})
         )
 
         optimizer.set_global_scale(init_scale)
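Note for reviewers: the series leans on two small Python idioms, feature-detecting an optional keyword with inspect.getfullargspec and forwarding it conditionally via dict unpacking. The sketch below is illustrative only; the stand-in collective all_gather_stub and its signature are invented for the example and are not part of apex or torch.

    import inspect

    def all_gather_stub(tensor_list, tensor, group=None, async_op=False, no_copy=False):
        """Stand-in for a collective; only some torch builds accept ``no_copy``."""
        return {"async_op": async_op, "no_copy": no_copy}

    # Feature-detect the optional keyword, mirroring what the test does for
    # torch.distributed.reduce_scatter / torch.distributed.all_gather.
    supports_no_copy = "no_copy" in inspect.getfullargspec(all_gather_stub).args

    no_copy = True
    # Forward the keyword only when it is supported. The parentheses added in
    # PATCH 3/3 make it obvious that the whole conditional expression is the
    # operand of ``**``.
    result = all_gather_stub([], None, **({"no_copy": no_copy} if supports_no_copy else {}))
    print(result)  # {'async_op': False, 'no_copy': True}

If the keyword is unsupported, the unpacked dict is empty and the call degrades to the plain signature, which is why the test can keep a single call site instead of branching on supports_no_copy around the optimizer construction.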