From 768ae8b385b78f6f7b68bba255481d61ef380597 Mon Sep 17 00:00:00 2001 From: wkliao Date: Mon, 9 Feb 2026 16:29:08 -0600 Subject: [PATCH 1/6] spell check. github copilot code quality [nitpick] Variable name `dir_name` uses underscore convention while `path_dup` uses underscore as well, but the naming could be more consistent with the project's naming conventions. Consider using consistent naming patterns throughout the file. --- src/drivers/pncio/pncio_lustre_open.c | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/drivers/pncio/pncio_lustre_open.c b/src/drivers/pncio/pncio_lustre_open.c index 76a0ba38b..62cc83cb4 100644 --- a/src/drivers/pncio/pncio_lustre_open.c +++ b/src/drivers/pncio/pncio_lustre_open.c @@ -68,17 +68,17 @@ static int get_total_avail_osts(const char *path) { - char *dir_name=NULL, *path_dup=NULL; + char *dir_path=NULL, *path_copy=NULL; int err, ost_count=0, is_mdt=0; struct stat sb; - path_dup = NCI_Strdup(path); + path_copy = NCI_Strdup(path); - err = stat(path_dup, &sb); + err = stat(path_copy, &sb); if (errno == ENOENT) { /* file does not exist, try folder */ /* get the parent folder name */ - dir_name = dirname(path_dup); - err = stat(dir_name, &sb); + dir_path = dirname(path_copy); + err = stat(dir_path, &sb); } if (err != 0) { printf("Warning at %s (%d): path \"%s\" stat() failed (%s)\n", @@ -88,20 +88,20 @@ int get_total_avail_osts(const char *path) /* llapi_get_obd_count() only works for directories */ if (S_ISDIR(sb.st_mode)) - dir_name = (dir_name == NULL) ? path_dup : dir_name; + dir_path = (dir_path == NULL) ? path_copy : dir_path; else /* get the parent folder name */ - dir_name = dirname(path_dup); + dir_path = dirname(path_copy); - err = llapi_get_obd_count(dir_name, &ost_count, is_mdt); + err = llapi_get_obd_count(dir_path, &ost_count, is_mdt); if (err != 0) { printf("Warning at %d: path \"%s\" llapi_get_obd_count() failed (%s)\n", - __LINE__,dir_name,strerror(errno)); + __LINE__,dir_path,strerror(errno)); ost_count = 0; } err_out: - if (path_dup != NULL) NCI_Free(path_dup); + if (path_copy != NULL) NCI_Free(path_copy); return ost_count; } @@ -311,7 +311,7 @@ uint64_t get_striping(int fd, if (err != 0) { #ifdef PNETCDF_LUSTRE_DEBUG snprintf(int_str, 32, "%lu", *pattern); - printf("Error at %s (%d) llapi_layout_pattern_get() fails to get patter %s\n", + printf("Error at %s (%d) llapi_layout_pattern_get() fails to get pattern %s\n", __FILE__, __LINE__, PATTERN_STR(*pattern, int_str)); #endif goto err_out; @@ -428,7 +428,7 @@ int set_striping(const char *path, err = llapi_layout_stripe_size_set(layout, stripe_size); if (err != 0) { #ifdef PNETCDF_LUSTRE_DEBUG - printf("Error at %s (%d) llapi_layout_stripe_size_set() fails to set strpe size %lu (%s)\n", + printf("Error at %s (%d) llapi_layout_stripe_size_set() fails to set stripe size %lu (%s)\n", __FILE__, __LINE__, stripe_size, strerror(errno)); #endif goto err_out; @@ -470,7 +470,7 @@ int set_striping(const char *path, #ifdef PNETCDF_LUSTRE_DEBUG char int_str[32]; snprintf(int_str, 32, "%lu", pattern); - printf("Error at %s (%d) llapi_layout_pattern_set() fails ito set pattern %s (%s)\n", + printf("Error at %s (%d) llapi_layout_pattern_set() fails to set pattern %s (%s)\n", __FILE__, __LINE__, PATTERN_STR(pattern, int_str), strerror(errno)); #endif goto err_out; @@ -816,7 +816,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) * 2. root sets and obtains striping info * 3. root broadcasts striping info * 4. non-root processes receive striping info from root - * 5. non-root processes opens the fie + * 5. non-root processes opens the file */ int PNCIO_Lustre_create(PNCIO_File *fd, From c09973dc7aec9d70a242f7aa31ca5dbb1487d585 Mon Sep 17 00:00:00 2001 From: wkliao Date: Wed, 11 Feb 2026 11:30:04 -0600 Subject: [PATCH 2/6] rename internal variable striping to nc_striping to be consistent --- src/drivers/ncmpio/ncmpio_NC.h | 2 +- src/drivers/ncmpio/ncmpio_create.c | 2 +- src/drivers/ncmpio/ncmpio_util.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/drivers/ncmpio/ncmpio_NC.h b/src/drivers/ncmpio/ncmpio_NC.h index dda0469f1..1831eb671 100644 --- a/src/drivers/ncmpio/ncmpio_NC.h +++ b/src/drivers/ncmpio/ncmpio_NC.h @@ -409,7 +409,7 @@ struct NC { #endif int hdr_chunk; /* chunk size for reading header, one chunk at a time */ int data_chunk; /* chunk size for moving variables to higher offsets */ - int striping; /* PNCIO_STRIPING_AUTO or PNCIO_STRIPING_INHERIT */ + int nc_striping; /* PNCIO_STRIPING_AUTO or PNCIO_STRIPING_INHERIT */ MPI_Offset v_align; /* alignment of the beginning of fixed-size variables */ MPI_Offset r_align; /* file alignment for record variable section */ MPI_Offset info_v_align; /* v_align set in MPI Info object */ diff --git a/src/drivers/ncmpio/ncmpio_create.c b/src/drivers/ncmpio/ncmpio_create.c index ea8fe7a2e..e5bfb413a 100644 --- a/src/drivers/ncmpio/ncmpio_create.c +++ b/src/drivers/ncmpio/ncmpio_create.c @@ -408,7 +408,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* If hint nc_striping is set to "auto" and hint striping_factor is not * set by the user, then set hint striping_factor to ncp->num_nodes. */ - if (ncp->striping == PNCIO_STRIPING_AUTO) { + if (ncp->nc_striping == PNCIO_STRIPING_AUTO) { int striping_factor=0; if (user_info != MPI_INFO_NULL) { MPI_Info_get(user_info, "striping_factor", MPI_MAX_INFO_VAL-1, diff --git a/src/drivers/ncmpio/ncmpio_util.c b/src/drivers/ncmpio/ncmpio_util.c index 6961348a4..befca23a9 100644 --- a/src/drivers/ncmpio/ncmpio_util.c +++ b/src/drivers/ncmpio/ncmpio_util.c @@ -267,10 +267,10 @@ void ncmpio_hint_extract(NC *ncp, /* When creating a file, inherit file striping from the parent folder or * let PnetCDF to decide. */ - ncp->striping = PNCIO_STRIPING_AUTO; + ncp->nc_striping = PNCIO_STRIPING_AUTO; MPI_Info_get(info, "nc_striping", MPI_MAX_INFO_VAL-1, value, &flag); if (flag && strcasecmp(value, "inherit") == 0) - ncp->striping = PNCIO_STRIPING_INHERIT; + ncp->nc_striping = PNCIO_STRIPING_INHERIT; } /*----< ncmpio_hint_set() >--------------------------------------------------*/ @@ -384,7 +384,7 @@ void ncmpio_hint_set(NC *ncp, /* When creating a file, inherit file striping from the parent folder or * let PnetCDF to decide. */ - if (ncp->striping == PNCIO_STRIPING_AUTO) + if (ncp->nc_striping == PNCIO_STRIPING_AUTO) MPI_Info_set(info, "nc_striping", "auto"); else MPI_Info_set(info, "nc_striping", "inherit"); From 67b27e2904a27e0962ca2ecdb7686ca75b72655f Mon Sep 17 00:00:00 2001 From: wkliao Date: Wed, 11 Feb 2026 17:43:16 -0600 Subject: [PATCH 3/6] add node IDs of all processes as an attribute cached in MPI communicator In case the same communicator is used to create/open multiple files, we can cache the node IDs of all processes as attributes of the MPI communicator, so they can be reused every time the same communicator is used to create/open a file in the same application run. Using the cached node IDs can avoid repeatedly calling MPI_Get_processor_name(), MPI_Gather(), and MPI_Bcast() to construct the node IDs. Note cb_nodes, the number of I/O aggregators, and rank IDs of I/O aggregators cannot be reused by another file, because cb_nodes is a hint that may be set to a different value each time file is created/opened. Thus cb_nodes and rank IDs of I/O aggregators must be recalculated at each file create/open. Fortunately, Lustre_set_cb_node_list() and GEN_set_cb_node_list() do not make any MPI communication calls. The cost of recalculating cb_nodes and rank IDs is expected to be small. --- src/dispatchers/file.c | 208 +++++++++++++++++++++++++++++++++++-- src/drivers/common/utils.c | 10 +- src/include/dispatch.h | 12 ++- 3 files changed, 217 insertions(+), 13 deletions(-) diff --git a/src/dispatchers/file.c b/src/dispatchers/file.c index b4bc7a950..e54684835 100644 --- a/src/dispatchers/file.c +++ b/src/dispatchers/file.c @@ -39,7 +39,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; #define BUFREAD64(buf,var) memcpy(&var, buf, 8); if (diff_endian) swap_64(&var); #endif -/* Note accessing the following 3 global variables must be protected by a +/* Note accessing the following 5 global variables must be protected by a * mutex, otherwise it will not be thread safe. */ @@ -52,6 +52,12 @@ static int pnc_numfiles; */ static int ncmpi_default_create_format = NC_FORMAT_CLASSIC; +/* attribute to be cached in all communicators */ +static int pncio_node_ids_keyval = MPI_KEYVAL_INVALID; + +/* attribute to be cached in MPI_COMM_SELF */ +static int pncio_init_keyval = MPI_KEYVAL_INVALID; + #define NCMPII_HANDLE_ERROR(func) \ if (mpireturn != MPI_SUCCESS) { \ int errorStringLen; \ @@ -69,6 +75,146 @@ static int ncmpi_default_create_format = NC_FORMAT_CLASSIC; } \ } +/* struct PNCIO_node_ids is defined in dispatch.h */ + +/*----< PNCIO_node_ids_copy() >----------------------------------------------*/ +/* A function to be invoked when a communicator is duplicated, which adds a + * reference to the already allocated memory space storing node ID array. + */ +static +int PNCIO_node_ids_copy(MPI_Comm comm, + int keyval, + void *extra, + void *attr_inP, + void *attr_outP, + int *flag) +{ + PNCIO_node_ids *attr_in = (PNCIO_node_ids*) attr_inP; + PNCIO_node_ids **attr_out = (PNCIO_node_ids**)attr_outP; + + if (attr_in == NULL) + return MPI_ERR_KEYVAL; + else + attr_in->ref_count++; + + *attr_out = attr_in; + + *flag = 1; /* make a copy in the new communicator */ + + return MPI_SUCCESS; +} + +/*----< PNCIO_node_ids_delete() >--------------------------------------------*/ +/* Callback function to be called when a communicator is freed, which frees the + * allocated memory space of node ID array. + */ +static +int PNCIO_node_ids_delete(MPI_Comm comm, + int keyval, + void *attr_val, + void *extra) +{ + PNCIO_node_ids *node_ids = (PNCIO_node_ids*) attr_val; + + if (node_ids == NULL) + return MPI_ERR_KEYVAL; + else + node_ids->ref_count--; + + if (node_ids->ref_count <= 0) { + /* free the allocated array */ + if (node_ids->ids != NULL) + free(node_ids->ids); + free(node_ids); + } + return MPI_SUCCESS; +} + +/*----< PNCIO_end_call() >---------------------------------------------------*/ +/* Callback function to be called at MPI_Finalize(), which frees all cached + * attributes. + */ +static +int PNCIO_end_call(MPI_Comm comm, + int keyval, + void *attribute_val, + void *extra_state) +{ + /* Free all keyvals used by PnetCDF */ + + MPI_Comm_free_keyval(&keyval); /* free pncio_init_keyval */ + + if (pncio_node_ids_keyval != MPI_KEYVAL_INVALID) + MPI_Comm_free_keyval(&pncio_node_ids_keyval); + + return MPI_SUCCESS; +} + +/*----< set_get_comm_attr() >------------------------------------------------*/ +/* Create/set/get attributes into/from the MPI communicators passed in from the + * user application. + */ +static +void set_get_comm_attr(MPI_Comm comm, + PNCIO_node_ids *node_idsP) +{ + PNCIO_node_ids *node_ids; + + if (pncio_init_keyval == MPI_KEYVAL_INVALID) { + /* This is the first call ever to PnetCDF API. Creating key + * pncio_init_keyval is necessary for MPI_Finalize() to free key + * pncio_node_ids_keyval. + */ + MPI_Comm_create_keyval(MPI_NULL_COPY_FN, PNCIO_end_call, + &pncio_init_keyval, (void*)0); + MPI_Comm_set_attr(MPI_COMM_SELF, pncio_init_keyval, (void*)0); + } + + if (pncio_node_ids_keyval == MPI_KEYVAL_INVALID) { + MPI_Comm_create_keyval(PNCIO_node_ids_copy, PNCIO_node_ids_delete, + &pncio_node_ids_keyval, NULL); + /* ignore error, as it is not a critical error */ + } + + if (pncio_node_ids_keyval != MPI_KEYVAL_INVALID) { + int found, nprocs; + + MPI_Comm_get_attr(comm, pncio_node_ids_keyval, &node_ids, &found); + if (!found) { + /* Construct an array storing node IDs of all processes. Note the + * memory allocated for node_ids will be freed by + * PNCIO_node_ids_delete(), a callback function invoked when the + * MPI communicator is freed. + */ + node_ids = (PNCIO_node_ids*) malloc(sizeof(PNCIO_node_ids)); + node_ids->ref_count = 1; + + MPI_Comm_size(comm, &nprocs); + if (nprocs == 1) { + node_ids->num_nodes = 1; + node_ids->ids = (int*) malloc(sizeof(int)); + node_ids->ids[0] = 0; + } + else { + /* Constructing node IDs requires communication calls to + * MPI_Get_processor_name(), MPI_Gather(), and MPI_Bcast(). + */ + ncmpii_construct_node_list(comm, &node_ids->num_nodes, + &node_ids->ids); + } + + /* FYI. The same key pncio_node_ids_keyval can be added to + * different MPI communicators with same or different values. + */ + MPI_Comm_set_attr(comm, pncio_node_ids_keyval, node_ids); + } + /* else case: returned node_ids contains the cached value */ + + /* copy contents */ + *node_idsP = *node_ids; + } +} + /*----< new_id_PNCList() >---------------------------------------------------*/ /* Return a new ID (array index) from the PNC list, pnc_filelist[] that is * not used. Note the used elements in pnc_filelist[] may not be contiguous. @@ -348,6 +494,7 @@ ncmpi_create(MPI_Comm comm, void *ncp; PNC *pncp; PNC_driver *driver; + PNCIO_node_ids node_ids; #ifdef BUILD_DRIVER_FOO int enable_foo_driver=0; #endif @@ -358,6 +505,24 @@ ncmpi_create(MPI_Comm comm, MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); +#ifdef ENABLE_THREAD_SAFE + int perr; + perr = pthread_mutex_lock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_lock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + + /* creating communicator attributes must be protected by a mutex */ + set_get_comm_attr(comm, &node_ids); + +#ifdef ENABLE_THREAD_SAFE + perr = pthread_mutex_unlock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_unlock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + if (rank == 0) set_env_mode(&env_mode); @@ -523,9 +688,12 @@ ncmpi_create(MPI_Comm comm, return err; } - /* Duplicate comm, because users may free it (though unlikely). Note - * MPI_Comm_dup() is collective. We pass pncp->comm to drivers, so there - * is no need for a driver to duplicate it again. + /* Duplicate comm, because users may use it doing other point-to-point + * communication. When this happened, that communication can mess up with + * the PnetCDF/MPI-IO internal communication, particularly when in + * independent data mode. Note MPI_Comm_dup() is collective. We pass + * pncp->comm to drivers, so there is no need for a driver to duplicate it + * again. */ if (comm != MPI_COMM_WORLD && comm != MPI_COMM_SELF) { mpireturn = MPI_Comm_dup(comm, &pncp->comm); @@ -542,7 +710,7 @@ ncmpi_create(MPI_Comm comm, /* calling the driver's create subroutine */ err = driver->create(pncp->comm, pncp->path, cmode, *ncidp, env_mode, - combined_info, &ncp); + combined_info, node_ids, &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_CMODE) { @@ -591,6 +759,7 @@ ncmpi_open(MPI_Comm comm, void *ncp; PNC *pncp; PNC_driver *driver; + PNCIO_node_ids node_ids; #ifdef BUILD_DRIVER_FOO int enable_foo_driver=0; #endif @@ -601,6 +770,24 @@ ncmpi_open(MPI_Comm comm, MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &nprocs); +#ifdef ENABLE_THREAD_SAFE + int perr; + perr = pthread_mutex_lock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_lock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + + /* creating communicator attributes must be protected by a mutex */ + set_get_comm_attr(comm, &node_ids); + +#ifdef ENABLE_THREAD_SAFE + perr = pthread_mutex_unlock(&lock); + if (perr != 0) + printf("Warning in file %s line %d: pthread_mutex_unlock() failed (%s)\n", + __FILE__, __LINE__, strerror(perr)); +#endif + if (rank == 0) set_env_mode(&env_mode); @@ -754,9 +941,12 @@ ncmpi_open(MPI_Comm comm, err = new_id_PNCList(ncidp, pncp); if (err != NC_NOERR) return err; - /* Duplicate comm, because users may free it (though unlikely). Note - * MPI_Comm_dup() is collective. We pass pncp->comm to drivers, so there - * is no need for a driver to duplicate it again. + /* Duplicate comm, because users may use it doing other point-to-point + * communication. When this happened, that communication can mess up with + * the PnetCDF/MPI-IO internal communication, particularly when in + * independent data mode. Note MPI_Comm_dup() is collective. We pass + * pncp->comm to drivers, so there is no need for a driver to duplicate it + * again. */ if (comm != MPI_COMM_WORLD && comm != MPI_COMM_SELF) { mpireturn = MPI_Comm_dup(comm, &pncp->comm); @@ -772,7 +962,7 @@ ncmpi_open(MPI_Comm comm, /* calling the driver's open subroutine */ err = driver->open(pncp->comm, pncp->path, omode, *ncidp, env_mode, - combined_info, &ncp); + combined_info, node_ids, &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_OMODE && diff --git a/src/drivers/common/utils.c b/src/drivers/common/utils.c index ffac366ac..362a81ed1 100644 --- a/src/drivers/common/utils.c +++ b/src/drivers/common/utils.c @@ -184,8 +184,14 @@ ncmpii_construct_node_list(MPI_Comm comm, MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, NULL, NULL, NULL, MPI_CHAR, root, comm); - /* compute node IDs of each MPI process */ - node_ids = (int *) NCI_Malloc(sizeof(int) * (nprocs + 1)); + /* node_ids is an array storing the compute node IDs of all MPI processes + * in the MPI communicator supplied by the application program. Here, we + * use malloc() instead of NCI_Malloc, because node_ids will be freed when + * the communicator is freed. When communicator is MPI_COMM_WORLD or + * MPI_COMM_SELF, it is freed at MPI_Finalize() whose calls to free() + * cannot be tracked by PnetCDF. + */ + node_ids = (int *) malloc(sizeof(int) * (nprocs + 1)); if (rank == root) { /* all_procnames[] can tell us the number of nodes and number of diff --git a/src/include/dispatch.h b/src/include/dispatch.h index 2075ac455..5251b2690 100644 --- a/src/include/dispatch.h +++ b/src/include/dispatch.h @@ -44,10 +44,18 @@ typedef enum { API_VARM } NC_api; +typedef struct { + int ref_count ; /* reference count */ + int num_nodes; /* number of unique compute nodes */ + int *ids; /* [nprocs] node ID of each MPI process */ +} PNCIO_node_ids; + struct PNC_driver { /* APIs manipulate files */ - int (*create)(MPI_Comm, const char*, int, int, int, MPI_Info, void**); - int (*open)(MPI_Comm, const char*, int, int, int, MPI_Info, void**); + int (*create)(MPI_Comm, const char*, int, int, int, MPI_Info, + PNCIO_node_ids, void**); + int (*open)(MPI_Comm, const char*, int, int, int, MPI_Info, + PNCIO_node_ids, void**); int (*close)(void*); int (*enddef)(void*); int (*_enddef)(void*,MPI_Offset,MPI_Offset,MPI_Offset,MPI_Offset); From f6d23e58914c7812d05285b699979dda58681823 Mon Sep 17 00:00:00 2001 From: wkliao Date: Wed, 11 Feb 2026 18:30:58 -0600 Subject: [PATCH 4/6] receive node IDs of all processes from dispatcher's file create/open --- src/drivers/ncmpio/ncmpio_NC.h | 20 +++---- src/drivers/ncmpio/ncmpio_create.c | 57 ++++++++++++-------- src/drivers/ncmpio/ncmpio_driver.h | 6 ++- src/drivers/ncmpio/ncmpio_file_misc.c | 11 ++-- src/drivers/ncmpio/ncmpio_intra_node.c | 37 ++++++------- src/drivers/ncmpio/ncmpio_open.c | 75 ++++++++++++++------------ src/drivers/ncmpio/ncmpio_subfile.c | 12 ++++- src/drivers/ncmpio/ncmpio_util.c | 6 ++- 8 files changed, 129 insertions(+), 95 deletions(-) diff --git a/src/drivers/ncmpio/ncmpio_NC.h b/src/drivers/ncmpio/ncmpio_NC.h index 1831eb671..995a083ff 100644 --- a/src/drivers/ncmpio/ncmpio_NC.h +++ b/src/drivers/ncmpio/ncmpio_NC.h @@ -406,6 +406,7 @@ struct NC { int num_subfiles; /* no. subfiles */ struct NC *ncp_sf; /* ncp of subfile */ MPI_Comm comm_sf; /* subfile MPI communicator */ + PNCIO_node_ids node_ids_sf; /* node IDs of subfile MPI communicator */ #endif int hdr_chunk; /* chunk size for reading header, one chunk at a time */ int data_chunk; /* chunk size for moving variables to higher offsets */ @@ -432,16 +433,15 @@ struct NC { MPI_Offset put_size; /* amount of writes committed so far in bytes */ MPI_Offset get_size; /* amount of reads committed so far in bytes */ - MPI_Comm comm; /* MPI communicator */ - int rank; /* MPI rank of this process */ - int nprocs; /* no. MPI processes */ - int num_nodes; /* no. unique compute nodes allocated */ - int *node_ids; /* [nprocs] node IDs of each rank */ - MPI_Info mpiinfo; /* used MPI info object */ - MPI_File collective_fh; /* MPI-IO file handle for collective mode */ - MPI_File independent_fh; /* MPI-IO file handle for independent mode */ - PNCIO_File *pncio_fh; /* PNCIO file handler */ - int fstype; /* file system type: PNCIO_LUSTRE, PNCIO_UFS */ + MPI_Comm comm; /* MPI communicator */ + int rank; /* MPI rank of this process */ + int nprocs; /* no. MPI processes */ + PNCIO_node_ids node_ids; /* node IDs of each rank */ + MPI_Info mpiinfo; /* used MPI info object */ + MPI_File collective_fh; /* MPI-IO file handle for collective mode */ + MPI_File independent_fh; /* MPI-IO file handle for independent mode */ + PNCIO_File *pncio_fh; /* PNCIO file handler */ + int fstype; /* file system type: PNCIO_LUSTRE, PNCIO_UFS */ NC_dimarray dims; /* dimensions defined */ NC_attrarray attrs; /* global attributes defined */ diff --git a/src/drivers/ncmpio/ncmpio_create.c b/src/drivers/ncmpio/ncmpio_create.c index e5bfb413a..9d318ffa3 100644 --- a/src/drivers/ncmpio/ncmpio_create.c +++ b/src/drivers/ncmpio/ncmpio_create.c @@ -34,13 +34,14 @@ /*----< ncmpio_create() >----------------------------------------------------*/ int -ncmpio_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info user_info, /* user's and env info combined */ - void **ncpp) +ncmpio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info user_info, /* user's and env info combined */ + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; int rank, nprocs, mpiomode, err, mpireturn, default_format, file_exist=1; @@ -340,25 +341,27 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* initialize unlimited_id as no unlimited dimension yet defined */ ncp->dims.unlimited_id = -1; - /* Construct a list of unique IDs of compute nodes allocated to this job - * and save it in ncp->node_ids[nprocs], which contains node IDs of each - * rank. The node IDs are used either when intra-node aggregation (INA) is - * enabled or when using PnetCDF's PNCIO driver. + /* node_ids stores a list of unique IDs of compute nodes of all MPI ranks + * in the MPI communicator passed from the user application. It is a keyval + * attribute cached in the communicator. See src/dispatchers/file.c for + * details. The node IDs will be used when the intra-node aggregation (INA) + * is enabled and when PnetCDF's PNCIO driver is used. * * When intra-node aggregation (INA) is enabled, node IDs are used to * create a new MPI communicator consisting of the intra-node aggregators * only. The communicator will be used to call file open in MPI-IO or * PnetCDF's PNCIO driver. This means only intra-node aggregators will * perform file I/O in PnetCDF collective put and get operations. + * + * node_ids will be used to calculate cb_nodes, the number of MPI-IO/PNCIO + * aggregators (not INA aggregators). */ - ncp->node_ids = NULL; - err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids); - if (err != NC_NOERR) return err; + ncp->node_ids = node_ids; /* When the total number of aggregators >= number of processes, disable * intra-node aggregation. */ - if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs) + if (ncp->num_aggrs_per_node * node_ids.num_nodes >= ncp->nprocs) ncp->num_aggrs_per_node = 0; /* ncp->num_aggrs_per_node = 0, or > 0 is an indicator of whether the INA @@ -370,6 +373,12 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == ncp->ina_rank = -1; ncp->ina_node_list = NULL; if (ncp->num_aggrs_per_node > 0) { + /* Must duplicate node_ids, as node_ids.ids[] will be modified by + * ncmpio_ina_init(). + */ + ncp->node_ids.ids = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + memcpy(ncp->node_ids.ids, node_ids.ids, sizeof(int) * ncp->nprocs); + /* Divide all ranks into groups. Each group is assigned one intra-node * aggregator. The following metadata related to intra-node aggregation * will be set up in ncmpio_ina_init(). @@ -380,7 +389,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == * ncp->ina_comm is an MPI communicator consisting of only intra-node * aggregators across all nodes, which will be used when calling * MPI_File_open(). For non-aggregator, it == MPI_COMM_NULL. - * ncp->node_ids[] will be modified to contain the nodes IDs of all + * ncp->node_ids.ids[] will be modified to contain the nodes IDs of all * intra-node aggregators, and will be passed to pncio_fh. */ err = ncmpio_ina_init(ncp); @@ -417,7 +426,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == striping_factor = atoi(value); } if (striping_factor == 0) { - sprintf(value, "%d", ncp->num_nodes); + sprintf(value, "%d", ncp->node_ids.num_nodes); MPI_Info_set(user_info, "striping_factor", value); } } @@ -473,7 +482,6 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */ ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1, sizeof(PNCIO_File)); ncp->pncio_fh->file_system = ncp->fstype; - ncp->pncio_fh->num_nodes = ncp->num_nodes; ncp->pncio_fh->node_ids = ncp->node_ids; err = PNCIO_File_open(comm, filename, mpiomode, user_info, @@ -567,13 +575,16 @@ if (ncp->rank == 0) { NCI_Free(ncp->ina_node_list); ncp->ina_node_list = NULL; } - /* node_ids is no longer needed */ - if (ncp->node_ids != NULL) { - NCI_Free(ncp->node_ids); - ncp->node_ids = NULL; + if (ncp->num_aggrs_per_node > 0) { + /* node_ids is no longer needed. Note node_ids is duplicated above from + * the MPI communicator's cached keyval attribute when + * ncp->num_aggrs_per_node > 0. + */ + NCI_Free(ncp->node_ids.ids); + ncp->node_ids.ids = NULL; } if (ncp->pncio_fh != NULL) - ncp->pncio_fh->node_ids = NULL; + ncp->pncio_fh->node_ids.ids = NULL; return NC_NOERR; } diff --git a/src/drivers/ncmpio/ncmpio_driver.h b/src/drivers/ncmpio/ncmpio_driver.h index f0f01ec5f..2d9924d89 100644 --- a/src/drivers/ncmpio/ncmpio_driver.h +++ b/src/drivers/ncmpio/ncmpio_driver.h @@ -13,11 +13,13 @@ extern int ncmpio_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncmpio_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncmpio_close(void *ncdp); diff --git a/src/drivers/ncmpio/ncmpio_file_misc.c b/src/drivers/ncmpio/ncmpio_file_misc.c index c1c97871c..76292c088 100644 --- a/src/drivers/ncmpio/ncmpio_file_misc.c +++ b/src/drivers/ncmpio/ncmpio_file_misc.c @@ -172,9 +172,9 @@ ncmpio_begin_indep_data(void *ncdp) ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); ncp->pncio_fh->file_system = ncp->fstype; - ncp->pncio_fh->num_nodes = 1; - ncp->pncio_fh->node_ids = (int*) NCI_Malloc(sizeof(int)); - ncp->pncio_fh->node_ids[0] = 0; + ncp->pncio_fh->node_ids.num_nodes = 1; + ncp->pncio_fh->node_ids.ids = (int*) NCI_Malloc(sizeof(int)); + ncp->pncio_fh->node_ids.ids[0] = 0; int omode = fClr(ncp->mpiomode, MPI_MODE_CREATE); @@ -196,8 +196,9 @@ ncmpio_begin_indep_data(void *ncdp) /* Add PnetCDF hints into ncp->mpiinfo */ ncmpio_hint_set(ncp, ncp->mpiinfo); - NCI_Free(ncp->pncio_fh->node_ids); - ncp->pncio_fh->node_ids = NULL; + NCI_Free(ncp->pncio_fh->node_ids.ids); + ncp->pncio_fh->node_ids.num_nodes = 0; + ncp->pncio_fh->node_ids.ids = NULL; return NC_NOERR; } diff --git a/src/drivers/ncmpio/ncmpio_intra_node.c b/src/drivers/ncmpio/ncmpio_intra_node.c index c69b6fe87..f85c54ca4 100644 --- a/src/drivers/ncmpio/ncmpio_intra_node.c +++ b/src/drivers/ncmpio/ncmpio_intra_node.c @@ -357,11 +357,12 @@ void heap_merge(int nprocs, * * The subroutine performs the following tasks. * 1. Make use of the affinity of each MPI process to its compute node, - * represented by ncp->num_nodes and ncp->node_ids[]. These two member of - * ncp should have been set from a call to ncmpii_construct_node_list() - * earlier during ncmpio_create() and ncmpio_open(). - * + ncp->num_nodes is the number of unique compute nodes. - * + ncp->node_ids[ncp->nprocs] contains node IDs for all processes. + * represented by ncp->node_ids.num_nodes and ncp->node_ids.ids[]. Note + * ncp->node_ids should have already been set from a call to + * ncmpii_construct_node_list() earlier during ncmpi_create() and + * ncmpi_open() at the dispatcher. + * + ncp->node_ids.num_nodes is the number of unique compute nodes. + * + ncp->node_ids.ids[ncp->nprocs] contains node IDs for all processes. * 2. Divide processes into groups, select aggregators, and determine whether * self process is an intra-node aggregator. * + ncp->my_aggr is rank ID of my aggregator. @@ -404,11 +405,11 @@ ncmpio_ina_init(NC *ncp) * entering this subroutine. Thus ncp->num_aggrs_per_node must be > 0. */ - /* ncp->node_ids[] has been established in ncmpii_construct_node_list() + /* ncp->node_ids.ids[] has been established in ncmpii_construct_node_list() * called in ncmpio_create() or ncmpio_open() before entering this * subroutine. my_node_id is this rank's node ID. */ - my_node_id = ncp->node_ids[ncp->rank]; + my_node_id = ncp->node_ids.ids[ncp->rank]; /* nprocs_my_node: the number of processes in my nodes * ranks_my_node[]: rank IDs of all processes in my node. @@ -419,7 +420,7 @@ ncmpio_ina_init(NC *ncp) my_rank_index = -1; nprocs_my_node = 0; for (i=0; inprocs; i++) { - if (ncp->node_ids[i] == my_node_id) { + if (ncp->node_ids.ids[i] == my_node_id) { if (i == ncp->rank) my_rank_index = nprocs_my_node; ranks_my_node[nprocs_my_node] = i; @@ -498,10 +499,10 @@ ncmpio_ina_init(NC *ncp) * so that only aggregators call MPI-IO functions to access the file. * * When using the PnetCDF's internal PNCIO driver, we can pass a list of - * node_ids of the new communicator to the PNCIO file handler, + * node IDs of the new communicator to the PNCIO file handler, * ncp->pncio_fh, so to prevent the driver from the repeated work of - * constructing the list of node IDs, node_ids. If using MPI-IO driver, - * then ROMIO will do this internally again anyway. + * constructing the list of node IDs, node_ids.ids[]. If using MPI-IO + * driver, then ROMIO will do this internally again anyway. */ do_io = (ncp->my_aggr == ncp->rank) ? 1 : 0; @@ -511,14 +512,14 @@ ncmpio_ina_init(NC *ncp) TRACE_COMM(MPI_Allgather)(&do_io, 1, MPI_INT, ncp->ina_node_list, 1, MPI_INT,ncp->comm); - /* Construct ncp->node_ids[] and ncp->ina_node_list[]. Their contents + /* Construct ncp->node_ids.ids[] and ncp->ina_node_list[]. Their contents * depend on the layout of MPI process allocation to the compute nodes. * The common layouts can be two kinds: * + cyclic - MPI ranks are assigned to nodes round-robin-ly, * + block - MPI ranks are assigned to a node and then move on to next. * * Below uses an example of nodes=3, nprocs=10, * num_aggrs_per_node=2. - * ncp->node_ids[] should be + * ncp->node_ids.ids[] should be * block process allocation: 0,0,0,0,1,1,1,2,2,2 * cyclic process allocation: 0,1,2,0,1,2,0,1,2,0 * Accordingly, ncp->ina_node_list[] can be two kinds @@ -526,7 +527,7 @@ ncmpio_ina_init(NC *ncp) * cyclic process allocation: 1,1,1,0,0,0,1,1,1,0 */ - /* ncp->node_ids[]: node IDs of processes in the new MPI communicator. + /* ncp->node_ids.ids[]: node IDs of processes in the new MPI communicator. * ncp->ina_node_list[]: the rank IDs of the new MPI communicator. */ ina_nprocs = 0; @@ -535,11 +536,11 @@ ncmpio_ina_init(NC *ncp) ina_nprocs++; /* count the total number of INA aggregators */ ncp->ina_node_list[j] = i; - /* Modify ncp->node_ids[] to store the node IDs of the processes in - * the new communicator. Note ncp->node_ids[] from now on is used - * by PnetCDF's PNCIO driver only. + /* Modify ncp->node_ids.ids[] to store the node IDs of the + * processes in the new communicator. Note ncp->node_ids.ids[] from + * now on is used by PnetCDF's PNCIO driver only. */ - ncp->node_ids[j] = ncp->node_ids[i]; + ncp->node_ids.ids[j] = ncp->node_ids.ids[i]; j++; } } diff --git a/src/drivers/ncmpio/ncmpio_open.c b/src/drivers/ncmpio/ncmpio_open.c index 5ca8e2997..c5540202a 100644 --- a/src/drivers/ncmpio/ncmpio_open.c +++ b/src/drivers/ncmpio/ncmpio_open.c @@ -32,13 +32,14 @@ /*----< ncmpio_open() >------------------------------------------------------*/ int -ncmpio_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info user_info, /* user's and env info combined */ - void **ncpp) +ncmpio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info user_info, /* user's and env info combined */ + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) { char *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; int i, rank, nprocs, mpiomode, err, status=NC_NOERR, mpireturn, flag; @@ -130,28 +131,28 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == fSet(ncp->flags, env_mode); - /* Construct a list of unique IDs of compute nodes allocated to this job - * and save it in ncp->node_ids[nprocs], which contains node IDs of each - * rank. The node IDs are used either when intra-node aggregation is - * enabled or when using PnetCDF's PNCIO driver. + /* node_ids stores a list of unique IDs of compute nodes of all MPI ranks + * in the MPI communicator passed from the user application. It is a keyval + * attribute cached in the communicator. See src/dispatchers/file.c for + * details. The node IDs will be used when the intra-node aggregation (INA) + * is enabled and when PnetCDF's PNCIO driver is used. * - * When intra-node aggregation is enabled, node IDs are used to create a - * new MPI communicator consisting of the intra-node aggregators only. The - * communicator will be used to call file open in MPI-IO or PnetCDF's PNCIO - * driver. This means only intra-node aggregators will perform file I/O in - * PnetCDF collective put and get operations. + * When intra-node aggregation (INA) is enabled, node IDs are used to + * create a new MPI communicator consisting of the intra-node aggregators + * only. The communicator will be used to call file open in MPI-IO or + * PnetCDF's PNCIO driver. This means only intra-node aggregators will + * perform file I/O in PnetCDF collective put and get operations. + * + * node_ids will be used to calculate cb_nodes, the number of MPI-IO/PNCIO + * aggregators (not INA aggregators). */ - ncp->node_ids = NULL; - if (ncp->fstype != PNCIO_FSTYPE_MPIIO || ncp->num_aggrs_per_node != 0) { - err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids); - if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err); + ncp->node_ids = node_ids; - /* When the total number of aggregators >= number of processes, disable - * intra-node aggregation. - */ - if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs) - ncp->num_aggrs_per_node = 0; - } + /* When the total number of aggregators >= number of processes, disable + * intra-node aggregation. + */ + if (ncp->num_aggrs_per_node * node_ids.num_nodes >= ncp->nprocs) + ncp->num_aggrs_per_node = 0; /* ncp->num_aggrs_per_node = 0, or > 0 indicates whether this feature * is disabled or enabled globally for all processes. @@ -162,6 +163,12 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == ncp->ina_rank = -1; ncp->ina_node_list = NULL; if (ncp->num_aggrs_per_node > 0) { + /* Must duplicate node_ids, as node_ids.ids[] will be modified by + * ncmpio_ina_init(). + */ + ncp->node_ids.ids = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + memcpy(ncp->node_ids.ids, node_ids.ids, sizeof(int) * ncp->nprocs); + /* Divide all ranks into groups. Each group is assigned with one * intra-node aggregator. The following metadata related to intra-node * aggregation will be set up. @@ -172,7 +179,7 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == * ncp->ina_comm will be created consisting of only intra-node * aggregators, which will be used when calling MPI_File_open(). * For non-aggregator, ncp->ina_comm == MPI_COMM_NULL. - * ncp->node_ids[] will be modified to contain the nodes IDs of + * ncp->node_ids.ids[] will be modified to contain the nodes IDs of * intra-node aggregators only, which will be passed to pncio_fh. */ err = ncmpio_ina_init(ncp); @@ -218,7 +225,6 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */ ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File)); ncp->pncio_fh->file_system = ncp->fstype; - ncp->pncio_fh->num_nodes = ncp->num_nodes; ncp->pncio_fh->node_ids = ncp->node_ids; err = PNCIO_File_open(comm, filename, mpiomode, user_info, @@ -296,13 +302,16 @@ if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == NCI_Free(ncp->ina_node_list); ncp->ina_node_list = NULL; } - /* node_ids is no longer needed */ - if (ncp->node_ids != NULL) { - NCI_Free(ncp->node_ids); - ncp->node_ids = NULL; + if (ncp->num_aggrs_per_node > 0) { + /* node_ids is no longer needed. Note node_ids is duplicated above from + * the MPI communicator's cached keyval attribute when + * ncp->num_aggrs_per_node > 0. + */ + NCI_Free(ncp->node_ids.ids); + ncp->node_ids.ids = NULL; } if (ncp->pncio_fh != NULL) - ncp->pncio_fh->node_ids = NULL; + ncp->pncio_fh->node_ids.ids = NULL; /* read header from file into NC object pointed by ncp -------------------*/ err = ncmpio_hdr_get_NC(ncp); diff --git a/src/drivers/ncmpio/ncmpio_subfile.c b/src/drivers/ncmpio/ncmpio_subfile.c index 59a8ed89f..c0d0bd855 100644 --- a/src/drivers/ncmpio/ncmpio_subfile.c +++ b/src/drivers/ncmpio/ncmpio_subfile.c @@ -129,10 +129,12 @@ subfile_create(NC *ncp) MPI_Info_set(info, "romio_lustre_start_iodevice", offset); MPI_Info_set(info, "striping_factor", "1"); */ + ncmpii_construct_node_list(ncp->comm_sf, &ncp->node_ids_sf.num_nodes, + &ncp->node_ids_sf.ids); void *ncp_sf; status = ncmpio_create(ncp->comm_sf, path_sf, ncp->iomode, ncp->ncid, - ncp->flags, info, &ncp_sf); + ncp->flags, info, ncp->node_ids_sf, &ncp_sf); if (status != NC_NOERR && myrank == 0) fprintf(stderr, "%s: error in creating file(%s): %s\n", __func__, path_sf, ncmpi_strerror(status)); @@ -186,9 +188,12 @@ ncmpio_subfile_open(NC *ncp) /* sprintf(path_sf, "%s%d/%s", path, color, file); */ sprintf(path_sf, "%s.subfile_%i.%s", ncp->path, color, "nc"); + ncmpii_construct_node_list(ncp->comm_sf, &ncp->node_ids_sf.num_nodes, + &ncp->node_ids_sf.ids); + void *ncp_sf; status = ncmpio_open(ncp->comm_sf, path_sf, ncp->iomode, ncp->ncid, - ncp->flags, MPI_INFO_NULL, &ncp_sf); + ncp->flags, MPI_INFO_NULL, ncp->node_ids_sf, &ncp_sf); ncp->ncp_sf = (NC*) ncp_sf; return status; @@ -200,6 +205,9 @@ int ncmpio_subfile_close(NC *ncp) int status = NC_NOERR; if (ncp->ncp_sf != NULL) { + if (ncp->node_ids_sf.ids != NULL) + free(ncp->node_ids_sf.ids); + status = ncmpio_close(ncp->ncp_sf); if (status != NC_NOERR) return status; ncp->ncp_sf = NULL; diff --git a/src/drivers/ncmpio/ncmpio_util.c b/src/drivers/ncmpio/ncmpio_util.c index befca23a9..179bd7a4b 100644 --- a/src/drivers/ncmpio/ncmpio_util.c +++ b/src/drivers/ncmpio/ncmpio_util.c @@ -51,8 +51,10 @@ void ncmpio_hint_extract(NC *ncp, ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE; #ifdef ENABLE_SUBFILING - ncp->subfile_mode = 0; - ncp->num_subfiles = 0; + ncp->subfile_mode = 0; + ncp->num_subfiles = 0; + ncp->node_ids_sf.num_nodes = 0; + ncp->node_ids_sf.ids = NULL; #endif ncp->dims.hash_size = PNC_HSIZE_DIM; From 549c699336f162005908744ec5853813af40f6a9 Mon Sep 17 00:00:00 2001 From: wkliao Date: Wed, 11 Feb 2026 18:32:50 -0600 Subject: [PATCH 5/6] Update data structure for storing node IDs and number of nodes --- src/drivers/pncio/pncio.h | 5 +- src/drivers/pncio/pncio_lustre_open.c | 70 ++++++++++++++------------- src/drivers/pncio/pncio_open.c | 18 +++---- 3 files changed, 48 insertions(+), 45 deletions(-) diff --git a/src/drivers/pncio/pncio.h b/src/drivers/pncio/pncio.h index 3ad9156fd..267509d53 100644 --- a/src/drivers/pncio/pncio.h +++ b/src/drivers/pncio/pncio.h @@ -25,6 +25,7 @@ #define FDTYPE int #include +#include #include #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) @@ -155,9 +156,7 @@ typedef struct { int file_system; /* type of file system */ int fd_sys; /* system file descriptor */ - int num_nodes; /* number of unique compute nodes from - * MPI_Get_processor_name() */ - int *node_ids; /* [nprocs] node IDs of each rank */ + PNCIO_node_ids node_ids;/* node IDs of each rank */ int access_mode; /* Access mode (sequential, append, etc.), * possibly modified to deal with * data sieving or deferred open */ diff --git a/src/drivers/pncio/pncio_lustre_open.c b/src/drivers/pncio/pncio_lustre_open.c index 62cc83cb4..b63e2f822 100644 --- a/src/drivers/pncio/pncio_lustre_open.c +++ b/src/drivers/pncio/pncio_lustre_open.c @@ -516,23 +516,23 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) MPI_Comm_rank(fd->comm, &rank); /* number of MPI processes running on each node */ - nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int)); + nprocs_per_node = (int *) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int)); - for (i=0; inode_ids[i]]++; + for (i=0; inode_ids.ids[i]]++; /* construct rank IDs of MPI processes running on each node */ - ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes); + ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->node_ids.num_nodes); ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs); - for (i=1; inum_nodes; i++) + for (i=1; inode_ids.num_nodes; i++) ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1]; - for (i=0; inum_nodes; i++) nprocs_per_node[i] = 0; + for (i=0; inode_ids.num_nodes; i++) nprocs_per_node[i] = 0; /* Populate ranks_per_node[], list of MPI ranks running on each node. * Populate nprocs_per_node[], number of MPI processes on each node. */ for (i=0; inode_ids[i]; + k = fd->node_ids.ids[i]; ranks_per_node[k][nprocs_per_node[k]] = i; nprocs_per_node[k]++; } @@ -540,9 +540,10 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) /* To save a call to MPI_Bcast(), all processes run the same codes below to * calculate num_aggr, the number of aggregators (later becomes cb_nodes). * - * The calculation is based on the number of compute nodes, fd->num_nodes, - * and processes per node, nprocs_per_node. At this moment, all processes - * should have obtained the Lustre file striping settings. + * The calculation is based on the number of compute nodes, + * fd->node_ids.num_nodes, and processes per node, nprocs_per_node. At this + * moment, all processes should have obtained the Lustre file striping + * settings. */ striping_factor = fd->hints->striping_factor; @@ -581,11 +582,14 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) */ if (fd->hints->cb_nodes == 0) { /* User did not set hint "cb_nodes" */ - if (nprocs >= striping_factor * 8 && nprocs/fd->num_nodes >= 8) + if (nprocs >= striping_factor * 8 && + nprocs/fd->node_ids.num_nodes >= 8) num_aggr = striping_factor * 8; - else if (nprocs >= striping_factor * 4 && nprocs/fd->num_nodes >= 4) + else if (nprocs >= striping_factor * 4 && + nprocs/fd->node_ids.num_nodes >= 4) num_aggr = striping_factor * 4; - else if (nprocs >= striping_factor * 2 && nprocs/fd->num_nodes >= 2) + else if (nprocs >= striping_factor * 2 && + nprocs/fd->node_ids.num_nodes >= 2) num_aggr = striping_factor * 2; else num_aggr = striping_factor; @@ -646,10 +650,10 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) * Aggregator 3, running on node 0, access OST 7. */ int max_nprocs_node = 0; - for (i=0; inum_nodes; i++) + for (i=0; inode_ids.num_nodes; i++) max_nprocs_node = MAX(max_nprocs_node, nprocs_per_node[i]); - int max_naggr_node = striping_factor / fd->num_nodes; - if (striping_factor % fd->num_nodes) max_naggr_node++; + int max_naggr_node = striping_factor / fd->node_ids.num_nodes; + if (striping_factor % fd->node_ids.num_nodes) max_naggr_node++; /* max_naggr_node is the max number of processes per node to be picked * as aggregator in each round. */ @@ -684,7 +688,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) } #endif - if (striping_factor <= fd->num_nodes) { + if (striping_factor <= fd->node_ids.num_nodes) { /* When number of OSTs is less than number of compute nodes, first * select number of nodes equal to the number of OSTs by spread the * selection evenly across all compute nodes (i.e. with a stride @@ -700,9 +704,9 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) if (block_assignment) { int n=0; int remain = num_aggr % striping_factor; - int node_stride = fd->num_nodes / striping_factor; + int node_stride = fd->node_ids.num_nodes / striping_factor; /* walk through each node and pick aggregators */ - for (j=0; jnum_nodes; j+=node_stride) { + for (j=0; jnode_ids.num_nodes; j+=node_stride) { /* Selecting node IDs with a stride. j is the node ID */ int nranks_per_node = num_aggr / striping_factor; /* front nodes may have 1 more to pick */ @@ -712,7 +716,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) /* Selecting rank IDs within node j with a stride */ fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride]; if (++n == num_aggr) { - j = fd->num_nodes; /* break loop j */ + j = fd->node_ids.num_nodes; /* break loop j */ break; /* loop k */ } } @@ -720,7 +724,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) } else { int avg = num_aggr / striping_factor; - int stride = fd->num_nodes / striping_factor; + int stride = fd->node_ids.num_nodes / striping_factor; if (num_aggr % striping_factor) avg++; for (i = 0; i < num_aggr; i++) { /* j is the selected node ID. This selection is round-robin @@ -733,23 +737,23 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) } } } - else { /* striping_factor > fd->num_nodes */ + else { /* striping_factor > fd->node_ids.num_nodes */ /* When number of OSTs is more than number of compute nodes, I/O * aggregators are selected from all nodes. Within each node, * aggregators are spread evenly instead of the first few ranks. */ int *naggr_per_node, *idx_per_node, avg; - idx_per_node = (int*) NCI_Calloc(fd->num_nodes, sizeof(int)); - naggr_per_node = (int*) NCI_Malloc(fd->num_nodes * sizeof(int)); - for (i = 0; i < striping_factor % fd->num_nodes; i++) - naggr_per_node[i] = striping_factor / fd->num_nodes + 1; - for (; i < fd->num_nodes; i++) - naggr_per_node[i] = striping_factor / fd->num_nodes; + idx_per_node = (int*) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int)); + naggr_per_node = (int*) NCI_Malloc(fd->node_ids.num_nodes * sizeof(int)); + for (i = 0; i < striping_factor % fd->node_ids.num_nodes; i++) + naggr_per_node[i] = striping_factor / fd->node_ids.num_nodes + 1; + for (; i < fd->node_ids.num_nodes; i++) + naggr_per_node[i] = striping_factor / fd->node_ids.num_nodes; avg = num_aggr / striping_factor; if (avg > 0) - for (i = 0; i < fd->num_nodes; i++) + for (i = 0; i < fd->node_ids.num_nodes; i++) naggr_per_node[i] *= avg; - for (i = 0; i < fd->num_nodes; i++) + for (i = 0; i < fd->node_ids.num_nodes; i++) naggr_per_node[i] = MIN(naggr_per_node[i], nprocs_per_node[i]); /* naggr_per_node[] is the number of aggregators that can be * selected as I/O aggregators @@ -757,14 +761,14 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) if (block_assignment) { int n = 0; - for (j=0; jnum_nodes; j++) { + for (j=0; jnode_ids.num_nodes; j++) { /* j is the node ID */ int rank_stride = nprocs_per_node[j] / naggr_per_node[j]; /* try stride==1 seems no effect, rank_stride = 1; */ for (k=0; khints->ranklist[n] = ranks_per_node[j][k*rank_stride]; if (++n == num_aggr) { - j = fd->num_nodes; /* break loop j */ + j = fd->node_ids.num_nodes; /* break loop j */ break; /* loop k */ } } @@ -773,7 +777,7 @@ int Lustre_set_cb_node_list(PNCIO_File *fd) else { for (i = 0; i < num_aggr; i++) { int stripe_i = i % striping_factor; - j = stripe_i % fd->num_nodes; /* to select from node j */ + j = stripe_i % fd->node_ids.num_nodes; /* select from node j */ k = nprocs_per_node[j] / naggr_per_node[j]; k *= idx_per_node[j]; /* try stride==1 seems no effect, k = idx_per_node[j]; */ @@ -956,7 +960,7 @@ assert(mpi_io_mode & MPI_MODE_CREATE); */ if (str_factor == 0 && (stripe_count == LLAPI_LAYOUT_DEFAULT || stripe_count == LLAPI_LAYOUT_WIDE)) { - stripe_count = MIN(fd->num_nodes, total_num_OSTs); + stripe_count = MIN(fd->node_ids.num_nodes, total_num_OSTs); if (overstriping_ratio > 1) stripe_count *= overstriping_ratio; } else if (str_factor > 0) diff --git a/src/drivers/pncio/pncio_open.c b/src/drivers/pncio/pncio_open.c index a8d1c7d6a..c7ec8539b 100644 --- a/src/drivers/pncio/pncio_open.c +++ b/src/drivers/pncio/pncio_open.c @@ -43,7 +43,7 @@ int GEN_set_cb_node_list(PNCIO_File *fd) /* If hint cb_nodes is not set by user, select one rank per node to be * an I/O aggregator */ - fd->hints->cb_nodes = fd->num_nodes; + fd->hints->cb_nodes = fd->node_ids.num_nodes; else if (fd->hints->cb_nodes > nprocs) /* cb_nodes must be <= nprocs */ fd->hints->cb_nodes = nprocs; @@ -53,23 +53,23 @@ int GEN_set_cb_node_list(PNCIO_File *fd) return NC_ENOMEM; /* number of MPI processes running on each node */ - nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int)); + nprocs_per_node = (int *) NCI_Calloc(fd->node_ids.num_nodes, sizeof(int)); - for (i=0; inode_ids[i]]++; + for (i=0; inode_ids.ids[i]]++; /* construct rank IDs of MPI processes running on each node */ - ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes); + ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->node_ids.num_nodes); ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs); - for (i=1; inum_nodes; i++) + for (i=1; inode_ids.num_nodes; i++) ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1]; - for (i=0; inum_nodes; i++) nprocs_per_node[i] = 0; + for (i=0; inode_ids.num_nodes; i++) nprocs_per_node[i] = 0; /* Populate ranks_per_node[], list of MPI ranks running on each node. * Populate nprocs_per_node[], number of MPI processes on each node. */ for (i=0; inode_ids[i]; + k = fd->node_ids.ids[i]; ranks_per_node[k][nprocs_per_node[k]] = i; nprocs_per_node[k]++; } @@ -81,7 +81,7 @@ int GEN_set_cb_node_list(PNCIO_File *fd) for (i=0; ihints->cb_nodes; i++) { if (j >= nprocs_per_node[k]) { /* if run out of ranks in this node k */ k++; - if (k == fd->num_nodes) { /* round-robin to first node */ + if (k == fd->node_ids.num_nodes) { /* round-robin to first node */ k = 0; j++; } @@ -92,7 +92,7 @@ int GEN_set_cb_node_list(PNCIO_File *fd) fd->is_agg = 1; fd->my_cb_nodes_index = i; } - if (k == fd->num_nodes) { /* round-robin to first node */ + if (k == fd->node_ids.num_nodes) { /* round-robin to first node */ k = 0; j++; } From b81a85e46cd3fe803e7eb39e0d7923323628c125 Mon Sep 17 00:00:00 2001 From: wkliao Date: Wed, 11 Feb 2026 18:33:39 -0600 Subject: [PATCH 6/6] Update file create/open APIs for receive node IDs of all processes Node IDs and the number of compute nodes are generated during file creation or opening at the dispatcher. --- src/drivers/nc4io/nc4io_driver.h | 4 ++-- src/drivers/nc4io/nc4io_file.c | 30 +++++++++++++----------- src/drivers/ncadios/ncadios_driver.h | 6 +++-- src/drivers/ncadios/ncadios_file.c | 30 +++++++++++++----------- src/drivers/ncbbio/ncbbio_driver.h | 6 +++-- src/drivers/ncbbio/ncbbio_file.c | 35 +++++++++++++++------------- 6 files changed, 61 insertions(+), 50 deletions(-) diff --git a/src/drivers/nc4io/nc4io_driver.h b/src/drivers/nc4io/nc4io_driver.h index 2b321210b..bf1c0b6fd 100644 --- a/src/drivers/nc4io/nc4io_driver.h +++ b/src/drivers/nc4io/nc4io_driver.h @@ -31,11 +31,11 @@ struct NC_nc4 { extern int nc4io_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); extern int nc4io_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, void **ncdp); extern int nc4io_close(void *ncdp); diff --git a/src/drivers/nc4io/nc4io_file.c b/src/drivers/nc4io/nc4io_file.c index ddf3a6a40..d89119122 100644 --- a/src/drivers/nc4io/nc4io_file.c +++ b/src/drivers/nc4io/nc4io_file.c @@ -50,13 +50,14 @@ #include int -nc4io_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) /* OUT */ +nc4io_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename; int err, ncidtmp; @@ -109,13 +110,14 @@ nc4io_create(MPI_Comm comm, } int -nc4io_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) +nc4io_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { char *filename; int err, ncidtmp; diff --git a/src/drivers/ncadios/ncadios_driver.h b/src/drivers/ncadios/ncadios_driver.h index 8beddbd07..28432d1e1 100644 --- a/src/drivers/ncadios/ncadios_driver.h +++ b/src/drivers/ncadios/ncadios_driver.h @@ -92,11 +92,13 @@ struct NC_ad { extern int ncadios_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncadios_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncadios_close(void *ncdp); diff --git a/src/drivers/ncadios/ncadios_file.c b/src/drivers/ncadios/ncadios_file.c index 65a25a66a..5ab07ec0c 100644 --- a/src/drivers/ncadios/ncadios_file.c +++ b/src/drivers/ncadios/ncadios_file.c @@ -47,26 +47,28 @@ #include int -ncadios_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) /* OUT */ +ncadios_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { /* Read only driver */ DEBUG_RETURN_ERROR(NC_ENOTSUPPORT); } int -ncadios_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) +ncadios_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err, parse_done=0; NC_ad *ncadp; diff --git a/src/drivers/ncbbio/ncbbio_driver.h b/src/drivers/ncbbio/ncbbio_driver.h index 6c940ca83..6c60b8042 100644 --- a/src/drivers/ncbbio/ncbbio_driver.h +++ b/src/drivers/ncbbio/ncbbio_driver.h @@ -261,11 +261,13 @@ void ncbbio_export_hint (NC_bb *ncbbp, MPI_Info *info); extern int ncbbio_create(MPI_Comm comm, const char *path, int cmode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncbbio_open(MPI_Comm comm, const char *path, int omode, int ncid, - int env_mode, MPI_Info info, void **ncdp); + int env_mode, MPI_Info info, PNCIO_node_ids node_ids, + void **ncdp); extern int ncbbio_close (void *ncdp); diff --git a/src/drivers/ncbbio/ncbbio_file.c b/src/drivers/ncbbio/ncbbio_file.c index f6406dbf3..76f18b1f2 100644 --- a/src/drivers/ncbbio/ncbbio_file.c +++ b/src/drivers/ncbbio/ncbbio_file.c @@ -49,13 +49,14 @@ #include int -ncbbio_create(MPI_Comm comm, - const char *path, - int cmode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) /* OUT */ +ncbbio_create(MPI_Comm comm, + const char *path, + int cmode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err; void *ncp=NULL; @@ -66,7 +67,8 @@ ncbbio_create(MPI_Comm comm, driver = ncmpio_inq_driver(); if (driver == NULL) DEBUG_RETURN_ERROR(NC_ENOTNC) - err = driver->create(comm, path, cmode, ncid, env_mode, info, &ncp); + err = driver->create(comm, path, cmode, ncid, env_mode, info, node_ids, + &ncp); if (err != NC_NOERR) return err; /* Create a NC_bb object and save its driver pointer */ @@ -105,13 +107,14 @@ ncbbio_create(MPI_Comm comm, } int -ncbbio_open(MPI_Comm comm, - const char *path, - int omode, - int ncid, - int env_mode, - MPI_Info info, - void **ncpp) +ncbbio_open(MPI_Comm comm, + const char *path, + int omode, + int ncid, + int env_mode, + MPI_Info info, + PNCIO_node_ids node_ids, /* node IDs of all processes */ + void **ncpp) /* OUT */ { int err; void *ncp=NULL; @@ -121,7 +124,7 @@ ncbbio_open(MPI_Comm comm, driver = ncmpio_inq_driver(); if (driver == NULL) DEBUG_RETURN_ERROR(NC_ENOTNC) - err = driver->open(comm, path, omode, ncid, env_mode, info, &ncp); + err = driver->open(comm, path, omode, ncid, env_mode, info, node_ids, &ncp); if (err != NC_NOERR) return err; /* Create a NC_bb object and save its driver pointer */