diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90 index e1c1f305..edc1f9cd 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90 +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90 @@ -1,5 +1,5 @@ program gocean2d - use dl_timer + use dl_timer, only: timer_start, timer_stop, timer_init, timer_report, i_def64 use grid_mod use field_mod use initialisation_mod, only: initialisation @@ -30,22 +30,31 @@ program gocean2d type(r2d_field) :: ua_fld, va_fld ! time stepping index - integer :: istp - integer :: itimer0 + integer :: istp + integer :: itimer0 + integer :: warmup_iterations = 1 + ! Scratch space for logging messages + character(len=160) :: log_str + + ! Initialise GOcean infrastructure call gocean_initialise() ! Create the model grid. We use a NE offset (i.e. the U, V and F ! points immediately to the North and East of a T point all have the ! same i,j index). This is the same offset scheme as used by NEMO. model_grid = grid_type(GO_ARAKAWA_C, & - ! BC_PERIODIC, BC_NON_PERIODIC ?? (/GO_BC_EXTERNAL,GO_BC_EXTERNAL,GO_BC_NONE/), & GO_OFFSET_NE) !! read in model parameters and configure the model grid CALL model_init(model_grid) + ! Start timer for initialisation section (this must be after model_init + ! because dl_timer::timer_init() is called inside it) + CALL timer_start(itimer0, label='Initialise', & + num_repeats=INT(1,kind=i_def64) ) + ! Create fields on this grid ! Sea-surface height now (current time step) @@ -78,16 +87,38 @@ program gocean2d call model_write(model_grid, 0, ht_fld, sshn_t_fld, un_fld, vn_fld) + write(log_str, "('Simulation domain = (',I4,':',I4,',',I4,':',I4,')')") & + model_grid%subdomain%global%xstart, & + model_grid%subdomain%global%xstop, & + model_grid%subdomain%global%ystart, & + model_grid%subdomain%global%ystop + call model_write_log("((A))", TRIM(log_str)) + + ! 
Stop the timer for the initialisation section + call timer_stop(itimer0) + + ! Start timer for warm-up section (steps nit000 .. nit000+warmup_iterations-1) + CALL timer_start(itimer0, label='Warm up', & + num_repeats=INT(warmup_iterations,kind=i_def64) ) + + do istp = nit000, nit000 + warmup_iterations - 1, 1 + call step(istp, & + ua_fld, va_fld, un_fld, vn_fld, & + sshn_t_fld, sshn_u_fld, sshn_v_fld, & + ssha_t_fld, ssha_u_fld, ssha_v_fld, & + hu_fld, hv_fld, ht_fld) + enddo + + ! Stop the timer for the warm-up section + call timer_stop(itimer0) ! Start timer for time-stepping section CALL timer_start(itimer0, label='Time-stepping', & - num_repeats=int((nitend-nit000+1),8) ) + num_repeats=INT(nitend-(nit000+warmup_iterations)+1,kind=i_def64)) !! time stepping - do istp = nit000, nitend, 1 - - !call model_write_log("('istp == ',I6)",istp) + do istp = nit000+warmup_iterations, nitend, 1 - call step(model_grid, istp, & + call step(istp, & ua_fld, va_fld, un_fld, vn_fld, & sshn_t_fld, sshn_u_fld, sshn_v_fld, & ssha_t_fld, ssha_u_fld, ssha_v_fld, & @@ -101,15 +132,22 @@ program gocean2d ! Stop the timer for the time-stepping section call timer_stop(itimer0) + ! Start timer for checksum section + CALL timer_start(itimer0, label='Checksum reductions', & + num_repeats=INT(1,kind=i_def64) ) + ! Compute and output some checksums for error checking - call model_write_log("('ua checksum = ',E16.8)", & + call model_write_log("('ua checksum = ', E16.8)", & field_checksum(ua_fld)) - call model_write_log("('va checksum = ',E16.8)", & + call model_write_log("('va checksum = ', E16.8)", & field_checksum(va_fld)) + ! Stop the timer for the checksum section + call timer_stop(itimer0) + !!
finalise the model run call model_finalise() - + call model_write_log("((A))", 'Simulation finished!!') call gocean_finalise() @@ -118,7 +156,7 @@ end program gocean2d !+++++++++++++++++++++++++++++++++++ -subroutine step(grid, istp, & +subroutine step(istp, & ua, va, un, vn, & sshn, sshn_u, sshn_v, ssha, ssha_u, ssha_v, & hu, hv, ht) @@ -128,7 +166,6 @@ subroutine step(grid, istp, & use time_step_mod, only: invoke_time_step use gocean2d_io_mod, only: model_write implicit none - type(grid_type), intent(in) :: grid !> The current time step integer, intent(in) :: istp type(r2d_field), intent(inout) :: un, vn, sshn, sshn_u, sshn_v @@ -139,27 +176,5 @@ subroutine step(grid, istp, & sshn, sshn_u, sshn_v, & hu, hv, ht, ua, va, un, vn) -! call invoke( & -! continuity(istp, ssha, sshn_t, sshn_u, sshn_v, & -! hu, hv, un, vn), & -! momentum_u(ua, un, vn, & -! ssha_u, sshn_t, sshn_u, sshn_v), & -! momentum_v(va, un, vn, hu, hv, ht, & -! ssha_v, sshn_t, sshn_u, sshn_v), & -! bc_ssh(istp, ssha), & -! bc_solid_u(ua), & -! bc_solid_v(va), & -! bc_flather_u(ua, hu, sshn_u), & -! bc_flather_v(va, hv, sshn_v), & -! copy_field(ua, un), & -! copy_field(va, vn), & -! copy_field(ssha, sshn_t), & -! next_sshu(sshn_u, sshn_t), & -! next_sshv(sshn_v, sshn_t) & -! ) - - -! call model_write(grid, istp, ht, sshn, un, vn) - end subroutine step diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90 index 96c9a30f..49428770 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90 +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90 @@ -31,7 +31,6 @@ program gocean2d ! time stepping index integer :: istp - real(go_wp) :: rstp integer :: itimer0 ! Scratch space for logging messages @@ -91,12 +90,24 @@ program gocean2d model_grid%subdomain%global%ystop call model_write_log("((A))", TRIM(log_str)) + ! 
Start timer for warm-up section + CALL timer_start(itimer0, label='Warm up', & + num_repeats=INT(1,kind=i_def64) ) + + call step(nit000, & + ua_fld, va_fld, un_fld, vn_fld, & + sshn_t_fld, sshn_u_fld, sshn_v_fld, & + ssha_t_fld, ssha_u_fld, ssha_v_fld, & + hu_fld, hv_fld, ht_fld) + + ! Stop the timer for the warm-up section + call timer_stop(itimer0) ! Start timer for time-stepping section CALL timer_start(itimer0, label='Time-stepping', & - num_repeats=INT(nitend-nit000+1,kind=i_def64) ) + num_repeats=INT(nitend-nit000,kind=i_def64) ) !! time stepping - do istp = nit000, nitend, 1 + do istp = nit000+1, nitend, 1 !call model_write_log("('istp == ',I6)",istp) diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile index 9be53b00..3c1d083f 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile @@ -35,12 +35,16 @@ KOKKOS_ARCH = Volta70 # Pascal64 CFLAGS := -O3 # Still use the selected compiler but using the Kokkos nvcc_wrapper NVCC_WRAPPER_DEFAULT_COMPILER = $(CXX) -CXX := $(KOKKOS_PATH)/bin/nvcc_wrapper +CXX := $(KOKKOS_PATH)/bin/nvcc_wrapper -allow-unsupported-compiler # The enable lambda option is necessary for the nvcc compiler to recognise # as CUDA kernels the lambda-inlined functions. KOKKOS_CUDA_OPTIONS = "enable_lambda" # If CUDA_LIB is not provided, infer path from the nvcc compiler location.
-CUDA_LIB ?= $(shell echo $(shell which nvcc) | sed 's/bin\/nvcc/lib64/g') +CUDA_ROOT ?= $(shell echo $(shell which nvcc) | sed 's/bin\/nvcc//g') +else ifeq ($(KOKKOS_DEVICES),HIP) +$(info "Using HIP device") +CXX := hipcc +CFLAGS := -O3 else $(error "Unrecognised KOKKOS_DEVICES value: $(KOKKOS_DEVICES)") endif diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md index cfe191a3..47e36d54 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md @@ -52,15 +52,19 @@ to the Kokkos parallel dispatch. This allows Kokkos to control the data layout, the padding, and the synchonization between host and device (GPU execution) but it requires to keep two copies of the simulation data. This version is available in `time_step_views_kokkos.cpp` and can be built -with an OpenMP or a Cuda backend by setting the KOKKOS_DEVICES environment -variable. Note that the Cuda back-end requires that the `nvcc` compiler is -installed on the system and available in PATH. See below examples of how to +with an OpenMP, Cuda or HIP backend by setting the `KOKKOS_DEVICES` environment +variable. Note that the Cuda back-end requires the `nvcc` compiler and +the HIP back-end requires the `hipcc` compiler. These need to be +installed on the system and the necessary paths must be available in `PATH`, +`CPATH` and `LD_LIBRARY_PATH`.
See below examples of how to compile the Kokkos View version for different devices: > make nemolite2d_views_kokkos KOKKOS_DEVICES=OpenMP > make nemolite2d_views_kokkos KOKKOS_DEVICES=Cuda + > make nemolite2d_views_kokkos KOKKOS_DEVICES=HIP + ## Running ## Model parameters (size of domain [jpiglo,jpjglo], number of time-steps diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90 index c8109789..49428770 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90 +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90 @@ -90,12 +90,24 @@ program gocean2d model_grid%subdomain%global%ystop call model_write_log("((A))", TRIM(log_str)) + ! Start timer for warm-up section + CALL timer_start(itimer0, label='Warm up', & + num_repeats=INT(1,kind=i_def64) ) + + call step(nit000, & + ua_fld, va_fld, un_fld, vn_fld, & + sshn_t_fld, sshn_u_fld, sshn_v_fld, & + ssha_t_fld, ssha_u_fld, ssha_v_fld, & + hu_fld, hv_fld, ht_fld) + + ! Stop the timer for the warm-up section + call timer_stop(itimer0) ! Start timer for time-stepping section CALL timer_start(itimer0, label='Time-stepping', & - num_repeats=INT(nitend-nit000+1,kind=i_def64) ) + num_repeats=INT(nitend-nit000,kind=i_def64) ) !!
time stepping - do istp = nit000, nitend, 1 + do istp = nit000+1, nitend, 1 !call model_write_log("('istp == ',I6)",istp) diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp index bbdd4651..147a1ce4 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp @@ -11,7 +11,7 @@ #include "timing.h" #endif -#define TILE {64,4} +#define TILE {64,1} // Create 2D View types for the Fields and Grid arrays typedef Kokkos::View double_2dview; @@ -116,6 +116,8 @@ extern "C" void c_invoke_time_step( // this file. e.g. `g++ -DEXEC_SPACE=OpenMP time_step_kokkos.cpp -c` #if defined (EXECUTION_SPACE) using execution_space = Kokkos::EXECUTION_SPACE; + // Replace execution_space with the line below for the HIP backend + // using execution_space = Kokkos::Experimental::EXECUTION_SPACE; #else using execution_space = Kokkos::DefaultExecutionSpace; #endif @@ -699,8 +701,11 @@ extern "C" void kokkos_read_from_device(double_2dview from, double * to, // Then, we copy the data from the mirror to the original location. // Since the mirror data layout is decided by kokkos, we make explicit // copies of each element to its location. 
- for(int jj=starty; jj < starty+ny; jj++){ - for(int ji=startx; ji < startx+nx; ji++){ + // We need to adjust the provided Fortran bounds to 0-indexing + int starty0 = starty - 1; + int startx0 = startx - 1; + for(int jj=starty0; jj < starty0+ny-1; jj++){ + for(int ji=startx0; ji < startx0+nx-1; ji++){ int idx = (jj * fortran_array_width + ji); to[idx] = mirror(jj, ji); } diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90 index a3e04177..5aa092b7 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90 +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90 @@ -50,7 +50,6 @@ program gocean2d !! read in model parameters and configure the model grid CALL model_init(model_grid) - !call likwid_markerInit() ! Create fields on this grid @@ -91,15 +90,22 @@ program gocean2d model_grid%subdomain%global%ystop call model_write_log("((A))", TRIM(log_str)) + ! Warming up step + CALL timer_start(itimer0, label='Warm up step', & + num_repeats=INT(1,kind=i_def64) ) + call step(nit000, & + ua_fld, va_fld, un_fld, vn_fld, & + sshn_t_fld, sshn_u_fld, sshn_v_fld, & + ssha_t_fld, ssha_u_fld, ssha_v_fld, & + hu_fld, hv_fld, ht_fld) + call timer_stop(itimer0) + ! Start timer for time-stepping section CALL timer_start(itimer0, label='Time-stepping', & - num_repeats=INT(nitend-nit000+1,kind=i_def64) ) + num_repeats=INT(nitend-nit000,kind=i_def64) ) !! 
time stepping - do istp = nit000, nitend, 1 - - !call model_write_log("('istp == ',I6)",istp) - rstp = real(istp, go_wp) + do istp = nit000+1, nitend, 1 call step(istp, & ua_fld, va_fld, un_fld, vn_fld, & diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90 index 604348a2..49428770 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90 +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90 @@ -7,6 +7,7 @@ program gocean2d use gocean2d_io_mod, only: model_write use gocean_mod, only: model_write_log, gocean_initialise, & gocean_finalise + !use likwid !> A Horizontal 2D hydrodynamic ocean model which !! 1) using structured grid @@ -30,7 +31,6 @@ program gocean2d ! time stepping index integer :: istp - real(go_wp) :: rstp integer :: itimer0 ! Scratch space for logging messages @@ -43,11 +43,13 @@ program gocean2d ! points immediately to the North and East of a T point all have the ! same i,j index). This is the same offset scheme as used by NEMO. model_grid = grid_type(GO_ARAKAWA_C, & + ! BC_PERIODIC, BC_NON_PERIODIC ?? (/GO_BC_EXTERNAL,GO_BC_EXTERNAL,GO_BC_NONE/), & GO_OFFSET_NE) !! read in model parameters and configure the model grid CALL model_init(model_grid) + !call likwid_markerInit() ! Create fields on this grid @@ -88,12 +90,26 @@ program gocean2d model_grid%subdomain%global%ystop call model_write_log("((A))", TRIM(log_str)) + ! Start timer for warm-up section + CALL timer_start(itimer0, label='Warm up', & + num_repeats=INT(1,kind=i_def64) ) + + call step(nit000, & + ua_fld, va_fld, un_fld, vn_fld, & + sshn_t_fld, sshn_u_fld, sshn_v_fld, & + ssha_t_fld, ssha_u_fld, ssha_v_fld, & + hu_fld, hv_fld, ht_fld) + + ! Stop the timer for the warm-up section + call timer_stop(itimer0) !
Start timer for time-stepping section CALL timer_start(itimer0, label='Time-stepping', & - num_repeats=INT(nitend-nit000+1,kind=i_def64) ) + num_repeats=INT(nitend-nit000,kind=i_def64) ) !! time stepping - do istp = nit000, nitend, 1 + do istp = nit000+1, nitend, 1 + + !call model_write_log("('istp == ',I6)",istp) call step(istp, & ua_fld, va_fld, un_fld, vn_fld, & @@ -117,6 +133,7 @@ program gocean2d !! finalise the model run call model_finalise() + !call likwid_markerClose() call model_write_log("((A))", 'Simulation finished!!') diff --git a/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90 b/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90 index c37d18cd..b620561b 100644 --- a/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90 +++ b/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90 @@ -31,7 +31,10 @@ program gocean2d ! time stepping index integer :: istp integer :: itimer0 - integer(i_def64) :: nrepeat + integer :: warmup_iterations = 1 + + ! Scratch space for logging messages + character(len=160) :: log_str call gocean_initialise() @@ -46,6 +49,11 @@ program gocean2d !! read in model parameters and configure the model grid CALL model_init(model_grid) + ! Start timer for initialisation section (this must be after model_init + ! because dl_timer::timer_init() is called inside it) + CALL timer_start(itimer0, label='Initialise', & + num_repeats=INT(1,kind=i_def64) ) + ! Create fields on this grid ! Sea-surface height now (current time step) @@ -78,13 +86,37 @@ program gocean2d call model_write(model_grid, 0, ht_fld, sshn_t_fld, un_fld, vn_fld) + write(log_str, "('Simulation domain = (',I4,':',I4,',',I4,':',I4,')')") & + model_grid%subdomain%global%xstart, & + model_grid%subdomain%global%xstop, & + model_grid%subdomain%global%ystart, & + model_grid%subdomain%global%ystop + call model_write_log("((A))", TRIM(log_str)) + + ! Stop the timer for the initialisation section + call timer_stop(itimer0) + + ! 
Start timer for warm-up section (steps nit000 .. nit000+warmup_iterations-1) + CALL timer_start(itimer0, label='Warm up', & + num_repeats=INT(warmup_iterations,kind=i_def64) ) + + do istp = nit000, nit000 + warmup_iterations - 1, 1 + call step(istp, & + ua_fld, va_fld, un_fld, vn_fld, & + sshn_t_fld, sshn_u_fld, sshn_v_fld, & + ssha_t_fld, ssha_u_fld, ssha_v_fld, & + hu_fld, hv_fld, ht_fld) + enddo + + ! Stop the timer for the warm-up section + call timer_stop(itimer0) + ! Start timer for time-stepping section - nrepeat = nitend - nit000 + 1 - call model_write_log("((A))", '=== Start Time-stepping ===') - CALL timer_start(itimer0, label='Time-stepping', num_repeats=nrepeat) + CALL timer_start(itimer0, label='Time-stepping', & + num_repeats=INT(nitend-(nit000+warmup_iterations)+1,kind=i_def64)) !! time stepping - do istp = nit000, nitend, 1 + do istp = nit000+warmup_iterations, nitend, 1 call step(istp, & ua_fld, va_fld, un_fld, vn_fld, & @@ -100,23 +132,22 @@ program gocean2d ! Stop the timer for the time-stepping section call timer_stop(itimer0) - call model_write_log("((A))", '=== Time-stepping finished ===') + ! Start timer for checksum section + CALL timer_start(itimer0, label='Checksum reductions', & + num_repeats=INT(1,kind=i_def64) ) ! Compute and output some checksums for error checking - call model_write_log("('ua checksum = ',E16.8)", field_checksum(ua_fld)) - call model_write_log("('va checksum = ',E16.8)", field_checksum(va_fld)) - ! call model_write_log("('ssh_u checksum = ',E16.8)", & - ! field_checksum(sshn_u_fld)) - ! call model_write_log("('ssh_v checksum = ',E16.8)", & - ! field_checksum(sshn_v_fld)) - ! call model_write_log("('ssh_t checksum = ',E16.8)", & - ! field_checksum(sshn_t_fld)) + call model_write_log("('ua checksum = ', E16.8)", & + field_checksum(ua_fld)) + call model_write_log("('va checksum = ', E16.8)", & + field_checksum(va_fld)) + + ! Stop the timer for the checksum section + call timer_stop(itimer0) !!
finalise the model run call model_finalise() - call model_write_log("((A))", 'Simulation finished!!') - call gocean_finalise() end program gocean2d diff --git a/compiler_setup/llvm.sh b/compiler_setup/llvm.sh index 33578f14..ce940aa3 100644 --- a/compiler_setup/llvm.sh +++ b/compiler_setup/llvm.sh @@ -2,30 +2,30 @@ # ================================================ # This is an experimental file so other flags may be # needed for accelerated compilation -# Alternative flags have been provided in the comments -# where they have been found to be useful # Fortran compiler F90=flang +# If flang is not available or causes compiler errors uncomment gfortran: +# F90=gfortran # C and C++ compiler CC=clang CXX=clang++ # C and C++ flags -# note that -g is used for debugging information -# as this is an experimental implementation -CFLAGS="-O3 -march=native -g" +CFLAGS="-O3" # Fortran compiler flags -# As above, -g provides debugging information -F90FLAGS="-O3 -march=native -g" +F90FLAGS="-O3" # Flags to use when compiling with OpenMP support OMPFLAGS="-fopenmp" # Flags to use when compiling with OpenMP GPU offloading support -OMPTARGETFLAGS="-fopenmp -fopenmp-targets=nvptx64" -# OMPTARGETFLAGS="–fopenmp-targets=nvptx64-nvidia-cuda" +# For AMD ROCm (march is MI50: gfx906, MI100: gfx908): +# OMPTARGETFLAGS="-target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908" +# For NVIDIA: +OMPTARGETFLAGS="-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" # Linker flags -LDFLAGS="-lomp -lomptarget" +LDFLAGS="-fopenmp" + # Location of various CUDA maths libraries LDFLAGS+=" -L${CUDA_MATH_DIR}/lib64"