Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP -- Parallelize GFDL TC detector using OpenMP #763

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ env:
- BUILD_TYPE=Debug
- TECA_DIR=/travis_teca_dir
- TECA_PYTHON_VERSION=3
- TECA_DATA_REVISION=157
- TECA_DATA_REVISION=158
jobs:
- DOCKER_IMAGE=ubuntu IMAGE_VERSION=20.04 IMAGE_NAME=ubuntu_20_04 REQUIRE_NETCDF_MPI=TRUE
- DOCKER_IMAGE=ubuntu IMAGE_VERSION=20.04 IMAGE_NAME=ubuntu_20_04 REQUIRE_NETCDF_MPI=FALSE
Expand Down
18 changes: 16 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@ else ()
set(CMAKE_C_VISIBILITY_PRESET hidden)
endif()


# this prevents a relink when a shared library's implementation changes
set(CMAKE_LINK_DEPENDS_NO_SHARED ON)



# set build/install sub dirs for various components
if (NOT LIB_PREFIX)
set(LIB_PREFIX lib)
Expand Down Expand Up @@ -130,6 +129,7 @@ include(teca_interface_library)
# out if these are not found. for those times when you don't set the corresponding
# REQUIRE variable to FALSE
set(REQUIRE_CUDA FALSE CACHE BOOL "Forces build failure when CUDA is missing")
set(REQUIRE_OPENMP FALSE CACHE BOOL "Forces build failure when OpenMP is missing")
set(REQUIRE_MPI TRUE CACHE BOOL "Forces build failure when MPI is missing")
set(REQUIRE_NETCDF TRUE CACHE BOOL "Forces build failure when NetCDF is missing")
set(REQUIRE_NETCDF_MPI TRUE CACHE BOOL "Forces build failure when NetCDF_MPI is missing")
Expand Down Expand Up @@ -166,6 +166,20 @@ else()
set(HAMR_ENABLE_CUDA OFF CACHE BOOL "")
endif()

# configure for OpenMP
set(tmp OFF)
find_package(OpenMP COMPONENTS Fortran)
if (OpenMP_Fortran_FOUND AND ((DEFINED TECA_HAS_OPENMP AND TECA_HAS_OPENMP)
OR (NOT DEFINED TECA_HAS_OPENMP)))
message(STATUS "OpenMP features (${OpenMP_Fortran_VERSION}) -- enabled")
set(tmp ON)
elseif (REQUIRE_OPENMP)
message(STATUS "OpenMP features -- required but not found.")
else()
message(STATUS "OpenMP features -- not found.")
endif()
set(TECA_HAS_OPENMP ${tmp} CACHE BOOL "OpenMP features")

# configure for MPI
if (ENABLE_CRAY_MPICH)
set(ENV{PKG_CONFIG_PATH} "$ENV{CRAY_MPICH_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
Expand Down
7 changes: 7 additions & 0 deletions alg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ foreach(generic_src ${teca_alg_f90_generics})
set(iso_c_type_coord "${f_type}(c_${c_type_coord})")
configure_file(${generic_src}.f90.in ${generic_src}_${decorator}.f90 @ONLY)
list(APPEND teca_alg_f90_srcs ${generic_src}_${decorator}.f90)
if (TECA_HAS_OPENMP)
set_source_files_properties(${generic_src}_${decorator}.f90 PROPERTIES COMPILE_FLAGS ${OpenMP_Fortran_FLAGS})
endif()
endforeach()
endforeach()
endforeach()
Expand All @@ -109,6 +112,10 @@ if (TECA_HAS_CUDA)
set_source_files_properties(${teca_alg_cxx_srcs} PROPERTIES LANGUAGE CUDA)
endif()

if (TECA_HAS_OPENMP)
list(APPEND teca_alg_link OpenMP::OpenMP_Fortran)
endif()

add_library(teca_alg ${teca_alg_cxx_srcs} ${teca_alg_cuda_srcs} ${teca_alg_f90_srcs})

target_link_libraries(teca_alg teca_data teca_core ${teca_alg_link})
Expand Down
22 changes: 19 additions & 3 deletions alg/gfdl_tc_candidates.f90.in
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ integer(c_int) function gfdl_tc_candidates_@decorator@( &
max_psl_dy, max_psl_dr, max_twc_dy, max_twc_dr, &
max_thick_dy, max_thick_dr, &
Gwind, Gvort, Gtbar, Gpsl, Gthick, Grlat, &
Grlon, Gnlat, Gnlon, frprm_itmax, time_step, &
tc_table) result(ret_val) bind(C)
Grlon, Gnlat, Gnlon, frprm_itmax, n_threads, &
time_step, tc_table) result(ret_val) bind(C)

use spline_@decorator@_module, only : splie2_@decorator@, &
splie3_@decorator@, frprm_@decorator@, shape_@decorator@
Expand All @@ -157,7 +157,7 @@ integer(c_int) function gfdl_tc_candidates_@decorator@( &
dimension(Gnlon, Gnlat) :: Gwind, Gvort, Gtbar, Gpsl, Gthick
@iso_c_type_coord@, intent(in), dimension(Gnlon) :: Grlon
@iso_c_type_coord@, intent(in), dimension(Gnlat) :: Grlat
integer(c_int), intent(in) :: frprm_itmax
integer(c_int), intent(in) :: frprm_itmax, n_threads
type(c_ptr), intent(inout) :: tc_table

@iso_c_type_var@, parameter :: ftol = 0.01
Expand Down Expand Up @@ -262,6 +262,16 @@ integer(c_int) function gfdl_tc_candidates_@decorator@( &
call splie3_@decorator@(rlon, thick, thick_dx)

! loop over grid & look for storms
!$omp parallel do schedule(dynamic) num_threads(n_threads) default(none) &
!$omp& shared(tc_table,can_id,storm_id,nx,nx2,ix,jx,ixp6, &
!$omp& jxp6, max_core_radius,min_vort,vort_win_size,max_psl_dy,max_psl_dr, &
!$omp& max_twc_dy,max_twc_dr,max_thick_dy,max_thick_dr,Gwind,Gvort,Gtbar,Gpsl, &
!$omp& Gthick,Grlat,Grlon,Gnlat,Gnlon,frprm_itmax,time_step,rlon,rlat, &
!$omp& vort,wind,psl,psl_dx,psl_dy,tbar,tbar_dx,tbar_dy,thick,thick_dx,thick_dy) &
!$omp& private(im,ip,jm,jp,ierr_pos,ierr_mag,wind_max,psl_min,twc_max,thick_max,lon_vort, &
!$omp& lon_psl,lon_twc,lon_thick,lat_vort,lat_psl,lat_twc,lat_thick,exist_twc, &
!$omp& exist_thick,p,xx,yy,rr,fret,have_thick,have_twc,can_ij,w_msg) &
!$omp& firstprivate(nxp1,jxp3,ixp3)
do j = nxp1,jxp3
do i = nxp1,ixp3

Expand All @@ -276,6 +286,7 @@ integer(c_int) function gfdl_tc_candidates_@decorator@( &
.or. (vort(i,j) .lt. min_vort)) &
cycle

!$omp atomic
can_id = can_id + 1

lon_vort = rlon(i)
Expand Down Expand Up @@ -337,7 +348,9 @@ integer(c_int) function gfdl_tc_candidates_@decorator@( &
endif

! --- we have strom a candidate
!$omp atomic
storm_id = storm_id + 1
!$omp end atomic

! --- step 3: check for presence of a warm core
exist_twc = .false.
Expand Down Expand Up @@ -422,13 +435,16 @@ integer(c_int) function gfdl_tc_candidates_@decorator@( &

wind_max = maxval(wind(im:ip,jm:jp))

!$omp critical
call teca_tc_append_candidate_@decorator@( &
storm_id, lon_psl, lat_psl, wind_max, vort(i,j), &
psl_min, have_twc, have_thick, twc_max, thick_max, &
tc_table)
!$omp end critical

end do
end do
!$omp end parallel do

deallocate(rlon)
deallocate(rlat)
Expand Down
7 changes: 4 additions & 3 deletions alg/gfdl_tc_candidates.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ int gfdl_tc_candidates_c ## _c_name ## _v ## _v_name ( \
const _v_type *Gvort, const _v_type *Gtbar, \
const _v_type *Gpsl, const _v_type *Gthick, \
const _c_type *Grlat, const _c_type *Grlon, long *Gnlat, \
long *Gnlon, int *frprm_itmax, long *step, void *atable); \
long *Gnlon, int *frprm_itmax, int *n_threads, long *step, \
void *atable); \
\
namespace teca_gfdl { \
int tc_candidates( \
Expand All @@ -25,13 +26,13 @@ int tc_candidates( \
const _v_type *Gwind, const _v_type *Gvort, const _v_type *Gtbar, \
const _v_type *Gpsl, const _v_type *Gthick, const _c_type *Grlat, \
const _c_type *Grlon, long Gnlat, long Gnlon, int frprm_itmax, \
long step, void *atable) \
int n_threads, long step, void *atable) \
{ \
return gfdl_tc_candidates_c ## _c_name ## _v ## _v_name ( \
&core_rad, &min_vort, &vort_win, &max_psl_dy, &max_psl_dr, \
&max_twc_dy, &max_twc_dr, &max_thick_dy, &max_thick_dr, \
Gwind, Gvort, Gtbar, Gpsl, Gthick, Grlat, Grlon, &Gnlat, \
&Gnlon, &frprm_itmax, &step, atable); \
&Gnlon, &frprm_itmax, &n_threads, &step, atable); \
} \
};

Expand Down
8 changes: 6 additions & 2 deletions alg/teca_tc_candidates.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ teca_tc_candidates::teca_tc_candidates() :
search_lat_high(0.0),
search_lon_low(1.0),
search_lon_high(0.0),
minimizer_iterations(50)
minimizer_iterations(50),
omp_num_threads(1)
{
this->set_number_of_input_connections(1);
this->set_number_of_output_ports(1);
Expand Down Expand Up @@ -99,6 +100,8 @@ void teca_tc_candidates::get_properties_description(
"lowest longitude in degrees to search for stroms")
TECA_POPTS_GET(double, prefix, search_lon_high,
"highest longitude in degrees to search for storms")
TECA_POPTS_GET(int, prefix, omp_num_threads,
"the number of OpenMP threads to use in the main detector loop")
;

this->teca_algorithm::get_properties_description(prefix, opts);
Expand Down Expand Up @@ -129,6 +132,7 @@ void teca_tc_candidates::set_properties(
TECA_POPTS_SET(opts, double, prefix, search_lat_low)
TECA_POPTS_SET(opts, double, prefix, search_lon_high)
TECA_POPTS_SET(opts, double, prefix, search_lon_low)
TECA_POPTS_SET(opts, int, prefix, omp_num_threads)
}
#endif

Expand Down Expand Up @@ -434,7 +438,7 @@ const_p_teca_dataset teca_tc_candidates::execute(unsigned int port,
this->max_core_temperature_delta, this->max_core_temperature_radius,
this->max_thickness_delta, this->max_thickness_radius, v, w,
T, P, th, lat, lon, nlat, nlon, this->minimizer_iterations,
time_step, candidates.get()))
this->omp_num_threads, time_step, candidates.get()))
{
TECA_FATAL_ERROR("GFDL TC detector encountered an error")
return nullptr;
Expand Down
64 changes: 45 additions & 19 deletions alg/teca_tc_candidates.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,19 +60,26 @@ class TECA_EXPORT teca_tc_candidates : public teca_algorithm
TECA_GET_ALGORITHM_PROPERTIES_DESCRIPTION()
TECA_SET_ALGORITHM_PROPERTIES()

// set/get the name of input variables
/** @name input_variables
* Set the names of the variables that are required for TC detection.
*/
///@{
TECA_ALGORITHM_PROPERTY(std::string, surface_wind_speed_variable)
TECA_ALGORITHM_PROPERTY(std::string, vorticity_850mb_variable)
TECA_ALGORITHM_PROPERTY(std::string, sea_level_pressure_variable)
TECA_ALGORITHM_PROPERTY(std::string, core_temperature_variable)
TECA_ALGORITHM_PROPERTY(std::string, thickness_variable)

// a candidate is defined as having:
// 1) a local maximum in vorticity above vorticty_850mb_threshold,
// centered on a window of vorticty_850mb_window degrees
// 2) a local minimum in pressure within max_core_radius degrees
// 3) having max pressure delta within max_pressure_radius at
// that location
///@}

/** @name detector_controls
* Set the thresholds controling detector behavior. A TC candidate is
* defined as having:
* 1. a local maximum in vorticity above vorticty_850mb_threshold, centered
* on a window of vorticty_850mb_window degrees
* 2. a local minimum in pressure within max_core_radius degrees
* 3. having max pressure delta within max_pressure_radius at that location
*/
///@{
TECA_ALGORITHM_PROPERTY(double, max_core_radius)
TECA_ALGORITHM_PROPERTY(double, min_vorticity_850mb)
TECA_ALGORITHM_PROPERTY(double, vorticity_850mb_window)
Expand All @@ -86,21 +93,38 @@ class TECA_EXPORT teca_tc_candidates : public teca_algorithm
TECA_ALGORITHM_PROPERTY(double, max_thickness_delta)
TECA_ALGORITHM_PROPERTY(double, max_thickness_radius)

// set/get the bounding box to search for storms
// in units of degrees lat,lon
// set/get the number of iterations to search for the storm local minimum.
// raising this parameter might increase detections but the detector will
// run slower. default is 50.
TECA_ALGORITHM_PROPERTY(int, minimizer_iterations)
///@}

/** @name spatial_subset
* Set/get the bounding box to search for storms in units of degrees lat,lon
*/
///@{
TECA_ALGORITHM_PROPERTY(double, search_lat_low)
TECA_ALGORITHM_PROPERTY(double, search_lat_high)
TECA_ALGORITHM_PROPERTY(double, search_lon_low)
TECA_ALGORITHM_PROPERTY(double, search_lon_high)

// set/get the number of iterations to search for the
// storm local minimum. raising this parameter might increase
// detections but the detector will run slower. default is
// 50.
TECA_ALGORITHM_PROPERTY(int, minimizer_iterations)

// send human readable representation to the
// stream
///@}

/** @name omp_num_threads
* Set the number of OpenMP threads.
*
* @warning This is an experimaental setting and it is recommended to use
* this when running only 1 MPI rank per node.
*
* Using multiple OpenMP threads can speed up single step processing times
* but may interfere with TECA's MPI+threads pipeline execution. We
* haven't yet found a way to make OpenMP play nicely with our own internal
* threading.
*/
///@{
TECA_ALGORITHM_PROPERTY(int, omp_num_threads)
///@}

/// send human readable representation to the stream
virtual void to_stream(std::ostream &os) const override;

protected:
Expand Down Expand Up @@ -152,6 +176,8 @@ class TECA_EXPORT teca_tc_candidates : public teca_algorithm
double search_lon_high;

int minimizer_iterations;

int omp_num_threads;
};

#endif
8 changes: 8 additions & 0 deletions apps/teca_tc_detect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ int main(int argc, char **argv)
("n_threads", value<int>()->default_value(-1), "\nSets the thread pool size on each"
" MPI rank. When the default value of -1 is used TECA will coordinate the thread"
" pools across ranks such each thread is bound to a unique physical core.\n")
("n_omp_threads", value<int>()->default_value(1), "\nSets the number of OpenMP threads\n"
" to use in the main detector loop. WARNING: This is experimental, see\n"
" teca_tc_candidates Doxygen documentation for details\n")

("help", "\ndisplays documentation for application specific command line options\n")
("advanced_help", "\ndisplays documentation for algorithm specific command line options\n")
Expand Down Expand Up @@ -177,6 +180,7 @@ int main(int argc, char **argv)
candidates->set_max_thickness_radius(4.0);
candidates->set_search_lat_low(-80.0);
candidates->set_search_lat_high(80.0);
candidates->set_omp_num_threads(1);
candidates->get_properties_description("candidates", advanced_opt_defs);

p_teca_table_reduce map_reduce = teca_table_reduce::New();
Expand Down Expand Up @@ -335,6 +339,10 @@ int main(int argc, char **argv)
candidates->set_search_lat_high(
opt_vals["highest_lat"].as<double>());

if (!opt_vals["n_omp_threads"].defaulted())
candidates->set_omp_num_threads(
opt_vals["n_omp_threads"].as<int>());

if (!opt_vals["first_step"].defaulted())
map_reduce->set_start_index(opt_vals["first_step"].as<long>());

Expand Down
1 change: 1 addition & 0 deletions core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(teca_core_srcs
teca_bad_cast.cxx
teca_binary_stream.cxx
teca_common.cxx
teca_core_init.cxx
teca_dataset.cxx
teca_dataset_capture.cxx
teca_index_executive.cxx
Expand Down
38 changes: 38 additions & 0 deletions core/teca_core_init.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include <iostream>
#include <string>
#include <cstdlib>
#include <thread>

__attribute__((constructor)) void init(void)
{
#if defined(TECA_DEBUG)
std::cerr << "teca_core initializing ... " << std::endl;
#endif
/* TODO -- problems with MPI
* with multiple MPI ranks per node, MPI OMP_PROC_BIND should be false, otherwise true.
* we have no way here to know if MPI is in use and how many ranks per node.
* these variables need to be initialized here to have any affect.
if (!getenv("OMP_NUM_THREADS"))
{
int n_threads = std::max(1u, std::thread::hardware_concurrency() / 2);
setenv("OMP_NUM_THREADS", std::to_string(n_threads).c_str(), 1);
}

if (!getenv("OMP_PROC_BIND"))
setenv("OMP_PROC_BIND", "true", 1);

if (!getenv("OMP_PLACES"))
setenv("OMP_PLACES", "cores", 1);
*/
#if defined(TECA_DEBUG)
setenv("OMP_DISPLAY_ENV", "true", 1);
setenv("OMP_DISPLAY_AFFINITY", "true", 1);
#endif
}

#if defined(TECA_DEBUG)
__attribute__((destructor)) void fini(void)
{
std::cerr << "teca_core finalizing ... " << std::endl;
}
#endif
Loading