From 3878e4318e8f0ba83e5b3383c8ac798e26fee6fa Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 21 Nov 2018 13:03:57 -0800
Subject: [PATCH 01/19] Fix new klocwork error reports on 11/20/2018

---
 src/ngraph_deassign_clusters.cc | 1 +
 src/tf_deadness_analysis.cc     | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ngraph_deassign_clusters.cc b/src/ngraph_deassign_clusters.cc
index 856e0a23..7e78e47e 100644
--- a/src/ngraph_deassign_clusters.cc
+++ b/src/ngraph_deassign_clusters.cc
@@ -75,6 +75,7 @@ static void MaybeLogPlacement(const Graph* graph) {
     }
     final_cluster_map[cluster_idx].insert(node);
   }
+  if (number_of_nodes == 0) return;
   int perc_marked_for_clustering_of_total =
       (int)((nodes_marked_for_clustering * 100.0) / number_of_nodes);
diff --git a/src/tf_deadness_analysis.cc b/src/tf_deadness_analysis.cc
index 07fd50f7..539c04ae 100644
--- a/src/tf_deadness_analysis.cc
+++ b/src/tf_deadness_analysis.cc
@@ -490,7 +490,9 @@ Status DeadnessAnalysisImpl::GetNodePredicate(const Node& node,
   }
   // All outputs have the same predicate
-  pred_string = pred->ToString();
+  if (pred != nullptr) {
+    pred_string = pred->ToString();
+  }
   return Status::OK();
 }
@@ -530,4 +532,4 @@ DeadnessAnalysis::~DeadnessAnalysis() {}
 }  // namespace ngraph_bridge
 }  // namespace tensorflow
-#endif  // NGRAPH_TF_DISABLE_DEADNESS_CHECK
\ No newline at end of file
+#endif  // NGRAPH_TF_DISABLE_DEADNESS_CHECK

From 4decf5798d26019b911740d31ce28bbefb996748 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Tue, 4 Dec 2018 10:24:27 -0800
Subject: [PATCH 02/19] Fix the FusedBatchNorm with the Bessel correction in
 variance

---
 src/ngraph_builder.cc | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc
index 0c5ae65e..5a5a05f0 100644
--- a/src/ngraph_builder.cc
+++ b/src/ngraph_builder.cc
@@ -1724,12 +1724,31 @@ static Status TranslateFusedBatchNormOp(
     ng_y = make_shared<ng::op::GetOutputElement>(ng_batch_norm, 0);
     ng_mean = make_shared<ng::op::GetOutputElement>(ng_batch_norm, 1);
     ng_variance = make_shared<ng::op::GetOutputElement>(ng_batch_norm, 2);
+    // This is for Bessel's correction in ng_variance:
+    size_t ng_input_size = 1.0;
+    size_t ng_scale_size = 1.0;
+    for (size_t i=0; i<ng_input->get_shape().size(); i++) {
+      ng_input_size *= ng_input->get_shape()[i];
+    }
+    for (size_t i=0; i<ng_scale->get_shape().size(); i++) {
+      ng_scale_size *= ng_scale->get_shape()[i];
+    }
+    size_t sample_size = ng_input_size/ng_scale_size;
+    size_t sample_size_minus_one = sample_size > 1.0 ? (sample_size - 1.0) : 1.0;
+    std::vector<float> sample_values(ng::shape_size(ng_variance->get_shape()), sample_size);
+    auto sample = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+        sample_values);
+    std::vector<float> minus_values(ng::shape_size(ng_variance->get_shape()), sample_size_minus_one);
+    auto minus_one = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+        minus_values);
+    auto Bess_scale = sample/minus_one;
+    auto variance = ng_variance * Bess_scale;
 
     BatchToTensorflow(is_nhwc, ng_y);
 
     SaveNgOp(ng_op_map, op->name(), ng_y);
     SaveNgOp(ng_op_map, op->name(), ng_mean);
-    SaveNgOp(ng_op_map, op->name(), ng_variance);
+    SaveNgOp(ng_op_map, op->name(), variance);
 
     // Output reserve_space_1: A 1D Tensor for the computed batch mean, to be
     // reused in the gradient computation.
     SaveNgOp(ng_op_map, op->name(), ng_mean);
@@ -1744,7 +1763,7 @@ static Status TranslateFusedBatchNormOp(
     SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   }
 
-  SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
+  //SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   return Status::OK();
 }

From cfb0a83a2ee505542f7d8067a3be55649bbaade7 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Tue, 4 Dec 2018 10:35:04 -0800
Subject: [PATCH 03/19] Fix the format

---
 src/ngraph_builder.cc | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc
index 5a5a05f0..2d6c47a3 100644
--- a/src/ngraph_builder.cc
+++ b/src/ngraph_builder.cc
@@ -1727,21 +1727,26 @@ static Status TranslateFusedBatchNormOp(
     // This is for Bessel's correction in ng_variance:
     size_t ng_input_size = 1.0;
     size_t ng_scale_size = 1.0;
-    for (size_t i=0; i<ng_input->get_shape().size(); i++) {
-      ng_input_size *= ng_input->get_shape()[i];
+    for (size_t i = 0; i < ng_input->get_shape().size(); i++) {
+      ng_input_size *= ng_input->get_shape()[i];
     }
-    for (size_t i=0; i<ng_scale->get_shape().size(); i++) {
-      ng_scale_size *= ng_scale->get_shape()[i];
+    for (size_t i = 0; i < ng_scale->get_shape().size(); i++) {
+      ng_scale_size *= ng_scale->get_shape()[i];
     }
-    size_t sample_size = ng_input_size/ng_scale_size;
-    size_t sample_size_minus_one = sample_size > 1.0 ? (sample_size - 1.0) : 1.0;
-    std::vector<float> sample_values(ng::shape_size(ng_variance->get_shape()), sample_size);
-    auto sample = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+    size_t sample_size = ng_input_size / ng_scale_size;
+    size_t sample_size_minus_one =
+        sample_size > 1.0 ? (sample_size - 1.0) : 1.0;
+    std::vector<float> sample_values(ng::shape_size(ng_variance->get_shape()),
+                                     sample_size);
+    auto sample = std::make_shared<ng::op::Constant>(
+        ng_variance->get_element_type(), ng_variance->get_shape(),
         sample_values);
-    std::vector<float> minus_values(ng::shape_size(ng_variance->get_shape()), sample_size_minus_one);
-    auto minus_one = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+    std::vector<float> minus_values(ng::shape_size(ng_variance->get_shape()),
+                                    sample_size_minus_one);
+    auto minus_one = std::make_shared<ng::op::Constant>(
+        ng_variance->get_element_type(), ng_variance->get_shape(),
         minus_values);
-    auto Bess_scale = sample/minus_one;
+    auto Bess_scale = sample / minus_one;
     auto variance = ng_variance * Bess_scale;
 
     BatchToTensorflow(is_nhwc, ng_y);
@@ -1763,7 +1768,7 @@ static Status TranslateFusedBatchNormOp(
     SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   }
 
-  //SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
+  // SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   return Status::OK();
 }
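
Note: the correction these two patches implement can be stated compactly. nGraph's
BatchNorm emits the biased (population) variance, while TensorFlow's FusedBatchNorm
reports the Bessel-corrected (sample) variance, so the translated op rescales

    variance_tf = variance_ng * N / (N - 1)

where N = num_elements(input) / num_elements(scale) is the number of samples per
channel. For example (shapes illustrative, not taken from the patch), an NHWC input
of shape [64, 28, 28, 32] with a scale of shape [32] gives N = 64 * 28 * 28 = 50176
and a correction factor of 50176 / 50175 ≈ 1.00002.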
CPU backend" TRUE) option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE) # Validate the options @@ -222,6 +222,12 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}") message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}") message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}") +if(NGRAPH_DISTRIBUTED_ENABLE) + find_package(MPI REQUIRED) + add_definitions(-DNGRAPH_DISTRIBUTED) + include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}) + link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES}) +endif() # Find and build ngraph - if not using pre-built one if (NOT USE_PRE_BUILT_NGRAPH) ExternalProject_Add( From 2163ac10df77b2f9bf142768bb0d35036fe5c2e8 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 6 Dec 2018 12:13:53 -0800 Subject: [PATCH 05/19] Add multi-node .json file output --- src/ngraph_encapsulate_op.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc index 559de1ba..9b533791 100644 --- a/src/ngraph_encapsulate_op.cc +++ b/src/ngraph_encapsulate_op.cc @@ -39,6 +39,10 @@ #include "ngraph/runtime/interpreter/int_backend.hpp" +#ifdef NGRAPH_DISTRIBUTED +#include > +#endif + using namespace std; namespace ng = ngraph; @@ -265,6 +269,12 @@ class NGraphEncapsulateOp : public OpKernel { if (std::getenv("NGRAPH_ENABLE_SERIALIZE") != nullptr) { std::string file_name = "tf_function_" + ctx->op_kernel().name() + ".json"; +#ifdef NGRAPH_DISTRIBUTED + int Rank_ID; + MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID); + file_name = + "tf_function_" + ctx->op_kernel().name() + "_" + to_string(Rank_ID) + ".json"; +#endif NGRAPH_VLOG(0) << "Serializing graph to: " << file_name; std::string js = ngraph::serialize(ng_function, 4); std::ofstream f; From 2e70b9703f21116220e2eb259643f639ce2c6e3c Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 6 Dec 2018 12:21:34 -0800 Subject: [PATCH 06/19] Change CMake file to be consistent with master --- CMakeLists.txt | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6946bce2..86e5b5de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,7 +144,7 @@ option(USE_PRE_BUILT_NGRAPH "Use pre-built nGraph located in NGRAPH_ARTIFACTS_DI option(NGRAPH_ARTIFACTS_DIR "Where would nGraph be installed after build" FALSE) option(UNIT_TEST_ENABLE "Control the building of unit tests" FALSE) option(UNIT_TEST_TF_CC_DIR "Location where TensorFlow CC library is located" ) -option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" TRUE) +option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE) option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE) # Validate the options @@ -222,12 +222,6 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}") message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}") message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}") -if(NGRAPH_DISTRIBUTED_ENABLE) - find_package(MPI REQUIRED) - add_definitions(-DNGRAPH_DISTRIBUTED) - include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}) - link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES}) -endif() # Find and build ngraph - if not using pre-built one if (NOT USE_PRE_BUILT_NGRAPH) ExternalProject_Add( From 1c34c3d249c4bec51e0f3e416690f11916deadb9 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 6 Dec 2018 12:23:19 -0800 Subject: [PATCH 07/19] Format change --- src/ngraph_encapsulate_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
From 2e70b9703f21116220e2eb259643f639ce2c6e3c Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Thu, 6 Dec 2018 12:21:34 -0800
Subject: [PATCH 06/19] Change CMake file to be consistent with master

---
 CMakeLists.txt | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6946bce2..86e5b5de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -144,7 +144,7 @@ option(USE_PRE_BUILT_NGRAPH "Use pre-built nGraph located in NGRAPH_ARTIFACTS_DI
 option(NGRAPH_ARTIFACTS_DIR "Where would nGraph be installed after build" FALSE)
 option(UNIT_TEST_ENABLE "Control the building of unit tests" FALSE)
 option(UNIT_TEST_TF_CC_DIR "Location where TensorFlow CC library is located" )
-option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" TRUE)
+option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE)
 option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE)
 
 # Validate the options
@@ -222,12 +222,6 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}")
 message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}")
 message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}")
 
-if(NGRAPH_DISTRIBUTED_ENABLE)
-  find_package(MPI REQUIRED)
-  add_definitions(-DNGRAPH_DISTRIBUTED)
-  include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
-  link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
-endif()
 # Find and build ngraph - if not using pre-built one
 if (NOT USE_PRE_BUILT_NGRAPH)
   ExternalProject_Add(

From 1c34c3d249c4bec51e0f3e416690f11916deadb9 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Thu, 6 Dec 2018 12:23:19 -0800
Subject: [PATCH 07/19] Format change

---
 src/ngraph_encapsulate_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc
index 9b533791..8b9d6228 100644
--- a/src/ngraph_encapsulate_op.cc
+++ b/src/ngraph_encapsulate_op.cc
@@ -272,8 +272,8 @@ class NGraphEncapsulateOp : public OpKernel {
 #ifdef NGRAPH_DISTRIBUTED
       int Rank_ID;
       MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
-      file_name =
-          "tf_function_" + ctx->op_kernel().name() + "_" + to_string(Rank_ID) + ".json";
+      file_name = "tf_function_" + ctx->op_kernel().name() + "_" +
+                  to_string(Rank_ID) + ".json";
 #endif
       NGRAPH_VLOG(0) << "Serializing graph to: " << file_name;
       std::string js = ngraph::serialize(ng_function, 4);

From 70449bee471ccf10a449af487dda8fdc2ab80f2d Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Mon, 10 Dec 2018 12:26:24 -0800
Subject: [PATCH 08/19] Add a simple distributed mnist model

---
 examples/mnist/mnist_softmax_distributed.py | 124 ++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 examples/mnist/mnist_softmax_distributed.py

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
new file mode 100644
index 00000000..3233efd7
--- /dev/null
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -0,0 +1,124 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A very simple MNIST classifier.
+
+See extensive documentation at
+https://www.tensorflow.org/get_started/mnist/beginners
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+import time
+
+from tensorflow.examples.tutorials.mnist import input_data
+
+import tensorflow as tf
+import ngraph_bridge
+import horovod.tensorflow as hvd
+learn = tf.contrib.learn
+
+FLAGS = None
+
+hvd.init()
+
+def main(_):
+  run_mnist(_)
+
+def run_mnist(_):
+  # Import data
+  mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir+'MNIST-data-%d' % hvd.rank(),one_hot=True)
+  #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+
+  # Create the model
+  with tf.name_scope("mnist_placholder"):
+    x = tf.placeholder(tf.float32, [None, 784])
+    W = tf.Variable(tf.zeros([784, 10]))
+    b = tf.Variable(tf.zeros([10]))
+    y = tf.matmul(x, W) + b
+
+  # Define loss and optimizer
+  y_ = tf.placeholder(tf.float32, [None, 10])
+
+  # The raw formulation of cross-entropy,
+  #
+  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
+  #                                 reduction_indices=[1]))
+  #
+  # can be numerically unstable.
+  #
+  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+  # outputs of 'y', and then average across the batch.
+  cross_entropy = tf.reduce_mean(
+      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  #global_step = tf.train.get_or_create_global_step()
+  global_step = tf.contrib.framework.get_or_create_global_step()
+  opt = tf.train.GradientDescentOptimizer(0.5)
+  # Add MPI Distributed Optimizer
+  #import pdb; pdb.set_trace()
+  with tf.name_scope("horovod_opt"):
+    opt = hvd.DistributedOptimizer(opt)
+  train_step = opt.minimize(cross_entropy, global_step=global_step)
+
+  # The StopAtStepHook handles stopping after running given steps.
+  hooks=[tf.train.StopAtStepHook(last_step=10)]
+
+  # Test trained model
+  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+
+  # Enable soft placement and tracing as needed
+  config = tf.ConfigProto(
+      allow_soft_placement=True,
+      log_device_placement=True,
+      inter_op_parallelism_threads=1)
+
+  #config.graph_options.optimizer_options.global_jit_level = jit_level
+  run_metadata = tf.RunMetadata()
+
+  init_op = tf.global_variables_initializer()
+  print("Variables initialized ...")
+
+  # The MonitoredTrainingSession takes care of session initialization
+  with tf.train.MonitoredTrainingSession(hooks=hooks,
+                                         config=config) as mon_sess:
+    start = time.time()
+    train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
+    while not mon_sess.should_stop():
+      # Train
+      batch_xs, batch_ys = mnist.train.next_batch(100)
+      mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
+
+      # Test trained model
+      if not mon_sess.should_stop():
+        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
+                                                              y_: mnist.test.labels}))
+
+    end = time.time()
+
+    if hvd.rank() == 0:
+      print("Training time: %f seconds" % (end-start))
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
+                      help='Directory for storing input data')
+  parser.add_argument('--log_dir',type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+                      help='Summaries log directory')
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
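
Note: two details of this example are easy to miss. First, hvd.init() runs at
import time, so hvd.rank() is already usable while the graph is being built.
Second, the data directory is suffixed with the rank ('MNIST-data-%d' % hvd.rank())
so each worker downloads and extracts MNIST into a private path instead of racing
on a shared one. The script is launched with one process per rank, e.g.
mpirun -np 2 python mnist_softmax_distributed.py, the command a later patch in this
series records in the file itself.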
From 73cf9d08fa7cd7824a9ca9acbe1a3ae2cdc154e9 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 19 Dec 2018 12:24:20 -0800
Subject: [PATCH 09/19] Add distributed option for Makefile

---
 CMakeLists.txt | 54 ++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c206df2f..6946bce2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,9 +71,9 @@ execute_process(
   OUTPUT_STRIP_TRAILING_WHITESPACE
 )
 
-if(NOT APPLE)
+if(NOT APPLE)
   # FIXME: Doesn't work for Ubuntu
-  execute_process(COMMAND cat /etc/os-release
+  execute_process(COMMAND cat /etc/os-release
     OUTPUT_VARIABLE LSB_RELEASE_ID_SHORT
     OUTPUT_STRIP_TRAILING_WHITESPACE
   )
@@ -106,7 +106,7 @@ endif()
 # Therefore we are setting two entries in the RPATH:
 # 1. $ORIGIN/.
 # 2. $ORIGIN/../tensorflow/
-#
+#
 set(CMAKE_MACOSX_RPATH 1)
 if(APPLE)
   set(CMAKE_INSTALL_RPATH "@loader_path/;@loader_path/../tensorflow;")
@@ -120,18 +120,19 @@ endif()
 find_package(TensorFlow REQUIRED)
 
 add_library(tensorflow_framework_lib SHARED IMPORTED)
-set_target_properties(
-  tensorflow_framework_lib
-  PROPERTIES IMPORTED_LOCATION
+set_target_properties(
+  tensorflow_framework_lib
+  PROPERTIES IMPORTED_LOCATION
   ${TensorFlow_DIR}/libtensorflow_framework.so
 )
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   set(NGRAPH_TF_CXX11_ABI "${TensorFlow_CXX_ABI}")
   message( STATUS "nGraph-TensorFlow using CXX11 ABI: ${NGRAPH_TF_CXX11_ABI}" )
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=${NGRAPH_TF_CXX11_ABI}")
 endif()
 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=${NGRAPH_TF_CXX11_ABI}")
+
 if(APPLE)
   set(LIBNGRAPH_SO "libngraph.dylib")
 else()
@@ -143,7 +144,7 @@ option(USE_PRE_BUILT_NGRAPH "Use pre-built nGraph located in NGRAPH_ARTIFACTS_DI
 option(NGRAPH_ARTIFACTS_DIR "Where would nGraph be installed after build" FALSE)
 option(UNIT_TEST_ENABLE "Control the building of unit tests" FALSE)
 option(UNIT_TEST_TF_CC_DIR "Location where TensorFlow CC library is located" )
-option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE)
+option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" TRUE)
 option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE)
 
 # Validate the options
@@ -157,7 +158,7 @@ endif()
 if(USE_PRE_BUILT_NGRAPH)
   if(NOT NGRAPH_ARTIFACTS_DIR)
     message(
-      FATAL_ERROR
+      FATAL_ERROR
       "USE_PRE_BUILT_NGRAPH is ON but NGRAPH_ARTIFACTS_DIR is missing" )
   endif()
@@ -173,9 +174,9 @@ if(USE_PRE_BUILT_NGRAPH)
     ABSOLUTE
   )
   if (NOT EXISTS ${NGRAPH_ARTIFACTS_DIR})
-    message(FATAL_ERROR
+    message(FATAL_ERROR
       "nGraph artifacts directory doesn't exist: " ${NGRAPH_ARTIFACTS_DIR} )
-  endif()
+  endif()
 else()
   set(NGRAPH_ARTIFACTS_DIR ${CMAKE_CURRENT_BINARY_DIR}/ngraph/ngraph_dist)
 endif()
@@ -193,7 +194,7 @@ if(UNIT_TEST_TF_CC_DIR)
     ABSOLUTE
   )
   if (NOT EXISTS ${TF_PRE_BUILT_LOCATION})
-    message(FATAL_ERROR
+    message(FATAL_ERROR
       "TensorFlow pre-built directory doesn't exist: " ${TF_PRE_BUILT_LOCATION} )
   endif()
 endif()
@@ -201,14 +202,14 @@ endif()
 # Enable build target CPU features
 set(
-  NGRAPH_TARGET_ARCH
-  native
+  NGRAPH_TARGET_ARCH
+  native
   CACHE STRING "Target CPU architecture to build for.")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${NGRAPH_TARGET_ARCH}")
 
 set(
-  NGRAPH_TUNE_ARCH
-  native
+  NGRAPH_TUNE_ARCH
+  native
   CACHE STRING "Target CPU architecture to tune for")
 
 message(STATUS "UNIT_TEST_ENABLE: ${UNIT_TEST_ENABLE}")
@@ -221,21 +222,26 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}")
 message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}")
 message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}")
 
+if(NGRAPH_DISTRIBUTED_ENABLE)
+  find_package(MPI REQUIRED)
+  add_definitions(-DNGRAPH_DISTRIBUTED)
+  include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
+  link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
+endif()
 # Find and build ngraph - if not using pre-built one
 if (NOT USE_PRE_BUILT_NGRAPH)
   ExternalProject_Add(
     ext_ngraph
     GIT_REPOSITORY https://github.com/NervanaSystems/ngraph
-    GIT_TAG v0.11.0
+    GIT_TAG v0.10.1
     CMAKE_ARGS
         -DNGRAPH_DISTRIBUTED_ENABLE=${NGRAPH_DISTRIBUTED_ENABLE}
-        -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR}
+        -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR}
        -DNGRAPH_USE_CXX_ABI=${TensorFlow_CXX_ABI}
        -DNGRAPH_UNIT_TEST_ENABLE=FALSE
        -DNGRAPH_TOOLS_ENABLE=${NGRAPH_TOOLS_ENABLE}
-        -DNGRAPH_DEX_ONLY=TRUE
+        -DNGRAPH_DEX_ONLY=TRUE
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
        -DNGRAPH_GPU_ENABLE=${NGRAPH_GPU_ENABLE}
        -DNGRAPH_DEBUG_ENABLE=${NGRAPH_DEBUG_ENABLE}
        -DNGRAPH_PLAIDML_ENABLE=${NGRAPH_PLAIDML_ENABLE}
@@ -251,7 +257,7 @@
     UPDATE_COMMAND ""
     INSTALL_DIR "${CMAKE_INSTALL_PREFIX}"
   )
-endif()
+endif()
 
 set(NGRAPH_INSTALL_DIR ${NGRAPH_ARTIFACTS_DIR})
@@ -262,9 +268,9 @@ else()
 endif()
 
 add_library(ngraph_lib SHARED IMPORTED)
-set_target_properties(
-  ngraph_lib
-  PROPERTIES IMPORTED_LOCATION
+set_target_properties(
+  ngraph_lib
+  PROPERTIES IMPORTED_LOCATION
   ${NGRAPH_IMPORTED_LOCATION}
 )

From 2c594ee55a064982e5f1261da09b2d2a73ed3789 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 19 Dec 2018 12:26:39 -0800
Subject: [PATCH 10/19] modify distributed example

---
 examples/mnist/mnist_softmax_distributed.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 3233efd7..8440c84f 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -76,7 +76,9 @@ def run_mnist(_):
   train_step = opt.minimize(cross_entropy, global_step=global_step)
 
   # The StopAtStepHook handles stopping after running given steps.
-  hooks=[tf.train.StopAtStepHook(last_step=10)]
+  hooks=[
+     hvd.BroadcastGlobalVariablesHook(0),
+     tf.train.StopAtStepHook(last_step=10)]
 
   # Test trained model
   correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
@@ -91,7 +93,7 @@ def run_mnist(_):
   #config.graph_options.optimizer_options.global_jit_level = jit_level
   run_metadata = tf.RunMetadata()
 
-  init_op = tf.global_variables_initializer()
+  #init_op = tf.global_variables_initializer()
   print("Variables initialized ...")
 
   # The MonitoredTrainingSession takes care of session initialization
@@ -105,9 +107,9 @@ def run_mnist(_):
       mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
       # Test trained model
-      if not mon_sess.should_stop():
-        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
-                                                              y_: mnist.test.labels}))
+#      if not mon_sess.should_stop():
+#        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
+#                                                              y_: mnist.test.labels}))
 
     end = time.time()

From d7882472bbfd4c5e62d521e1fc49be9dbc579d1f Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 19 Dec 2018 16:01:42 -0800
Subject: [PATCH 11/19] Add distributed flags for multi-process graph dumps

---
 src/ngraph_rewrite_pass.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc
index 912cc4de..174e8a2d 100644
--- a/src/ngraph_rewrite_pass.cc
+++ b/src/ngraph_rewrite_pass.cc
@@ -29,6 +29,10 @@
 #include
 
+#ifdef NGRAPH_DISTRIBUTED
+#include <mpi.h>>
+#endif
+
 using namespace std;
 
 namespace tensorflow {
@@ -102,6 +106,11 @@ class NGraphRewritePass : public GraphOptimizationPass {
   static std::string GraphFilenamePrefix(std::string kind, int idx) {
     std::stringstream ss;
     ss << kind << "_" << std::setfill('0') << std::setw(4) << idx;
+#ifdef NGRAPH_DISTRIBUTED
+    int Rank_ID;
+    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
+#endif
     return ss.str();
   }
   static std::string GraphFilenamePrefix(std::string kind, int idx,
@@ -109,6 +118,11 @@ static std::string GraphFilenamePrefix(std::string kind, int idx,
                                          int sub_idx) {
     std::stringstream ss;
     ss << GraphFilenamePrefix(kind, idx) << "_" << std::setfill('0')
       << std::setw(4) << sub_idx;
+#ifdef NGRAPH_DISTRIBUTED
+    int Rank_ID;
+    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
+#endif
     return ss.str();
   }
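
Note: the effect of patch 11 is that every dumped graph name gains a zero-padded
rank component alongside the existing pass index. A prefix that formatted as, say,
mygraph_0003 on a single process becomes mygraph_0003_0000 on rank 0 and
mygraph_0003_0001 on rank 1 (names illustrative). As with the .json serialization
in patch 05, this keeps multi-process runs from clobbering one another's per-pass
graph dumps in a shared working directory.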
-march=${NGRAPH_TARGET_ARCH}") set( - NGRAPH_TUNE_ARCH - native + NGRAPH_TUNE_ARCH + native CACHE STRING "Target CPU architecture to tune for") message(STATUS "UNIT_TEST_ENABLE: ${UNIT_TEST_ENABLE}") @@ -228,20 +227,22 @@ if(NGRAPH_DISTRIBUTED_ENABLE) include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}) link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES}) endif() + # Find and build ngraph - if not using pre-built one if (NOT USE_PRE_BUILT_NGRAPH) ExternalProject_Add( ext_ngraph GIT_REPOSITORY https://github.com/NervanaSystems/ngraph - GIT_TAG v0.10.1 + GIT_TAG v0.11.0 CMAKE_ARGS -DNGRAPH_DISTRIBUTED_ENABLE=${NGRAPH_DISTRIBUTED_ENABLE} - -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR} + -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR} -DNGRAPH_USE_CXX_ABI=${TensorFlow_CXX_ABI} -DNGRAPH_UNIT_TEST_ENABLE=FALSE -DNGRAPH_TOOLS_ENABLE=${NGRAPH_TOOLS_ENABLE} - -DNGRAPH_DEX_ONLY=TRUE + -DNGRAPH_DEX_ONLY=TRUE -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DNGRAPH_GPU_ENABLE=${NGRAPH_GPU_ENABLE} -DNGRAPH_DEBUG_ENABLE=${NGRAPH_DEBUG_ENABLE} -DNGRAPH_PLAIDML_ENABLE=${NGRAPH_PLAIDML_ENABLE} @@ -257,7 +258,7 @@ if (NOT USE_PRE_BUILT_NGRAPH) UPDATE_COMMAND "" INSTALL_DIR "${CMAKE_INSTALL_PREFIX}" ) -endif() +endif() set(NGRAPH_INSTALL_DIR ${NGRAPH_ARTIFACTS_DIR}) @@ -268,9 +269,9 @@ else() endif() add_library(ngraph_lib SHARED IMPORTED) -set_target_properties( - ngraph_lib - PROPERTIES IMPORTED_LOCATION +set_target_properties( + ngraph_lib + PROPERTIES IMPORTED_LOCATION ${NGRAPH_IMPORTED_LOCATION} ) diff --git a/build_ngtf.py b/build_ngtf.py index fbb8db31..0b049328 100755 --- a/build_ngtf.py +++ b/build_ngtf.py @@ -456,7 +456,7 @@ def main(): ngraph_cmake_flags = [ "-DNGRAPH_INSTALL_PREFIX=" + artifacts_location, - "-DNGRAPH_DISTRIBUTED_ENABLE=FALSE", + "-DNGRAPH_DISTRIBUTED_ENABLE=TRUE", "-DNGRAPH_USE_CXX_ABI=" + cxx_abi, "-DNGRAPH_UNIT_TEST_ENABLE=NO", "-DNGRAPH_DEX_ONLY=TRUE", @@ -484,6 +484,7 @@ def main(): "-DNGRAPH_TUNE_ARCH=native", "-DNGRAPH_ARTIFACTS_DIR=" + artifacts_location, "-DUNIT_TEST_ENABLE=ON", + "-DNGRAPH_DISTRIBUTED_ENABLE=TRUE", "-DTF_SRC_DIR=" + tf_src_dir, "-DUNIT_TEST_TF_CC_DIR=" + os.path.join( artifacts_location, "tensorflow") ] From f205cb23969c0ef9540539fe23934f1c4fbc15e8 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 20 Dec 2018 16:08:23 -0800 Subject: [PATCH 13/19] Format fix --- src/ngraph_rewrite_pass.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc index 174e8a2d..f54413e5 100644 --- a/src/ngraph_rewrite_pass.cc +++ b/src/ngraph_rewrite_pass.cc @@ -29,7 +29,7 @@ #include -#ifdef NGRAPH_DISTRIBUTED +#ifdef NGRAPH_DISTRIBUTED #include > #endif @@ -121,7 +121,7 @@ class NGraphRewritePass : public GraphOptimizationPass { #ifdef NGRAPH_DISTRIBUTED int Rank_ID; MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID); - ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID; + ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID; #endif return ss.str(); } From 0ecd43be207ebe155459a74dce08936c880df741 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 20 Dec 2018 16:49:23 -0800 Subject: [PATCH 14/19] Fix the typo and delete debug comment and add run command --- examples/mnist/mnist_softmax_distributed.py | 9 +++++---- src/ngraph_encapsulate_op.cc | 2 +- src/ngraph_rewrite_pass.cc | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py 
index 8440c84f..3c0bab50 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -70,7 +70,6 @@ def run_mnist(_):
   global_step = tf.contrib.framework.get_or_create_global_step()
   opt = tf.train.GradientDescentOptimizer(0.5)
   # Add MPI Distributed Optimizer
-  #import pdb; pdb.set_trace()
   with tf.name_scope("horovod_opt"):
     opt = hvd.DistributedOptimizer(opt)
   train_step = opt.minimize(cross_entropy, global_step=global_step)
@@ -107,9 +106,9 @@ def run_mnist(_):
       mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
       # Test trained model
-#      if not mon_sess.should_stop():
-#        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
-#                                                              y_: mnist.test.labels}))
+      if not mon_sess.should_stop():
+        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
+                                                              y_: mnist.test.labels}))
 
     end = time.time()
@@ -124,3 +123,5 @@ def run_mnist(_):
                       help='Summaries log directory')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+# run comand for this distributed script
+# mpirun -np 2 python mnist_softmax_distributed.py
diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc
index 57ef219a..345a6522 100644
--- a/src/ngraph_encapsulate_op.cc
+++ b/src/ngraph_encapsulate_op.cc
@@ -40,7 +40,7 @@
 #include "ngraph/runtime/interpreter/int_backend.hpp"
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>>
+#include <mpi.h>
 #endif
 
 using namespace std;
diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc
index f54413e5..32c473f1 100644
--- a/src/ngraph_rewrite_pass.cc
+++ b/src/ngraph_rewrite_pass.cc
@@ -30,7 +30,7 @@
 #include
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>>
+#include <mpi.h>
 #endif
 
 using namespace std;

From 100c1294746ed141c0b1dcb678242e22a9a7d89d Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Thu, 20 Dec 2018 17:07:25 -0800
Subject: [PATCH 15/19] Fix the python file format

---
 examples/mnist/mnist_softmax_distributed.py | 182 +++++++++++---------
 1 file changed, 99 insertions(+), 83 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 3c0bab50..3d50eb98 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A very simple MNIST classifier.
 
 See extensive documentation at
@@ -37,91 +36,108 @@
 
 hvd.init()
 
+
 def main(_):
-  run_mnist(_)
+    run_mnist(_)
+
 
 def run_mnist(_):
-  # Import data
-  mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir+'MNIST-data-%d' % hvd.rank(),one_hot=True)
-  #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
-
-  # Create the model
-  with tf.name_scope("mnist_placholder"):
-    x = tf.placeholder(tf.float32, [None, 784])
-    W = tf.Variable(tf.zeros([784, 10]))
-    b = tf.Variable(tf.zeros([10]))
-    y = tf.matmul(x, W) + b
-
-  # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
-
-  # The raw formulation of cross-entropy,
-  #
-  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
-  #                                 reduction_indices=[1]))
-  #
-  # can be numerically unstable.
-  #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
-  # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
-  #global_step = tf.train.get_or_create_global_step()
-  global_step = tf.contrib.framework.get_or_create_global_step()
-  opt = tf.train.GradientDescentOptimizer(0.5)
-  # Add MPI Distributed Optimizer
-  with tf.name_scope("horovod_opt"):
-    opt = hvd.DistributedOptimizer(opt)
-  train_step = opt.minimize(cross_entropy, global_step=global_step)
-
-  # The StopAtStepHook handles stopping after running given steps.
-  hooks=[
-     hvd.BroadcastGlobalVariablesHook(0),
-     tf.train.StopAtStepHook(last_step=10)]
-
-  # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
-  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-
-  # Enable soft placement and tracing as needed
-  config = tf.ConfigProto(
-      allow_soft_placement=True,
-      log_device_placement=True,
-      inter_op_parallelism_threads=1)
-
-  #config.graph_options.optimizer_options.global_jit_level = jit_level
-  run_metadata = tf.RunMetadata()
-
-  #init_op = tf.global_variables_initializer()
-  print("Variables initialized ...")
-
-  # The MonitoredTrainingSession takes care of session initialization
-  with tf.train.MonitoredTrainingSession(hooks=hooks,
-                                         config=config) as mon_sess:
-    start = time.time()
-    train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
-    while not mon_sess.should_stop():
-      # Train
-      batch_xs, batch_ys = mnist.train.next_batch(100)
-      mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
-
-      # Test trained model
-#      if not mon_sess.should_stop():
-#        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
-#                                                              y_: mnist.test.labels}))
-
-    end = time.time()
-
-    if hvd.rank() == 0:
-      print("Training time: %f seconds" % (end-start))
+    # Import data
+    mnist = learn.datasets.mnist.read_data_sets(
+        FLAGS.data_dir + 'MNIST-data-%d' % hvd.rank(), one_hot=True)
+    #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+
+    # Create the model
+    with tf.name_scope("mnist_placholder"):
+        x = tf.placeholder(tf.float32, [None, 784])
+        W = tf.Variable(tf.zeros([784, 10]))
+        b = tf.Variable(tf.zeros([10]))
+        y = tf.matmul(x, W) + b
+
+    # Define loss and optimizer
+    y_ = tf.placeholder(tf.float32, [None, 10])
+
+    # The raw formulation of cross-entropy,
+    #
+    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
+    #                                 reduction_indices=[1]))
+    #
+    # can be numerically unstable.
+    #
+    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+    # outputs of 'y', and then average across the batch.
+    cross_entropy = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+    #global_step = tf.train.get_or_create_global_step()
+    global_step = tf.contrib.framework.get_or_create_global_step()
+    opt = tf.train.GradientDescentOptimizer(0.5)
+    # Add MPI Distributed Optimizer
+    with tf.name_scope("horovod_opt"):
+        opt = hvd.DistributedOptimizer(opt)
+    train_step = opt.minimize(cross_entropy, global_step=global_step)
+
+    # The StopAtStepHook handles stopping after running given steps.
+    hooks = [
+        hvd.BroadcastGlobalVariablesHook(0),
+        tf.train.StopAtStepHook(last_step=10)
+    ]
+
+    # Test trained model
+    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+
+    # Enable soft placement and tracing as needed
+    config = tf.ConfigProto(
+        allow_soft_placement=True,
+        log_device_placement=True,
+        inter_op_parallelism_threads=1)
+
+    #config.graph_options.optimizer_options.global_jit_level = jit_level
+    run_metadata = tf.RunMetadata()
+
+    #init_op = tf.global_variables_initializer()
+    print("Variables initialized ...")
+
+    # The MonitoredTrainingSession takes care of session initialization
+    with tf.train.MonitoredTrainingSession(
+            hooks=hooks, config=config) as mon_sess:
+        start = time.time()
+        train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
+        while not mon_sess.should_stop():
+            # Train
+            batch_xs, batch_ys = mnist.train.next_batch(100)
+            mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
+
+            # Test trained model
+            if not mon_sess.should_stop():
+                print(
+                    "Accuracy: ",
+                    mon_sess.run(
+                        accuracy,
+                        feed_dict={
+                            x: mnist.test.images,
+                            y_: mnist.test.labels
+                        }))
+
+        end = time.time()
+
+        if hvd.rank() == 0:
+            print("Training time: %f seconds" % (end - start))
+
 
 if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
-  parser.add_argument('--log_dir',type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
-                      help='Summaries log directory')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--data_dir',
+        type=str,
+        default='/tmp/tensorflow/mnist/input_data',
+        help='Directory for storing input data')
+    parser.add_argument(
+        '--log_dir',
+        type=str,
+        default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+        help='Summaries log directory')
+    FLAGS, unparsed = parser.parse_known_args()
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
 # run comand for this distributed script
-# mpirun -np 2 python mnist_softmax_distributed.py
+# mpirun -np 2 python mnist_softmax_distributed.py

From dda11aa12e68b6ebae64292515cd3198d0c3ca21 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Fri, 21 Dec 2018 10:26:03 -0800
Subject: [PATCH 16/19] Fix python file format

---
 examples/mnist/mnist_softmax_distributed.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 3d50eb98..78e93a23 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -110,14 +110,13 @@ def run_mnist(_):
 
             # Test trained model
             if not mon_sess.should_stop():
-                print(
-                    "Accuracy: ",
-                    mon_sess.run(
-                        accuracy,
-                        feed_dict={
-                            x: mnist.test.images,
-                            y_: mnist.test.labels
-                        }))
+                print("Accuracy: ",
+                      mon_sess.run(
+                          accuracy,
+                          feed_dict={
+                              x: mnist.test.images,
+                              y_: mnist.test.labels
+                          }))
 
         end = time.time()
From 6caf97782d3acd7861ca672aaba0fd2f5432faf1 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Fri, 21 Dec 2018 11:50:18 -0800
Subject: [PATCH 17/19] Add mnist data directory

---
 examples/mnist/mnist_softmax_distributed.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 78e93a23..a902e4e6 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -45,7 +45,6 @@ def run_mnist(_):
     # Import data
     mnist = learn.datasets.mnist.read_data_sets(
         FLAGS.data_dir + 'MNIST-data-%d' % hvd.rank(), one_hot=True)
-    #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 
     # Create the model
     with tf.name_scope("mnist_placholder"):
@@ -139,4 +138,4 @@ def run_mnist(_):
     FLAGS, unparsed = parser.parse_known_args()
     tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
 # run comand for this distributed script
-# mpirun -np 2 python mnist_softmax_distributed.py
+# mpirun -np 2 python mnist_softmax_distributed.py --data_dir=/mnt/data/mnist

From c32b3893a125ecd4f2d6878159ca8dc2b25dd160 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 26 Dec 2018 16:41:00 -0800
Subject: [PATCH 18/19] Integrate with mlsl

---
 CMakeLists.txt               | 5 ++---
 src/ngraph_encapsulate_op.cc | 7 ++++---
 src/ngraph_rewrite_pass.cc   | 7 ++++---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b75a3863..f5142773 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -222,10 +222,9 @@ message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}")
 message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}")
 
 if(NGRAPH_DISTRIBUTED_ENABLE)
-  find_package(MPI REQUIRED)
+  #get_target_property(MLSL_INCLUDE_DIR libmlsl INTERFACE_INCLUDE_DIRECTORIES)
+  #list(APPEND HEADER_SEARCH_DEFINES MLSL_HEADER_PATH="${MLSL_INCLUDE_DIR}")
   add_definitions(-DNGRAPH_DISTRIBUTED)
-  include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
-  link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
 endif()
 
diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc
index 345a6522..db54f45f 100644
--- a/src/ngraph_encapsulate_op.cc
+++ b/src/ngraph_encapsulate_op.cc
@@ -40,7 +40,7 @@
 #include "ngraph/runtime/interpreter/int_backend.hpp"
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>
+#include "ngraph/mlsl.hpp"
 #endif
 
 using namespace std;
@@ -270,8 +270,9 @@ class NGraphEncapsulateOp : public OpKernel {
       std::string file_name =
           "tf_function_" + ctx->op_kernel().name() + ".json";
 #ifdef NGRAPH_DISTRIBUTED
-      int Rank_ID;
-      MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+      MLSL::Environment* mlsl_env;
+      MLSL::Distribution* mlsl_dist;
+      int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
       file_name = "tf_function_" + ctx->op_kernel().name() + "_" +
                   to_string(Rank_ID) + ".json";
 #endif
diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc
index 32c473f1..f494b3f6 100644
--- a/src/ngraph_rewrite_pass.cc
+++ b/src/ngraph_rewrite_pass.cc
@@ -30,7 +30,7 @@
 #include
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>
+#include "ngraph/mlsl.h"
 #endif
 
 using namespace std;
@@ -107,8 +107,9 @@ class NGraphRewritePass : public GraphOptimizationPass {
     std::stringstream ss;
     ss << kind << "_" << std::setfill('0') << std::setw(4) << idx;
 #ifdef NGRAPH_DISTRIBUTED
-    int Rank_ID;
-    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    MLSL::Environment* mlsl_env;
+    MLSL::Distribution* mlsl_dist;
+    int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
     ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
 #endif
     return ss.str();
   }
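
Note: patch 18 replaces the bridge's direct MPI usage with Intel MLSL, which the
nGraph distributed build brings along and initializes itself, hence the CMake hunk
dropping find_package(MPI) and the MPI include/link directories. The rank query
reduces to a one-liner; a minimal sketch of the substitution (assuming
MLSL::Environment has been initialized by nGraph before this code runs):

    // Sketch: MLSL process index in place of MPI_Comm_rank.
    #include "ngraph/mlsl.hpp"

    int ProcessRank() {
      // GetProcessIdx() returns this process's index in the MLSL environment,
      // playing the role MPI_Comm_rank(MPI_COMM_WORLD, ...) played before.
      return static_cast<int>(MLSL::Environment::GetEnv().GetProcessIdx());
    }

The unused mlsl_env/mlsl_dist pointers patch 18 declares are leftovers; patch 19
below deletes them and also corrects the "ngraph/mlsl.h" spelling of the header to
"ngraph/mlsl.hpp".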
++++++++++++++++--- src/ngraph_encapsulate_op.cc | 2 -- src/ngraph_rewrite_pass.cc | 7 ++----- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 73e9c0fd..84d94714 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,8 +227,6 @@ message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}") message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}") if(NGRAPH_DISTRIBUTED_ENABLE) - #get_target_property(MLSL_INCLUDE_DIR libmlsl INTERFACE_INCLUDE_DIRECTORIES) - #list(APPEND HEADER_SEARCH_DEFINES MLSL_HEADER_PATH="${MLSL_INCLUDE_DIR}") add_definitions(-DNGRAPH_DISTRIBUTED) endif() diff --git a/build_ngtf.py b/build_ngtf.py index dc0918af..67169f26 100755 --- a/build_ngtf.py +++ b/build_ngtf.py @@ -388,6 +388,11 @@ def main(): action="store_true" ) + parser.add_argument( + '--distributed_build', + help="Builds a distributed version of the nGraph components\n", + action="store_true") + arguments = parser.parse_args() if (arguments.debug_build): @@ -403,7 +408,7 @@ def main(): #------------------------------- # Component versions - ngraph_version = "v0.11.0" + ngraph_version = "master" #"v0.11.0" tf_version = "v1.12.0" # Default directories @@ -467,7 +472,6 @@ def main(): ngraph_cmake_flags = [ "-DNGRAPH_INSTALL_PREFIX=" + artifacts_location, - "-DNGRAPH_DISTRIBUTED_ENABLE=FALSE", "-DNGRAPH_USE_CXX_ABI=" + cxx_abi, "-DNGRAPH_UNIT_TEST_ENABLE=NO", "-DNGRAPH_DEX_ONLY=TRUE", @@ -485,6 +489,11 @@ def main(): if (arguments.debug_build): ngraph_cmake_flags.extend(["-DCMAKE_BUILD_TYPE=Debug"]) + if (arguments.distributed_build): + ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=TRUE"]) + else: + ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=FALSE"]) + build_ngraph("./ngraph", ngraph_cmake_flags, verbosity) # Next build CMAKE options for the bridge @@ -495,13 +504,17 @@ def main(): "-DNGRAPH_TUNE_ARCH=" + target_arch, "-DNGRAPH_ARTIFACTS_DIR=" + artifacts_location, "-DUNIT_TEST_ENABLE=ON", - "-DNGRAPH_DISTRIBUTED_ENABLE=TRUE", "-DTF_SRC_DIR=" + tf_src_dir, "-DUNIT_TEST_TF_CC_DIR=" + os.path.join( artifacts_location, "tensorflow") ] if (arguments.debug_build): ngraph_tf_cmake_flags.extend(["-DCMAKE_BUILD_TYPE=Debug"]) + if (arguments.distributed_build): + ngraph_tf_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=TRUE"]) + else: + ngraph_tf_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=FALSE"]) + # Now build the bridge ng_tf_whl = build_ngraph_tf(artifacts_location, "../", venv_dir, ngraph_tf_cmake_flags, verbosity) diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc index db54f45f..1f780973 100644 --- a/src/ngraph_encapsulate_op.cc +++ b/src/ngraph_encapsulate_op.cc @@ -270,8 +270,6 @@ class NGraphEncapsulateOp : public OpKernel { std::string file_name = "tf_function_" + ctx->op_kernel().name() + ".json"; #ifdef NGRAPH_DISTRIBUTED - MLSL::Environment* mlsl_env; - MLSL::Distribution* mlsl_dist; int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx(); file_name = "tf_function_" + ctx->op_kernel().name() + "_" + to_string(Rank_ID) + ".json"; diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc index f494b3f6..a54312c0 100644 --- a/src/ngraph_rewrite_pass.cc +++ b/src/ngraph_rewrite_pass.cc @@ -30,7 +30,7 @@ #include #ifdef NGRAPH_DISTRIBUTED -#include "ngraph/mlsl.h" +#include "ngraph/mlsl.hpp" #endif using namespace std; @@ -107,8 +107,6 @@ class NGraphRewritePass : public GraphOptimizationPass { std::stringstream ss; ss << kind << "_" << std::setfill('0') << std::setw(4) << idx; #ifdef 
-    MLSL::Environment* mlsl_env;
-    MLSL::Distribution* mlsl_dist;
     int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
     ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
 #endif
@@ -120,8 +118,7 @@ class NGraphRewritePass : public GraphOptimizationPass {
     ss << GraphFilenamePrefix(kind, idx) << "_" << std::setfill('0')
       << std::setw(4) << sub_idx;
 #ifdef NGRAPH_DISTRIBUTED
-    int Rank_ID;
-    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
     ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
 #endif
     return ss.str();
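
Note: with patch 19 applied, a distributed build is selected from the top-level
build script rather than hard-coded in the CMake files. Based on the flag added
above, the expected invocation is e.g. python build_ngtf.py --distributed_build;
omitting the flag passes -DNGRAPH_DISTRIBUTED_ENABLE=FALSE to both nGraph and the
bridge, restoring the default single-process build this series started from.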