From 3878e4318e8f0ba83e5b3383c8ac798e26fee6fa Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 21 Nov 2018 13:03:57 -0800
Subject: [PATCH 01/19] Fix new klocwork error reports on 11/20/2018

---
 src/ngraph_deassign_clusters.cc | 1 +
 src/tf_deadness_analysis.cc     | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ngraph_deassign_clusters.cc b/src/ngraph_deassign_clusters.cc
index 856e0a23..7e78e47e 100644
--- a/src/ngraph_deassign_clusters.cc
+++ b/src/ngraph_deassign_clusters.cc
@@ -75,6 +75,7 @@ static void MaybeLogPlacement(const Graph* graph) {
     }
     final_cluster_map[cluster_idx].insert(node);
   }
+  if (number_of_nodes == 0) return;
   int perc_marked_for_clustering_of_total =
       (int)((nodes_marked_for_clustering * 100.0) / number_of_nodes);
diff --git a/src/tf_deadness_analysis.cc b/src/tf_deadness_analysis.cc
index 07fd50f7..539c04ae 100644
--- a/src/tf_deadness_analysis.cc
+++ b/src/tf_deadness_analysis.cc
@@ -490,7 +490,9 @@ Status DeadnessAnalysisImpl::GetNodePredicate(const Node& node,
   }
   // All outputs have the same predicate
-  pred_string = pred->ToString();
+  if (pred != nullptr) {
+    pred_string = pred->ToString();
+  }
   return Status::OK();
 }
@@ -530,4 +532,4 @@ DeadnessAnalysis::~DeadnessAnalysis() {}
 }  // namespace ngraph_bridge
 }  // namespace tensorflow
-#endif  // NGRAPH_TF_DISABLE_DEADNESS_CHECK
\ No newline at end of file
+#endif  // NGRAPH_TF_DISABLE_DEADNESS_CHECK

From 4decf5798d26019b911740d31ce28bbefb996748 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Tue, 4 Dec 2018 10:24:27 -0800
Subject: [PATCH 02/19] Fix the FusedBatchNorm with the Bessel correction in
 variance

---
 src/ngraph_builder.cc | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc
index 0c5ae65e..5a5a05f0 100644
--- a/src/ngraph_builder.cc
+++ b/src/ngraph_builder.cc
@@ -1724,12 +1724,31 @@ static Status TranslateFusedBatchNormOp(
     ng_y = make_shared<ng::op::GetOutputElement>(ng_batch_norm, 0);
     ng_mean = make_shared<ng::op::GetOutputElement>(ng_batch_norm, 1);
     ng_variance = make_shared<ng::op::GetOutputElement>(ng_batch_norm, 2);
+    // This is for Bessel's correction in ng_variance:
+    size_t ng_input_size = 1.0;
+    size_t ng_scale_size = 1.0;
+    for (size_t i=0; i<ng_input->get_shape().size(); i++) {
+      ng_input_size *= ng_input->get_shape()[i];
+    }
+    for (size_t i=0; i<ng_scale->get_shape().size(); i++) {
+      ng_scale_size *= ng_scale->get_shape()[i];
+    }
+    size_t sample_size = ng_input_size/ng_scale_size;
+    size_t sample_size_minus_one = sample_size > 1.0 ? (sample_size - 1.0) : 1.0;
+    std::vector<float> sample_values(ng::shape_size(ng_variance->get_shape()), sample_size);
+    auto sample = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+        sample_values);
+    std::vector<float> minus_values(ng::shape_size(ng_variance->get_shape()), sample_size_minus_one);
+    auto minus_one = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+        minus_values);
+    auto Bess_scale = sample/minus_one;
+    auto variance = ng_variance * Bess_scale;
 
     BatchToTensorflow(is_nhwc, ng_y);
 
     SaveNgOp(ng_op_map, op->name(), ng_y);
     SaveNgOp(ng_op_map, op->name(), ng_mean);
-    SaveNgOp(ng_op_map, op->name(), ng_variance);
+    SaveNgOp(ng_op_map, op->name(), variance);
 
     // Output reserve_space_1: A 1D Tensor for the computed batch mean, to be
     // reused in the gradient computation.
     SaveNgOp(ng_op_map, op->name(), ng_mean);
@@ -1744,7 +1763,7 @@ static Status TranslateFusedBatchNormOp(
     SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   }
 
-  SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
+  //SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   return Status::OK();
 }

From cfb0a83a2ee505542f7d8067a3be55649bbaade7 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Tue, 4 Dec 2018 10:35:04 -0800
Subject: [PATCH 03/19] Fix the format

---
 src/ngraph_builder.cc | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/ngraph_builder.cc b/src/ngraph_builder.cc
index 5a5a05f0..2d6c47a3 100644
--- a/src/ngraph_builder.cc
+++ b/src/ngraph_builder.cc
@@ -1727,21 +1727,26 @@ static Status TranslateFusedBatchNormOp(
     // This is for Bessel's correction in ng_variance:
     size_t ng_input_size = 1.0;
     size_t ng_scale_size = 1.0;
-    for (size_t i=0; i<ng_input->get_shape().size(); i++) {
-      ng_input_size *= ng_input->get_shape()[i];
+    for (size_t i = 0; i < ng_input->get_shape().size(); i++) {
+      ng_input_size *= ng_input->get_shape()[i];
     }
-    for (size_t i=0; i<ng_scale->get_shape().size(); i++) {
-      ng_scale_size *= ng_scale->get_shape()[i];
+    for (size_t i = 0; i < ng_scale->get_shape().size(); i++) {
+      ng_scale_size *= ng_scale->get_shape()[i];
     }
-    size_t sample_size = ng_input_size/ng_scale_size;
-    size_t sample_size_minus_one = sample_size > 1.0 ? (sample_size - 1.0) : 1.0;
-    std::vector<float> sample_values(ng::shape_size(ng_variance->get_shape()), sample_size);
-    auto sample = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+    size_t sample_size = ng_input_size / ng_scale_size;
+    size_t sample_size_minus_one =
+        sample_size > 1.0 ? (sample_size - 1.0) : 1.0;
+    std::vector<float> sample_values(ng::shape_size(ng_variance->get_shape()),
+                                     sample_size);
+    auto sample = std::make_shared<ng::op::Constant>(
+        ng_variance->get_element_type(), ng_variance->get_shape(),
         sample_values);
-    std::vector<float> minus_values(ng::shape_size(ng_variance->get_shape()), sample_size_minus_one);
-    auto minus_one = std::make_shared<ng::op::Constant>(ng_variance->get_element_type(), ng_variance->get_shape(),
+    std::vector<float> minus_values(ng::shape_size(ng_variance->get_shape()),
+                                    sample_size_minus_one);
+    auto minus_one = std::make_shared<ng::op::Constant>(
+        ng_variance->get_element_type(), ng_variance->get_shape(),
         minus_values);
-    auto Bess_scale = sample/minus_one;
+    auto Bess_scale = sample / minus_one;
     auto variance = ng_variance * Bess_scale;
 
     BatchToTensorflow(is_nhwc, ng_y);
@@ -1763,7 +1768,7 @@ static Status TranslateFusedBatchNormOp(
     SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   }
 
-  //SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
+  // SaveNgOp(ng_op_map, op->name(), ng_batch_norm);
   return Status::OK();
 }
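
Note: the correction these two patches implement can be stated compactly. nGraph's
BatchNorm emits the biased (population) variance, while TensorFlow's FusedBatchNorm
reports the Bessel-corrected (sample) variance, so the translated op rescales

    variance_tf = variance_ng * N / (N - 1)

where N = num_elements(input) / num_elements(scale) is the number of samples per
channel. For example (shapes illustrative, not taken from the patch), an NHWC input
of shape [64, 28, 28, 32] with a scale of shape [32] gives N = 64 * 28 * 28 = 50176
and a correction factor of 50176 / 50175 ≈ 1.00002.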
CPU backend" TRUE) option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE) # Validate the options @@ -222,6 +222,12 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}") message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}") message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}") +if(NGRAPH_DISTRIBUTED_ENABLE) + find_package(MPI REQUIRED) + add_definitions(-DNGRAPH_DISTRIBUTED) + include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}) + link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES}) +endif() # Find and build ngraph - if not using pre-built one if (NOT USE_PRE_BUILT_NGRAPH) ExternalProject_Add( From 2163ac10df77b2f9bf142768bb0d35036fe5c2e8 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 6 Dec 2018 12:13:53 -0800 Subject: [PATCH 05/19] Add multi-node .json file output --- src/ngraph_encapsulate_op.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc index 559de1ba..9b533791 100644 --- a/src/ngraph_encapsulate_op.cc +++ b/src/ngraph_encapsulate_op.cc @@ -39,6 +39,10 @@ #include "ngraph/runtime/interpreter/int_backend.hpp" +#ifdef NGRAPH_DISTRIBUTED +#include > +#endif + using namespace std; namespace ng = ngraph; @@ -265,6 +269,12 @@ class NGraphEncapsulateOp : public OpKernel { if (std::getenv("NGRAPH_ENABLE_SERIALIZE") != nullptr) { std::string file_name = "tf_function_" + ctx->op_kernel().name() + ".json"; +#ifdef NGRAPH_DISTRIBUTED + int Rank_ID; + MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID); + file_name = + "tf_function_" + ctx->op_kernel().name() + "_" + to_string(Rank_ID) + ".json"; +#endif NGRAPH_VLOG(0) << "Serializing graph to: " << file_name; std::string js = ngraph::serialize(ng_function, 4); std::ofstream f; From 2e70b9703f21116220e2eb259643f639ce2c6e3c Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 6 Dec 2018 12:21:34 -0800 Subject: [PATCH 06/19] Change CMake file to be consistent with master --- CMakeLists.txt | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6946bce2..86e5b5de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,7 +144,7 @@ option(USE_PRE_BUILT_NGRAPH "Use pre-built nGraph located in NGRAPH_ARTIFACTS_DI option(NGRAPH_ARTIFACTS_DIR "Where would nGraph be installed after build" FALSE) option(UNIT_TEST_ENABLE "Control the building of unit tests" FALSE) option(UNIT_TEST_TF_CC_DIR "Location where TensorFlow CC library is located" ) -option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" TRUE) +option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE) option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE) # Validate the options @@ -222,12 +222,6 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}") message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}") message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}") -if(NGRAPH_DISTRIBUTED_ENABLE) - find_package(MPI REQUIRED) - add_definitions(-DNGRAPH_DISTRIBUTED) - include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}) - link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES}) -endif() # Find and build ngraph - if not using pre-built one if (NOT USE_PRE_BUILT_NGRAPH) ExternalProject_Add( From 1c34c3d249c4bec51e0f3e416690f11916deadb9 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 6 Dec 2018 12:23:19 -0800 Subject: [PATCH 07/19] Format change --- src/ngraph_encapsulate_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
From 2e70b9703f21116220e2eb259643f639ce2c6e3c Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Thu, 6 Dec 2018 12:21:34 -0800
Subject: [PATCH 06/19] Change CMake file to be consistent with master

---
 CMakeLists.txt | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6946bce2..86e5b5de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -144,7 +144,7 @@ option(USE_PRE_BUILT_NGRAPH "Use pre-built nGraph located in NGRAPH_ARTIFACTS_DI
 option(NGRAPH_ARTIFACTS_DIR "Where would nGraph be installed after build" FALSE)
 option(UNIT_TEST_ENABLE "Control the building of unit tests" FALSE)
 option(UNIT_TEST_TF_CC_DIR "Location where TensorFlow CC library is located" )
-option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" TRUE)
+option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE)
 option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE)
 
 # Validate the options
@@ -222,12 +222,6 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}")
 message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}")
 message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}")
 
-if(NGRAPH_DISTRIBUTED_ENABLE)
-  find_package(MPI REQUIRED)
-  add_definitions(-DNGRAPH_DISTRIBUTED)
-  include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
-  link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
-endif()
 # Find and build ngraph - if not using pre-built one
 if (NOT USE_PRE_BUILT_NGRAPH)
   ExternalProject_Add(

From 1c34c3d249c4bec51e0f3e416690f11916deadb9 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Thu, 6 Dec 2018 12:23:19 -0800
Subject: [PATCH 07/19] Format change

---
 src/ngraph_encapsulate_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc
index 9b533791..8b9d6228 100644
--- a/src/ngraph_encapsulate_op.cc
+++ b/src/ngraph_encapsulate_op.cc
@@ -272,8 +272,8 @@ class NGraphEncapsulateOp : public OpKernel {
 #ifdef NGRAPH_DISTRIBUTED
       int Rank_ID;
       MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
-      file_name =
-          "tf_function_" + ctx->op_kernel().name() + "_" + to_string(Rank_ID) + ".json";
+      file_name = "tf_function_" + ctx->op_kernel().name() + "_" +
+                  to_string(Rank_ID) + ".json";
 #endif
       NGRAPH_VLOG(0) << "Serializing graph to: " << file_name;
       std::string js = ngraph::serialize(ng_function, 4);

From 70449bee471ccf10a449af487dda8fdc2ab80f2d Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Mon, 10 Dec 2018 12:26:24 -0800
Subject: [PATCH 08/19] Add a simple distributed mnist model

---
 examples/mnist/mnist_softmax_distributed.py | 124 ++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 examples/mnist/mnist_softmax_distributed.py

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
new file mode 100644
index 00000000..3233efd7
--- /dev/null
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -0,0 +1,124 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A very simple MNIST classifier.
+
+See extensive documentation at
+https://www.tensorflow.org/get_started/mnist/beginners
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+import time
+
+from tensorflow.examples.tutorials.mnist import input_data
+
+import tensorflow as tf
+import ngraph_bridge
+import horovod.tensorflow as hvd
+learn = tf.contrib.learn
+
+FLAGS = None
+
+hvd.init()
+
+def main(_):
+  run_mnist(_)
+
+def run_mnist(_):
+  # Import data
+  mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir+'MNIST-data-%d' % hvd.rank(),one_hot=True)
+  #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+
+  # Create the model
+  with tf.name_scope("mnist_placholder"):
+    x = tf.placeholder(tf.float32, [None, 784])
+    W = tf.Variable(tf.zeros([784, 10]))
+    b = tf.Variable(tf.zeros([10]))
+    y = tf.matmul(x, W) + b
+
+  # Define loss and optimizer
+  y_ = tf.placeholder(tf.float32, [None, 10])
+
+  # The raw formulation of cross-entropy,
+  #
+  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
+  #                                 reduction_indices=[1]))
+  #
+  # can be numerically unstable.
+  #
+  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+  # outputs of 'y', and then average across the batch.
+  cross_entropy = tf.reduce_mean(
+      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  #global_step = tf.train.get_or_create_global_step()
+  global_step = tf.contrib.framework.get_or_create_global_step()
+  opt = tf.train.GradientDescentOptimizer(0.5)
+  # Add MPI Distributed Optimizer
+  #import pdb; pdb.set_trace()
+  with tf.name_scope("horovod_opt"):
+    opt = hvd.DistributedOptimizer(opt)
+  train_step = opt.minimize(cross_entropy, global_step=global_step)
+
+  # The StopAtStepHook handles stopping after running given steps.
+  hooks=[tf.train.StopAtStepHook(last_step=10)]
+
+  # Test trained model
+  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+
+  # Enable soft placement and tracing as needed
+  config = tf.ConfigProto(
+      allow_soft_placement=True,
+      log_device_placement=True,
+      inter_op_parallelism_threads=1)
+
+  #config.graph_options.optimizer_options.global_jit_level = jit_level
+  run_metadata = tf.RunMetadata()
+
+  init_op = tf.global_variables_initializer()
+  print("Variables initialized ...")
+
+  # The MonitoredTrainingSession takes care of session initialization
+  with tf.train.MonitoredTrainingSession(hooks=hooks,
+                                         config=config) as mon_sess:
+    start = time.time()
+    train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
+    while not mon_sess.should_stop():
+      # Train
+      batch_xs, batch_ys = mnist.train.next_batch(100)
+      mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
+
+      # Test trained model
+      if not mon_sess.should_stop():
+        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
+                                                              y_: mnist.test.labels}))
+
+    end = time.time()
+
+    if hvd.rank() == 0:
+      print("Training time: %f seconds" % (end-start))
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
+                      help='Directory for storing input data')
+  parser.add_argument('--log_dir',type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+                      help='Summaries log directory')
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
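
Note: two details of this example are easy to miss. First, hvd.init() runs at
import time, so hvd.rank() is already usable while the graph is being built.
Second, the data directory is suffixed with the rank ('MNIST-data-%d' % hvd.rank())
so each worker downloads and extracts MNIST into a private path instead of racing
on a shared one. The script is launched with one process per rank, e.g.
mpirun -np 2 python mnist_softmax_distributed.py, the command a later patch in this
series records in the file itself.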
From 73cf9d08fa7cd7824a9ca9acbe1a3ae2cdc154e9 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 19 Dec 2018 12:24:20 -0800
Subject: [PATCH 09/19] Add distributed option for Makefile

---
 CMakeLists.txt | 54 ++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c206df2f..6946bce2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,9 +71,9 @@ execute_process(
   OUTPUT_STRIP_TRAILING_WHITESPACE
 )
 
-if(NOT APPLE)
+if(NOT APPLE)
   # FIXME: Doesn't work for Ubuntu
-  execute_process(COMMAND cat /etc/os-release
+  execute_process(COMMAND cat /etc/os-release
     OUTPUT_VARIABLE LSB_RELEASE_ID_SHORT
     OUTPUT_STRIP_TRAILING_WHITESPACE
   )
@@ -106,7 +106,7 @@ endif()
 # Therefore we are setting two entries in the RPATH:
 # 1. $ORIGIN/.
 # 2. $ORIGIN/../tensorflow/
-#
+#
 set(CMAKE_MACOSX_RPATH 1)
 if(APPLE)
   set(CMAKE_INSTALL_RPATH "@loader_path/;@loader_path/../tensorflow;")
@@ -120,18 +120,19 @@ endif()
 find_package(TensorFlow REQUIRED)
 
 add_library(tensorflow_framework_lib SHARED IMPORTED)
-set_target_properties(
-  tensorflow_framework_lib
-  PROPERTIES IMPORTED_LOCATION
+set_target_properties(
+  tensorflow_framework_lib
+  PROPERTIES IMPORTED_LOCATION
   ${TensorFlow_DIR}/libtensorflow_framework.so
 )
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   set(NGRAPH_TF_CXX11_ABI "${TensorFlow_CXX_ABI}")
   message( STATUS "nGraph-TensorFlow using CXX11 ABI: ${NGRAPH_TF_CXX11_ABI}" )
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=${NGRAPH_TF_CXX11_ABI}")
 endif()
 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=${NGRAPH_TF_CXX11_ABI}")
+
 if(APPLE)
   set(LIBNGRAPH_SO "libngraph.dylib")
 else()
@@ -143,7 +144,7 @@ option(USE_PRE_BUILT_NGRAPH "Use pre-built nGraph located in NGRAPH_ARTIFACTS_DI
 option(NGRAPH_ARTIFACTS_DIR "Where would nGraph be installed after build" FALSE)
 option(UNIT_TEST_ENABLE "Control the building of unit tests" FALSE)
 option(UNIT_TEST_TF_CC_DIR "Location where TensorFlow CC library is located" )
-option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" FALSE)
+option(NGRAPH_DISTRIBUTED_ENABLE "Add distributed mode to the CPU backend" TRUE)
 option(NGRAPH_PLAIDML_ENABLE "Build PlaidML backend" FALSE)
 
 # Validate the options
@@ -157,7 +158,7 @@ endif()
 if(USE_PRE_BUILT_NGRAPH)
   if(NOT NGRAPH_ARTIFACTS_DIR)
     message(
-      FATAL_ERROR
+      FATAL_ERROR
       "USE_PRE_BUILT_NGRAPH is ON but NGRAPH_ARTIFACTS_DIR is missing" )
   endif()
@@ -173,9 +174,9 @@ if(USE_PRE_BUILT_NGRAPH)
     ABSOLUTE
   )
   if (NOT EXISTS ${NGRAPH_ARTIFACTS_DIR})
-    message(FATAL_ERROR
+    message(FATAL_ERROR
       "nGraph artifacts directory doesn't exist: " ${NGRAPH_ARTIFACTS_DIR} )
-  endif()
+  endif()
 else()
   set(NGRAPH_ARTIFACTS_DIR ${CMAKE_CURRENT_BINARY_DIR}/ngraph/ngraph_dist)
 endif()
@@ -193,7 +194,7 @@ if(UNIT_TEST_TF_CC_DIR)
     ABSOLUTE
   )
   if (NOT EXISTS ${TF_PRE_BUILT_LOCATION})
-    message(FATAL_ERROR
+    message(FATAL_ERROR
       "TensorFlow pre-built directory doesn't exist: " ${TF_PRE_BUILT_LOCATION} )
   endif()
 endif()
@@ -201,14 +202,14 @@ endif()
 # Enable build target CPU features
 set(
-  NGRAPH_TARGET_ARCH
-  native
+  NGRAPH_TARGET_ARCH
+  native
   CACHE STRING "Target CPU architecture to build for.")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${NGRAPH_TARGET_ARCH}")
 
 set(
-  NGRAPH_TUNE_ARCH
-  native
+  NGRAPH_TUNE_ARCH
+  native
   CACHE STRING "Target CPU architecture to tune for")
 
 message(STATUS "UNIT_TEST_ENABLE: ${UNIT_TEST_ENABLE}")
@@ -221,21 +222,26 @@ message(STATUS "NGRAPH_PLAIDML_ENABLE: ${NGRAPH_PLAIDML_ENABLE}")
 message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}")
 message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}")
 
+if(NGRAPH_DISTRIBUTED_ENABLE)
+  find_package(MPI REQUIRED)
+  add_definitions(-DNGRAPH_DISTRIBUTED)
+  include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
+  link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
+endif()
 # Find and build ngraph - if not using pre-built one
 if (NOT USE_PRE_BUILT_NGRAPH)
   ExternalProject_Add(
     ext_ngraph
     GIT_REPOSITORY https://github.com/NervanaSystems/ngraph
-    GIT_TAG v0.11.0
+    GIT_TAG v0.10.1
     CMAKE_ARGS
         -DNGRAPH_DISTRIBUTED_ENABLE=${NGRAPH_DISTRIBUTED_ENABLE}
-        -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR}
+        -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR}
        -DNGRAPH_USE_CXX_ABI=${TensorFlow_CXX_ABI}
        -DNGRAPH_UNIT_TEST_ENABLE=FALSE
        -DNGRAPH_TOOLS_ENABLE=${NGRAPH_TOOLS_ENABLE}
-        -DNGRAPH_DEX_ONLY=TRUE
+        -DNGRAPH_DEX_ONLY=TRUE
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
        -DNGRAPH_GPU_ENABLE=${NGRAPH_GPU_ENABLE}
        -DNGRAPH_DEBUG_ENABLE=${NGRAPH_DEBUG_ENABLE}
        -DNGRAPH_PLAIDML_ENABLE=${NGRAPH_PLAIDML_ENABLE}
@@ -251,7 +257,7 @@
     UPDATE_COMMAND ""
     INSTALL_DIR "${CMAKE_INSTALL_PREFIX}"
   )
-endif()
+endif()
 
 set(NGRAPH_INSTALL_DIR ${NGRAPH_ARTIFACTS_DIR})
@@ -262,9 +268,9 @@ else()
 endif()
 
 add_library(ngraph_lib SHARED IMPORTED)
-set_target_properties(
-  ngraph_lib
-  PROPERTIES IMPORTED_LOCATION
+set_target_properties(
+  ngraph_lib
+  PROPERTIES IMPORTED_LOCATION
   ${NGRAPH_IMPORTED_LOCATION}
 )

From 2c594ee55a064982e5f1261da09b2d2a73ed3789 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 19 Dec 2018 12:26:39 -0800
Subject: [PATCH 10/19] modify distributed example

---
 examples/mnist/mnist_softmax_distributed.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 3233efd7..8440c84f 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -76,7 +76,9 @@ def run_mnist(_):
   train_step = opt.minimize(cross_entropy, global_step=global_step)
 
   # The StopAtStepHook handles stopping after running given steps.
-  hooks=[tf.train.StopAtStepHook(last_step=10)]
+  hooks=[
+     hvd.BroadcastGlobalVariablesHook(0),
+     tf.train.StopAtStepHook(last_step=10)]
 
   # Test trained model
   correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
@@ -91,7 +93,7 @@ def run_mnist(_):
   #config.graph_options.optimizer_options.global_jit_level = jit_level
   run_metadata = tf.RunMetadata()
 
-  init_op = tf.global_variables_initializer()
+  #init_op = tf.global_variables_initializer()
   print("Variables initialized ...")
 
   # The MonitoredTrainingSession takes care of session initialization
@@ -105,9 +107,9 @@ def run_mnist(_):
       mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
       # Test trained model
-      if not mon_sess.should_stop():
-        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
-                                                              y_: mnist.test.labels}))
+#      if not mon_sess.should_stop():
+#        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
+#                                                              y_: mnist.test.labels}))
 
     end = time.time()

From d7882472bbfd4c5e62d521e1fc49be9dbc579d1f Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 19 Dec 2018 16:01:42 -0800
Subject: [PATCH 11/19] Add distributed flags for multi-process graph dumps

---
 src/ngraph_rewrite_pass.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc
index 912cc4de..174e8a2d 100644
--- a/src/ngraph_rewrite_pass.cc
+++ b/src/ngraph_rewrite_pass.cc
@@ -29,6 +29,10 @@
 #include
 
+#ifdef NGRAPH_DISTRIBUTED
+#include <mpi.h>>
+#endif
+
 using namespace std;
 
 namespace tensorflow {
@@ -102,6 +106,11 @@ class NGraphRewritePass : public GraphOptimizationPass {
   static std::string GraphFilenamePrefix(std::string kind, int idx) {
     std::stringstream ss;
     ss << kind << "_" << std::setfill('0') << std::setw(4) << idx;
+#ifdef NGRAPH_DISTRIBUTED
+    int Rank_ID;
+    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
+#endif
     return ss.str();
   }
   static std::string GraphFilenamePrefix(std::string kind, int idx,
@@ -109,6 +118,11 @@ static std::string GraphFilenamePrefix(std::string kind, int idx,
                                          int sub_idx) {
     std::stringstream ss;
     ss << GraphFilenamePrefix(kind, idx) << "_" << std::setfill('0')
       << std::setw(4) << sub_idx;
+#ifdef NGRAPH_DISTRIBUTED
+    int Rank_ID;
+    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
+#endif
     return ss.str();
   }
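
Note: the effect of patch 11 is that every dumped graph name gains a zero-padded
rank component alongside the existing pass index. A prefix that formatted as, say,
mygraph_0003 on a single process becomes mygraph_0003_0000 on rank 0 and
mygraph_0003_0001 on rank 1 (names illustrative). As with the .json serialization
in patch 05, this keeps multi-process runs from clobbering one another's per-pass
graph dumps in a shared working directory.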
-march=${NGRAPH_TARGET_ARCH}") set( - NGRAPH_TUNE_ARCH - native + NGRAPH_TUNE_ARCH + native CACHE STRING "Target CPU architecture to tune for") message(STATUS "UNIT_TEST_ENABLE: ${UNIT_TEST_ENABLE}") @@ -228,20 +227,22 @@ if(NGRAPH_DISTRIBUTED_ENABLE) include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH}) link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES}) endif() + # Find and build ngraph - if not using pre-built one if (NOT USE_PRE_BUILT_NGRAPH) ExternalProject_Add( ext_ngraph GIT_REPOSITORY https://github.com/NervanaSystems/ngraph - GIT_TAG v0.10.1 + GIT_TAG v0.11.0 CMAKE_ARGS -DNGRAPH_DISTRIBUTED_ENABLE=${NGRAPH_DISTRIBUTED_ENABLE} - -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR} + -DNGRAPH_INSTALL_PREFIX=${NGRAPH_ARTIFACTS_DIR} -DNGRAPH_USE_CXX_ABI=${TensorFlow_CXX_ABI} -DNGRAPH_UNIT_TEST_ENABLE=FALSE -DNGRAPH_TOOLS_ENABLE=${NGRAPH_TOOLS_ENABLE} - -DNGRAPH_DEX_ONLY=TRUE + -DNGRAPH_DEX_ONLY=TRUE -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DNGRAPH_GPU_ENABLE=${NGRAPH_GPU_ENABLE} -DNGRAPH_DEBUG_ENABLE=${NGRAPH_DEBUG_ENABLE} -DNGRAPH_PLAIDML_ENABLE=${NGRAPH_PLAIDML_ENABLE} @@ -257,7 +258,7 @@ if (NOT USE_PRE_BUILT_NGRAPH) UPDATE_COMMAND "" INSTALL_DIR "${CMAKE_INSTALL_PREFIX}" ) -endif() +endif() set(NGRAPH_INSTALL_DIR ${NGRAPH_ARTIFACTS_DIR}) @@ -268,9 +269,9 @@ else() endif() add_library(ngraph_lib SHARED IMPORTED) -set_target_properties( - ngraph_lib - PROPERTIES IMPORTED_LOCATION +set_target_properties( + ngraph_lib + PROPERTIES IMPORTED_LOCATION ${NGRAPH_IMPORTED_LOCATION} ) diff --git a/build_ngtf.py b/build_ngtf.py index fbb8db31..0b049328 100755 --- a/build_ngtf.py +++ b/build_ngtf.py @@ -456,7 +456,7 @@ def main(): ngraph_cmake_flags = [ "-DNGRAPH_INSTALL_PREFIX=" + artifacts_location, - "-DNGRAPH_DISTRIBUTED_ENABLE=FALSE", + "-DNGRAPH_DISTRIBUTED_ENABLE=TRUE", "-DNGRAPH_USE_CXX_ABI=" + cxx_abi, "-DNGRAPH_UNIT_TEST_ENABLE=NO", "-DNGRAPH_DEX_ONLY=TRUE", @@ -484,6 +484,7 @@ def main(): "-DNGRAPH_TUNE_ARCH=native", "-DNGRAPH_ARTIFACTS_DIR=" + artifacts_location, "-DUNIT_TEST_ENABLE=ON", + "-DNGRAPH_DISTRIBUTED_ENABLE=TRUE", "-DTF_SRC_DIR=" + tf_src_dir, "-DUNIT_TEST_TF_CC_DIR=" + os.path.join( artifacts_location, "tensorflow") ] From f205cb23969c0ef9540539fe23934f1c4fbc15e8 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 20 Dec 2018 16:08:23 -0800 Subject: [PATCH 13/19] Format fix --- src/ngraph_rewrite_pass.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc index 174e8a2d..f54413e5 100644 --- a/src/ngraph_rewrite_pass.cc +++ b/src/ngraph_rewrite_pass.cc @@ -29,7 +29,7 @@ #include -#ifdef NGRAPH_DISTRIBUTED +#ifdef NGRAPH_DISTRIBUTED #include > #endif @@ -121,7 +121,7 @@ class NGraphRewritePass : public GraphOptimizationPass { #ifdef NGRAPH_DISTRIBUTED int Rank_ID; MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID); - ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID; + ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID; #endif return ss.str(); } From 0ecd43be207ebe155459a74dce08936c880df741 Mon Sep 17 00:00:00 2001 From: jianyinglang Date: Thu, 20 Dec 2018 16:49:23 -0800 Subject: [PATCH 14/19] Fix the typo and delete debug comment and add run command --- examples/mnist/mnist_softmax_distributed.py | 9 +++++---- src/ngraph_encapsulate_op.cc | 2 +- src/ngraph_rewrite_pass.cc | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py 
index 8440c84f..3c0bab50 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -70,7 +70,6 @@ def run_mnist(_):
   global_step = tf.contrib.framework.get_or_create_global_step()
   opt = tf.train.GradientDescentOptimizer(0.5)
   # Add MPI Distributed Optimizer
-  #import pdb; pdb.set_trace()
   with tf.name_scope("horovod_opt"):
     opt = hvd.DistributedOptimizer(opt)
   train_step = opt.minimize(cross_entropy, global_step=global_step)
@@ -107,9 +106,9 @@ def run_mnist(_):
       mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
       # Test trained model
-#      if not mon_sess.should_stop():
-#        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
-#                                                              y_: mnist.test.labels}))
+      if not mon_sess.should_stop():
+        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
+                                                              y_: mnist.test.labels}))
 
     end = time.time()
@@ -124,3 +123,5 @@ def run_mnist(_):
                       help='Summaries log directory')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+# run comand for this distributed script
+# mpirun -np 2 python mnist_softmax_distributed.py
diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc
index 57ef219a..345a6522 100644
--- a/src/ngraph_encapsulate_op.cc
+++ b/src/ngraph_encapsulate_op.cc
@@ -40,7 +40,7 @@
 #include "ngraph/runtime/interpreter/int_backend.hpp"
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>>
+#include <mpi.h>
 #endif
 
 using namespace std;
diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc
index f54413e5..32c473f1 100644
--- a/src/ngraph_rewrite_pass.cc
+++ b/src/ngraph_rewrite_pass.cc
@@ -30,7 +30,7 @@
 #include
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>>
+#include <mpi.h>
 #endif
 
 using namespace std;

From 100c1294746ed141c0b1dcb678242e22a9a7d89d Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Thu, 20 Dec 2018 17:07:25 -0800
Subject: [PATCH 15/19] Fix the python file format

---
 examples/mnist/mnist_softmax_distributed.py | 182 +++++++++++---------
 1 file changed, 99 insertions(+), 83 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 3c0bab50..3d50eb98 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A very simple MNIST classifier.
 
 See extensive documentation at
@@ -37,91 +36,108 @@
 
 hvd.init()
 
+
 def main(_):
-  run_mnist(_)
+    run_mnist(_)
+
 
 def run_mnist(_):
-  # Import data
-  mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir+'MNIST-data-%d' % hvd.rank(),one_hot=True)
-  #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
-
-  # Create the model
-  with tf.name_scope("mnist_placholder"):
-    x = tf.placeholder(tf.float32, [None, 784])
-    W = tf.Variable(tf.zeros([784, 10]))
-    b = tf.Variable(tf.zeros([10]))
-    y = tf.matmul(x, W) + b
-
-  # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
-
-  # The raw formulation of cross-entropy,
-  #
-  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
-  #                                 reduction_indices=[1]))
-  #
-  # can be numerically unstable.
-  #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
-  # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
-  #global_step = tf.train.get_or_create_global_step()
-  global_step = tf.contrib.framework.get_or_create_global_step()
-  opt = tf.train.GradientDescentOptimizer(0.5)
-  # Add MPI Distributed Optimizer
-  with tf.name_scope("horovod_opt"):
-    opt = hvd.DistributedOptimizer(opt)
-  train_step = opt.minimize(cross_entropy, global_step=global_step)
-
-  # The StopAtStepHook handles stopping after running given steps.
-  hooks=[
-     hvd.BroadcastGlobalVariablesHook(0),
-     tf.train.StopAtStepHook(last_step=10)]
-
-  # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
-  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-
-  # Enable soft placement and tracing as needed
-  config = tf.ConfigProto(
-      allow_soft_placement=True,
-      log_device_placement=True,
-      inter_op_parallelism_threads=1)
-
-  #config.graph_options.optimizer_options.global_jit_level = jit_level
-  run_metadata = tf.RunMetadata()
-
-  #init_op = tf.global_variables_initializer()
-  print("Variables initialized ...")
-
-  # The MonitoredTrainingSession takes care of session initialization
-  with tf.train.MonitoredTrainingSession(hooks=hooks,
-                                         config=config) as mon_sess:
-    start = time.time()
-    train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
-    while not mon_sess.should_stop():
-      # Train
-      batch_xs, batch_ys = mnist.train.next_batch(100)
-      mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
-
-      # Test trained model
-#      if not mon_sess.should_stop():
-#        print("Accuracy: ", mon_sess.run(accuracy, feed_dict={x: mnist.test.images,
-#                                                              y_: mnist.test.labels}))
-
-    end = time.time()
-
-    if hvd.rank() == 0:
-      print("Training time: %f seconds" % (end-start))
+    # Import data
+    mnist = learn.datasets.mnist.read_data_sets(
+        FLAGS.data_dir + 'MNIST-data-%d' % hvd.rank(), one_hot=True)
+    #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+
+    # Create the model
+    with tf.name_scope("mnist_placholder"):
+        x = tf.placeholder(tf.float32, [None, 784])
+        W = tf.Variable(tf.zeros([784, 10]))
+        b = tf.Variable(tf.zeros([10]))
+        y = tf.matmul(x, W) + b
+
+    # Define loss and optimizer
+    y_ = tf.placeholder(tf.float32, [None, 10])
+
+    # The raw formulation of cross-entropy,
+    #
+    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
+    #                                 reduction_indices=[1]))
+    #
+    # can be numerically unstable.
+    #
+    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+    # outputs of 'y', and then average across the batch.
+    cross_entropy = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+    #global_step = tf.train.get_or_create_global_step()
+    global_step = tf.contrib.framework.get_or_create_global_step()
+    opt = tf.train.GradientDescentOptimizer(0.5)
+    # Add MPI Distributed Optimizer
+    with tf.name_scope("horovod_opt"):
+        opt = hvd.DistributedOptimizer(opt)
+    train_step = opt.minimize(cross_entropy, global_step=global_step)
+
+    # The StopAtStepHook handles stopping after running given steps.
+    hooks = [
+        hvd.BroadcastGlobalVariablesHook(0),
+        tf.train.StopAtStepHook(last_step=10)
+    ]
+
+    # Test trained model
+    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+
+    # Enable soft placement and tracing as needed
+    config = tf.ConfigProto(
+        allow_soft_placement=True,
+        log_device_placement=True,
+        inter_op_parallelism_threads=1)
+
+    #config.graph_options.optimizer_options.global_jit_level = jit_level
+    run_metadata = tf.RunMetadata()
+
+    #init_op = tf.global_variables_initializer()
+    print("Variables initialized ...")
+
+    # The MonitoredTrainingSession takes care of session initialization
+    with tf.train.MonitoredTrainingSession(
+            hooks=hooks, config=config) as mon_sess:
+        start = time.time()
+        train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
+        while not mon_sess.should_stop():
+            # Train
+            batch_xs, batch_ys = mnist.train.next_batch(100)
+            mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
+
+            # Test trained model
+            if not mon_sess.should_stop():
+                print(
+                    "Accuracy: ",
+                    mon_sess.run(
+                        accuracy,
+                        feed_dict={
+                            x: mnist.test.images,
+                            y_: mnist.test.labels
+                        }))
+
+        end = time.time()
+
+        if hvd.rank() == 0:
+            print("Training time: %f seconds" % (end - start))
+
 
 if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
-  parser.add_argument('--log_dir',type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
-                      help='Summaries log directory')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--data_dir',
+        type=str,
+        default='/tmp/tensorflow/mnist/input_data',
+        help='Directory for storing input data')
+    parser.add_argument(
+        '--log_dir',
+        type=str,
+        default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+        help='Summaries log directory')
+    FLAGS, unparsed = parser.parse_known_args()
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
 # run comand for this distributed script
-# mpirun -np 2 python mnist_softmax_distributed.py
+# mpirun -np 2 python mnist_softmax_distributed.py

From dda11aa12e68b6ebae64292515cd3198d0c3ca21 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Fri, 21 Dec 2018 10:26:03 -0800
Subject: [PATCH 16/19] Fix python file format

---
 examples/mnist/mnist_softmax_distributed.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 3d50eb98..78e93a23 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -110,14 +110,13 @@ def run_mnist(_):
 
             # Test trained model
             if not mon_sess.should_stop():
-                print(
-                    "Accuracy: ",
-                    mon_sess.run(
-                        accuracy,
-                        feed_dict={
-                            x: mnist.test.images,
-                            y_: mnist.test.labels
-                        }))
+                print("Accuracy: ",
+                      mon_sess.run(
+                          accuracy,
+                          feed_dict={
+                              x: mnist.test.images,
+                              y_: mnist.test.labels
+                          }))
 
         end = time.time()
From 6caf97782d3acd7861ca672aaba0fd2f5432faf1 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Fri, 21 Dec 2018 11:50:18 -0800
Subject: [PATCH 17/19] Add mnist data directory

---
 examples/mnist/mnist_softmax_distributed.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/mnist/mnist_softmax_distributed.py b/examples/mnist/mnist_softmax_distributed.py
index 78e93a23..a902e4e6 100644
--- a/examples/mnist/mnist_softmax_distributed.py
+++ b/examples/mnist/mnist_softmax_distributed.py
@@ -45,7 +45,6 @@ def run_mnist(_):
     # Import data
     mnist = learn.datasets.mnist.read_data_sets(
         FLAGS.data_dir + 'MNIST-data-%d' % hvd.rank(), one_hot=True)
-    #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 
     # Create the model
     with tf.name_scope("mnist_placholder"):
@@ -139,4 +138,4 @@ def run_mnist(_):
     FLAGS, unparsed = parser.parse_known_args()
     tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
 # run comand for this distributed script
-# mpirun -np 2 python mnist_softmax_distributed.py
+# mpirun -np 2 python mnist_softmax_distributed.py --data_dir=/mnt/data/mnist

From c32b3893a125ecd4f2d6878159ca8dc2b25dd160 Mon Sep 17 00:00:00 2001
From: jianyinglang
Date: Wed, 26 Dec 2018 16:41:00 -0800
Subject: [PATCH 18/19] Integrate with mlsl

---
 CMakeLists.txt               | 5 ++---
 src/ngraph_encapsulate_op.cc | 7 ++++---
 src/ngraph_rewrite_pass.cc   | 7 ++++---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b75a3863..f5142773 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -222,10 +222,9 @@ message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}")
 message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}")
 
 if(NGRAPH_DISTRIBUTED_ENABLE)
-  find_package(MPI REQUIRED)
+  #get_target_property(MLSL_INCLUDE_DIR libmlsl INTERFACE_INCLUDE_DIRECTORIES)
+  #list(APPEND HEADER_SEARCH_DEFINES MLSL_HEADER_PATH="${MLSL_INCLUDE_DIR}")
   add_definitions(-DNGRAPH_DISTRIBUTED)
-  include_directories(SYSTEM ${MPI_C_INCLUDE_PATH} ${MPI_CXX_INCLUDE_PATH})
-  link_directories(${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES})
 endif()
 
diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc
index 345a6522..db54f45f 100644
--- a/src/ngraph_encapsulate_op.cc
+++ b/src/ngraph_encapsulate_op.cc
@@ -40,7 +40,7 @@
 #include "ngraph/runtime/interpreter/int_backend.hpp"
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>
+#include "ngraph/mlsl.hpp"
 #endif
 
 using namespace std;
@@ -270,8 +270,9 @@ class NGraphEncapsulateOp : public OpKernel {
       std::string file_name =
           "tf_function_" + ctx->op_kernel().name() + ".json";
 #ifdef NGRAPH_DISTRIBUTED
-      int Rank_ID;
-      MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+      MLSL::Environment* mlsl_env;
+      MLSL::Distribution* mlsl_dist;
+      int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
       file_name = "tf_function_" + ctx->op_kernel().name() + "_" +
                   to_string(Rank_ID) + ".json";
 #endif
diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc
index 32c473f1..f494b3f6 100644
--- a/src/ngraph_rewrite_pass.cc
+++ b/src/ngraph_rewrite_pass.cc
@@ -30,7 +30,7 @@
 #include
 
 #ifdef NGRAPH_DISTRIBUTED
-#include <mpi.h>
+#include "ngraph/mlsl.h"
 #endif
 
 using namespace std;
@@ -107,8 +107,9 @@ class NGraphRewritePass : public GraphOptimizationPass {
     std::stringstream ss;
     ss << kind << "_" << std::setfill('0') << std::setw(4) << idx;
 #ifdef NGRAPH_DISTRIBUTED
-    int Rank_ID;
-    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    MLSL::Environment* mlsl_env;
+    MLSL::Distribution* mlsl_dist;
+    int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
     ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
 #endif
     return ss.str();
   }
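
Note: patch 18 replaces the bridge's direct MPI usage with Intel MLSL, which the
nGraph distributed build brings along and initializes itself, hence the CMake hunk
dropping find_package(MPI) and the MPI include/link directories. The rank query
reduces to a one-liner; a minimal sketch of the substitution (assuming
MLSL::Environment has been initialized by nGraph before this code runs):

    // Sketch: MLSL process index in place of MPI_Comm_rank.
    #include "ngraph/mlsl.hpp"

    int ProcessRank() {
      // GetProcessIdx() returns this process's index in the MLSL environment,
      // playing the role MPI_Comm_rank(MPI_COMM_WORLD, ...) played before.
      return static_cast<int>(MLSL::Environment::GetEnv().GetProcessIdx());
    }

The unused mlsl_env/mlsl_dist pointers patch 18 declares are leftovers; patch 19
below deletes them and also corrects the "ngraph/mlsl.h" spelling of the header to
"ngraph/mlsl.hpp".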
++++++++++++++++--- src/ngraph_encapsulate_op.cc | 2 -- src/ngraph_rewrite_pass.cc | 7 ++----- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 73e9c0fd..84d94714 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,8 +227,6 @@ message(STATUS "NGRAPH_TARGET_ARCH: ${NGRAPH_TARGET_ARCH}") message(STATUS "NGRAPH_TUNE_ARCH: ${NGRAPH_TUNE_ARCH}") if(NGRAPH_DISTRIBUTED_ENABLE) - #get_target_property(MLSL_INCLUDE_DIR libmlsl INTERFACE_INCLUDE_DIRECTORIES) - #list(APPEND HEADER_SEARCH_DEFINES MLSL_HEADER_PATH="${MLSL_INCLUDE_DIR}") add_definitions(-DNGRAPH_DISTRIBUTED) endif() diff --git a/build_ngtf.py b/build_ngtf.py index dc0918af..67169f26 100755 --- a/build_ngtf.py +++ b/build_ngtf.py @@ -388,6 +388,11 @@ def main(): action="store_true" ) + parser.add_argument( + '--distributed_build', + help="Builds a distributed version of the nGraph components\n", + action="store_true") + arguments = parser.parse_args() if (arguments.debug_build): @@ -403,7 +408,7 @@ def main(): #------------------------------- # Component versions - ngraph_version = "v0.11.0" + ngraph_version = "master" #"v0.11.0" tf_version = "v1.12.0" # Default directories @@ -467,7 +472,6 @@ def main(): ngraph_cmake_flags = [ "-DNGRAPH_INSTALL_PREFIX=" + artifacts_location, - "-DNGRAPH_DISTRIBUTED_ENABLE=FALSE", "-DNGRAPH_USE_CXX_ABI=" + cxx_abi, "-DNGRAPH_UNIT_TEST_ENABLE=NO", "-DNGRAPH_DEX_ONLY=TRUE", @@ -485,6 +489,11 @@ def main(): if (arguments.debug_build): ngraph_cmake_flags.extend(["-DCMAKE_BUILD_TYPE=Debug"]) + if (arguments.distributed_build): + ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=TRUE"]) + else: + ngraph_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=FALSE"]) + build_ngraph("./ngraph", ngraph_cmake_flags, verbosity) # Next build CMAKE options for the bridge @@ -495,13 +504,17 @@ def main(): "-DNGRAPH_TUNE_ARCH=" + target_arch, "-DNGRAPH_ARTIFACTS_DIR=" + artifacts_location, "-DUNIT_TEST_ENABLE=ON", - "-DNGRAPH_DISTRIBUTED_ENABLE=TRUE", "-DTF_SRC_DIR=" + tf_src_dir, "-DUNIT_TEST_TF_CC_DIR=" + os.path.join( artifacts_location, "tensorflow") ] if (arguments.debug_build): ngraph_tf_cmake_flags.extend(["-DCMAKE_BUILD_TYPE=Debug"]) + if (arguments.distributed_build): + ngraph_tf_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=TRUE"]) + else: + ngraph_tf_cmake_flags.extend(["-DNGRAPH_DISTRIBUTED_ENABLE=FALSE"]) + # Now build the bridge ng_tf_whl = build_ngraph_tf(artifacts_location, "../", venv_dir, ngraph_tf_cmake_flags, verbosity) diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc index db54f45f..1f780973 100644 --- a/src/ngraph_encapsulate_op.cc +++ b/src/ngraph_encapsulate_op.cc @@ -270,8 +270,6 @@ class NGraphEncapsulateOp : public OpKernel { std::string file_name = "tf_function_" + ctx->op_kernel().name() + ".json"; #ifdef NGRAPH_DISTRIBUTED - MLSL::Environment* mlsl_env; - MLSL::Distribution* mlsl_dist; int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx(); file_name = "tf_function_" + ctx->op_kernel().name() + "_" + to_string(Rank_ID) + ".json"; diff --git a/src/ngraph_rewrite_pass.cc b/src/ngraph_rewrite_pass.cc index f494b3f6..a54312c0 100644 --- a/src/ngraph_rewrite_pass.cc +++ b/src/ngraph_rewrite_pass.cc @@ -30,7 +30,7 @@ #include #ifdef NGRAPH_DISTRIBUTED -#include "ngraph/mlsl.h" +#include "ngraph/mlsl.hpp" #endif using namespace std; @@ -107,8 +107,6 @@ class NGraphRewritePass : public GraphOptimizationPass { std::stringstream ss; ss << kind << "_" << std::setfill('0') << std::setw(4) << idx; #ifdef 
-    MLSL::Environment* mlsl_env;
-    MLSL::Distribution* mlsl_dist;
     int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
     ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
 #endif
@@ -120,8 +118,7 @@ class NGraphRewritePass : public GraphOptimizationPass {
     ss << GraphFilenamePrefix(kind, idx) << "_" << std::setfill('0')
       << std::setw(4) << sub_idx;
 #ifdef NGRAPH_DISTRIBUTED
-    int Rank_ID;
-    MPI_Comm_rank(MPI_COMM_WORLD, &Rank_ID);
+    int Rank_ID = MLSL::Environment::GetEnv().GetProcessIdx();
     ss << "_" << std::setfill('0') << std::setw(4) << Rank_ID;
 #endif
     return ss.str();
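
Note: with patch 19 applied, a distributed build is selected from the top-level
build script rather than hard-coded in the CMake files. Based on the flag added
above, the expected invocation is e.g. python build_ngtf.py --distributed_build;
omitting the flag passes -DNGRAPH_DISTRIBUTED_ENABLE=FALSE to both nGraph and the
bridge, restoring the default single-process build this series started from.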