NPUW: Deref #27799

Merged
18 changes: 18 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -434,6 +434,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,

// Finalize memory in closures and weight banks
finalize_weights_bank();
detach_memory();

// Print stats report when possible
{
@@ -499,6 +500,23 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
LOG_INFO("Done.");
}

void ov::npuw::CompiledModel::detach_memory() {
LOG_INFO("Detaching model & weight memory...");
LOG_BLOCK();
for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];
auto& proto_comp_model_desc = m_compiled_submodels[comp_model_desc.replaced_by.value_or(idx)];
if (!proto_comp_model_desc.model || !proto_comp_model_desc.compiled_model) {
continue; // optimized-out OR already cleared - skip
}
if (proto_comp_model_desc.device_it + 1 == m_dev_list.end()) {
LOG_INFO("No fallback expected - clear the OV model for Subgraph[" << idx << "]");
proto_comp_model_desc.model.reset();
}
}
LOG_INFO("Done");
}

std::string ov::npuw::CompiledModel::global_mem_device() const {
// Force globally set device if set
const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -78,6 +78,8 @@ class CompiledModel : public ov::ICompiledModel {
void implement_properties();

void finalize_weights_bank();
void detach_memory();

std::string global_mem_device() const;
std::string funcall_mem_device(const std::size_t idx) const;

115 changes: 74 additions & 41 deletions src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
@@ -19,23 +19,34 @@ namespace npuw {
namespace weights {
namespace op {
struct Const {
std::shared_ptr<ov::op::v0::Constant> node;

std::shared_ptr<ov::op::v0::Constant> m_node;
ov::element::Type m_cached_type;
ov::Shape m_cached_shape;
const void* m_cached_ptr = nullptr;

explicit Const(std::shared_ptr<ov::op::v0::Constant> n) : m_node(n) {
m_cached_type = m_node->get_element_type();
m_cached_shape = m_node->get_shape();
m_cached_ptr = m_node->get_data_ptr();
}
std::size_t hash() const {
std::size_t seed = std::hash<const void*>()(node->get_data_ptr()) + 0x9e3779b9;
seed ^= node->get_element_type().hash() + 0x9e3779b9;
for (const auto& dim : node->get_shape()) {
std::size_t seed = std::hash<const void*>()(m_cached_ptr) + 0x9e3779b9;
seed ^= m_cached_type.hash() + 0x9e3779b9;
for (const auto& dim : m_cached_shape) {
seed ^= std::hash<std::size_t>()(dim) + 0x9e3779b9;
}
return seed;
}
bool operator==(const Const& other) const {
return (node->get_shape() == other.node->get_shape() &&
node->get_element_type() == other.node->get_element_type() &&
node->get_data_ptr() == other.node->get_data_ptr());
return (m_cached_type == other.m_cached_type && m_cached_shape == other.m_cached_shape &&
m_cached_ptr == other.m_cached_ptr);
}
ov::Tensor eval() const {
return ov::npuw::util::tensor_from_const(node);
NPUW_ASSERT(m_node && "Const::eval() can only happen before detach");
return ov::npuw::util::tensor_from_const(m_node);
}
void detach() {
m_node.reset();
}
};
struct Concat {
@@ -59,6 +70,11 @@ struct Concat {
}
return ov::npuw::util::concat(to_concat, axis);
}
void detach() {
for (auto&& lt : tensors) {
lt.detach();
}
}
};

struct Unpack {
@@ -95,6 +111,11 @@ struct Unpack {
}
return dst;
}
void detach() {
w.detach();
z.detach();
s.detach();
}
};
struct Permute {
LazyTensor tensor;
@@ -113,6 +134,9 @@ struct Permute {
ov::Tensor eval() const {
return ov::npuw::util::permute(tensor.eval(), axes);
}
void detach() {
tensor.detach();
}
};
struct Convert {
LazyTensor tensor;
@@ -130,23 +154,26 @@ struct Convert {
NPUW_ASSERT(ov::element::f16 == type);
return ov::npuw::util::to_f16(tensor.eval());
}
void detach() {
tensor.detach();
}
};
} // namespace op

using Transform = std::variant<op::Const, op::Concat, op::Unpack, op::Permute, op::Convert>;

struct LazyTensorImpl {
public:
LazyTensorImpl() = default;
explicit LazyTensorImpl(Transform&& t);
bool operator==(const LazyTensorImpl& other) const;

ov::Tensor eval() const;

bool operator==(const LazyTensorImpl& other) const;
std::size_t get_hash() const;

void detach();

Transform m_transform;
std::size_t m_hash = 0;
const std::size_t m_hash = 0;
};

} // namespace weights
@@ -165,26 +192,12 @@ struct overloaded : Ts... {
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;

std::size_t LazyTensorImpl::get_hash() const {
// Already calculated
if (m_hash != 0) {
return m_hash;
}

// Get hash
std::size_t seed = 0;
std::visit(overloaded{[&seed](const auto& op) {
seed ^= op.hash();
}},
m_transform);

return seed;
}

LazyTensorImpl::LazyTensorImpl(Transform&& t) {
m_transform = std::move(t);
m_hash = get_hash();
}
LazyTensorImpl::LazyTensorImpl(Transform&& t)
: m_transform(std::move(t)),
m_hash(std::visit(overloaded{[](const auto& op) {
return op.hash();
}},
m_transform)) {}

bool LazyTensorImpl::operator==(const LazyTensorImpl& other) const {
return m_hash == other.m_hash && m_transform == other.m_transform;
@@ -200,17 +213,25 @@ ov::Tensor LazyTensorImpl::eval() const {
some kind of indicator that the only difference is concat and we should look for an existing ov::Tensor.
Perhaps it should be done after model compilation and not handled here.
*/
return std::visit(overloaded{[](const auto& op) {
return op.eval();
}},
m_transform);
}

std::size_t LazyTensorImpl::get_hash() const {
return m_hash;
}

ov::Tensor result = std::visit(overloaded{[](const auto& op) {
return op.eval();
}},
m_transform);
NPUW_ASSERT(result);
return result;
void LazyTensorImpl::detach() {
std::visit(overloaded{[](auto& op) {
op.detach();
}},
m_transform);
}

LazyTensor::LazyTensor(const std::shared_ptr<ov::op::v0::Constant>& const_ptr)
: m_impl(std::make_shared<LazyTensorImpl>(op::Const{const_ptr})) {}
: m_impl(std::make_shared<LazyTensorImpl>(op::Const(const_ptr))) {}
LazyTensor::LazyTensor(const std::vector<LazyTensor>& to_concat, const std::size_t axis)
: m_impl(std::make_shared<LazyTensorImpl>(op::Concat{to_concat, axis})) {}
LazyTensor::LazyTensor(const LazyTensor& cw,
@@ -233,11 +254,17 @@ LazyTensor LazyTensor::convert(const ov::element::Type& type) {
}

bool LazyTensor::operator==(const LazyTensor& other) const {
if (!m_impl && !other.m_impl) {
return true;
}
if ((!m_impl && other.m_impl) || (m_impl && !other.m_impl)) {
return false;
}
return *m_impl.get() == *other.m_impl.get();
}

bool LazyTensor::operator!=(const LazyTensor& other) const {
return !(*m_impl.get() == *other.m_impl.get());
return !(*this == other);
}

ov::Tensor LazyTensor::eval() const {
@@ -254,6 +281,12 @@ std::size_t LazyTensor::get_hash() const {
return m_impl->get_hash();
}

void LazyTensor::detach() {
if (m_impl) {
m_impl->detach();
}
}

std::size_t LazyTensor::Hash::operator()(const LazyTensor& lt) const {
return lt.get_hash();
}
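
The lazy_tensor.cpp changes above follow one pattern: whatever hash() and operator== need is captured when the transform is built, so the heavy source object can be released later via detach() without invalidating the lookup keys. Below is a minimal sketch of that idea only, using simplified stand-in types (Payload and CachedConst are illustrative names, not part of this PR):

#include <cstddef>
#include <functional>
#include <memory>
#include <vector>

struct Payload {  // stand-in for ov::op::v0::Constant
    std::vector<float> data;
};

struct CachedConst {
    std::shared_ptr<Payload> m_node;     // heavy object, dropped on detach()
    const void* m_cached_ptr = nullptr;  // identity captured up front
    std::size_t m_cached_size = 0;

    explicit CachedConst(std::shared_ptr<Payload> n) : m_node(std::move(n)) {
        m_cached_ptr = m_node->data.data();
        m_cached_size = m_node->data.size();
    }
    std::size_t hash() const {  // valid both before and after detach()
        return std::hash<const void*>()(m_cached_ptr) ^ (m_cached_size + 0x9e3779b9);
    }
    bool operator==(const CachedConst& other) const {
        return m_cached_ptr == other.m_cached_ptr && m_cached_size == other.m_cached_size;
    }
    void detach() {
        m_node.reset();  // free the payload; hashing and equality keep working
    }
};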
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -39,8 +39,8 @@ class LazyTensor {
bool operator!=(const LazyTensor& other) const;

ov::Tensor eval() const;

std::size_t get_hash() const;
void detach();

private:
std::shared_ptr<LazyTensorImpl> m_impl = nullptr;
@@ -23,7 +23,7 @@ using ov::npuw::online::detail::isOp;
Group::Group(const std::shared_ptr<ov::Node>& node,
size_t gid,
own::ade::NodeHandle nh,
const std::shared_ptr<own::ade::Graph>& g,
const std::weak_ptr<own::ade::Graph>& g,
const std::weak_ptr<Snapshot>& snapshot)
: m_nh(std::move(nh)),
m_id(gid),
@@ -36,7 +36,7 @@ Group::Group(const std::shared_ptr<ov::Node>& node,

Group::Group(size_t gid,
own::ade::NodeHandle nh,
const std::shared_ptr<own::ade::Graph>& g,
const std::weak_ptr<own::ade::Graph>& g,
const std::weak_ptr<Snapshot>& snapshot)
: m_nh(std::move(nh)),
m_id(gid),
@@ -214,23 +214,25 @@ void Group::relinkGraph(const Group::GPtr& gptr_other) {
auto consumers = gptr_other->dstNodes();

// Remove the gptr_other node from the graph. Note: this also removes all of its edges
m_graph->remove(gptr_other->getHandle());
auto&& graph = m_graph.lock();
NPUW_ASSERT(graph);
graph->remove(gptr_other->getHandle());
for (const auto& nh : producers) {
if (m_nh == nh) {
continue;
}
// relink the graph
if (!m_graph->linked(nh, m_nh)) {
m_graph->link(nh, m_nh);
if (!graph->linked(nh, m_nh)) {
graph->link(nh, m_nh);
}
}
for (const auto& nh : consumers) {
if (m_nh == nh) {
continue;
}
// relink the graph
if (!m_graph->linked(m_nh, nh)) {
m_graph->link(m_nh, nh);
if (!graph->linked(m_nh, nh)) {
graph->link(m_nh, nh);
}
}
}
@@ -33,11 +33,11 @@ class Group : public std::enable_shared_from_this<Group> {
Group(const std::shared_ptr<ov::Node>& node,
size_t gid,
own::ade::NodeHandle nh,
const std::shared_ptr<own::ade::Graph>& g,
const std::weak_ptr<own::ade::Graph>& g,
const std::weak_ptr<Snapshot>& snapshot);
Group(size_t gid,
own::ade::NodeHandle nh,
const std::shared_ptr<own::ade::Graph>& g,
const std::weak_ptr<own::ade::Graph>& g,
const std::weak_ptr<Snapshot>& snapshot);

// After we formed a final structure of partitioning,
@@ -100,7 +100,7 @@ class Group : public std::enable_shared_from_this<Group> {

own::ade::NodeHandle m_nh;
size_t m_id; // used for utility prints only
std::shared_ptr<own::ade::Graph> m_graph;
std::weak_ptr<own::ade::Graph> m_graph;
std::weak_ptr<Snapshot> m_snapshot;
bool m_frozen = false;
bool m_nofold = false;
8 changes: 8 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/util.hpp
@@ -127,6 +127,14 @@ Impl<M> _(std::shared_ptr<M> pM) {

} // namespace at

// Defined here as a drop-in replacement for ov::parallel_for, for debugging purposes
template <typename F>
void non_parallel_for(std::size_t count, F&& f) {
for (std::size_t idx = 0u; idx < count; idx++) {
f(idx);
}
}

} // namespace util
} // namespace npuw
} // namespace ov
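
A quick usage sketch for the new helper, assuming a hypothetical loop body (outputs and fill_output are made-up names for illustration). Serializing the loop this way keeps the same call shape as ov::parallel_for, so it can be swapped in while debugging and swapped back afterwards:

// Parallel version:
//   ov::parallel_for(outputs.size(), [&](std::size_t i) { fill_output(i); });
// Serialized drop-in for debugging, identical at the call site:
ov::npuw::util::non_parallel_for(outputs.size(), [&](std::size_t i) {
    fill_output(i);  // hypothetical per-index work item
});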