diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index b3b474c3b989ed..f1c51477602e32 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -871,6 +871,14 @@ inline int32_t convert_value(uint32_t val) { return static_cast(val); } +template <> +inline int64_t convert_value(uint64_t val) { + if (val > static_cast(std::numeric_limits::max())) { + return std::numeric_limits::max(); + } + return static_cast(val); +} + namespace { template std::shared_ptr change_constant_precision(std::shared_ptr& constant) { @@ -1110,7 +1118,9 @@ bool fuse_type_to_constant(const std::shared_ptr& node, const auto& to = it->second; if (auto constant = ov::as_type_ptr(node)) { std::shared_ptr new_const; - if (from == ov::element::u64 && to == ov::element::i32) { + if (from == ov::element::u64 && to == ov::element::i64) { + new_const = change_constant_precision(constant); + } else if (from == ov::element::u64 && to == ov::element::i32) { new_const = change_constant_precision(constant); } else if (from == ov::element::i64 && to == ov::element::i32) { new_const = change_constant_precision(constant); diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp index eeac793acc7dcc..4af3d785d0fd41 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp @@ -110,6 +110,11 @@ INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(ENABLE); INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(IGNORE_CALLBACK); INFERENCE_ENGINE_1_0_DEPRECATED DECLARE_CONFIG_VALUE(DISABLE); +/** + * @brief Enables inference with INT64 data type in CPU plugin if it's presented in the original model. + */ +DECLARE_CONFIG_KEY(CPU_NATIVE_I64); + } // namespace PluginConfigInternalParams } // namespace InferenceEngine diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 4351a3876bd913..28673c69d5bfa6 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -230,6 +230,15 @@ void Config::readProperties(const std::map &prop) { IE_THROW() << "Wrong value for property key " << ov::hint::execution_mode.name() << ". Supported values: PERFORMANCE, ACCURACY"; } + } else if (key == PluginConfigInternalParams::KEY_CPU_NATIVE_I64) { + if (val == PluginConfigParams::YES) { + enableNativeI64 = true; + } else if (val == PluginConfigParams::NO) { + enableNativeI64 = false; + } else { + IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_CPU_NATIVE_I64 << ": " << val + << ". 
Expected only YES or NO values."; + } } else { IE_THROW(NotFound) << "Unsupported property " << key << " by CPU plugin"; } @@ -308,4 +317,4 @@ void Config::updateProperties() { } } // namespace intel_cpu -} // namespace ov +} // namespace ov diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 65237c52e20138..fc9b8c63aba3ad 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -57,6 +57,7 @@ struct Config { // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives size_t rtCacheCapacity = 0ul; #endif + bool enableNativeI64 = false; InferenceEngine::IStreamsExecutor::Config streamExecutorConfig; InferenceEngine::PerfHintsConfig perfHintsConfig; bool enableCpuPinning = true; diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 1cef0551d1eb08..0146c0cfa7b9af 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -4,45 +4,43 @@ #include "dnnl_extension_utils.h" -#include "utils/general_utils.h" #include #include "memory_desc/dnnl_blocked_memory_desc.h" -#include "onednn/iml_type_mapper.h" -#include #include -#include - using namespace dnnl; namespace ov { namespace intel_cpu { -uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) { +uint8_t DnnlExtensionUtils::sizeOfDataType(memory::data_type dataType) { switch (dataType) { - case dnnl::memory::data_type::f32: - return 4; - case dnnl::memory::data_type::s32: + case memory::data_type::f64: + case memory::data_type::s64: + return 8; + case memory::data_type::f32: + case memory::data_type::s32: return 4; - case dnnl::memory::data_type::bf16: + case memory::data_type::bf16: + case memory::data_type::f16: return 2; - case dnnl::memory::data_type::s8: - return 1; - case dnnl::memory::data_type::u8: + case memory::data_type::s8: + case memory::data_type::u8: + case memory::data_type::bin: return 1; - case dnnl::memory::data_type::bin: - return 1; - case dnnl::memory::data_type::f16: - return 2; - case dnnl::memory::data_type::undef: + case memory::data_type::undef: return 0; default: - IE_THROW() << "Unsupported data type."; + IE_THROW() << "Unsupported data type: " << DataTypeToIEPrecision(dataType); } } memory::data_type DnnlExtensionUtils::IEPrecisionToDataType(const InferenceEngine::Precision& prec) { switch (prec) { + case InferenceEngine::Precision::FP64: + return memory::data_type::f64; + case InferenceEngine::Precision::I64: + return memory::data_type::s64; case InferenceEngine::Precision::FP32: return memory::data_type::f32; case InferenceEngine::Precision::I32: @@ -68,6 +66,10 @@ memory::data_type DnnlExtensionUtils::IEPrecisionToDataType(const InferenceEngin InferenceEngine::Precision DnnlExtensionUtils::DataTypeToIEPrecision(memory::data_type dataType) { switch (dataType) { + case memory::data_type::f64: + return InferenceEngine::Precision::FP64; + case memory::data_type::s64: + return InferenceEngine::Precision::I64; case memory::data_type::f32: return InferenceEngine::Precision::FP32; case memory::data_type::s32: @@ -90,11 +92,11 @@ InferenceEngine::Precision DnnlExtensionUtils::DataTypeToIEPrecision(memory::dat } } -Dim DnnlExtensionUtils::convertToDim(const dnnl::memory::dim &dim) { +Dim DnnlExtensionUtils::convertToDim(const memory::dim &dim) { return dim == DNNL_RUNTIME_DIM_VAL ? 
Shape::UNDEFINED_DIM : static_cast(dim); } -dnnl::memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim &dim) { - return dim == Shape::UNDEFINED_DIM ? DNNL_RUNTIME_DIM_VAL : static_cast(dim); +memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim &dim) { + return dim == Shape::UNDEFINED_DIM ? DNNL_RUNTIME_DIM_VAL : static_cast(dim); } VectorDims DnnlExtensionUtils::convertToVectorDims(const memory::dims& dims) { @@ -133,19 +135,19 @@ memory::format_tag DnnlExtensionUtils::GetPlainFormatByRank(size_t rank) { } } -DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc &desc) { +DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const memory::desc &desc) { return makeDescriptor(desc.get()); } DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const_dnnl_memory_desc_t desc) { - if (desc->format_kind == dnnl::impl::format_kind_t::dnnl_blocked) { + if (desc->format_kind == impl::format_kind_t::dnnl_blocked) { return std::shared_ptr(new DnnlBlockedMemoryDesc(desc)); } else { return std::shared_ptr(new DnnlMemoryDesc(desc)); } } -size_t DnnlExtensionUtils::getMemSizeForDnnlDesc(const dnnl::memory::desc& desc) { +size_t DnnlExtensionUtils::getMemSizeForDnnlDesc(const memory::desc& desc) { auto tmpDesc = desc; const auto offset0 = tmpDesc.get()->offset0; @@ -167,8 +169,8 @@ std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(con } } -DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx) { - auto query = dnnl::convert_to_c(what); +DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, const query& what, int idx) { + auto query = convert_to_c(what); const auto* cdesc = dnnl_primitive_desc_query_md(pd, query, idx); if (!cdesc) diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 81c1440ece418a..cb1662fac258ac 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -45,7 +45,6 @@ #include "memory_desc/cpu_memory_desc_utils.h" #include -#include #include #include #include @@ -306,7 +305,7 @@ void Graph::Replicate(const CNNNetwork &network) { // change precision for input/output nodes to avoid extra data conversion when set input/output blobs // also we need to change input/output precisions for consumers/producers to avoid inserting reorder for (auto &input : inputNodesMap) { - const auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision()); + auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision(), getConfig().enableNativeI64); input.second->setOriginalOutputPrecisionAtPort(0, precToSet); const auto childEdges = input.second->getChildEdgesAtPort(0); for (size_t i = 0; i < childEdges.size(); i++) { @@ -320,7 +319,7 @@ void Graph::Replicate(const CNNNetwork &network) { } for (auto &output : outputNodesMap) { - const auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision()); + auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision(), getConfig().enableNativeI64); output.second->setOriginalInputPrecisionAtPort(0, precToSet); const auto parentEdges = output.second->getParentEdgesAtPort(0); for (size_t i = 0; i < parentEdges.size(); i++) { @@ -1004,7 +1003,7 @@ void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob:: // todo: make sure 'name' exists in this map... 
if (_normalizePreprocMap.find(name) != _normalizePreprocMap.end()) { - if (inTensorDesc.getPrecision() == InferenceEngine::Precision::FP32) { + if (inTensorDesc.getPrecision() == Precision::FP32) { _normalizePreprocMap[name].NormalizeImage(outDims, reinterpret_cast(inter_data_ptr), inTensorDesc.getLayout()); } else { @@ -1460,16 +1459,16 @@ void Graph::SortTopologically() { } } -void Graph::GetPerfData(std::map &perfMap) const { +void Graph::GetPerfData(std::map &perfMap) const { unsigned i = 0; - std::function &, const NodePtr&)> - getPerfMapFor = [&](std::map &perfMap, const NodePtr& node) { - InferenceEngine::InferenceEngineProfileInfo &pc = perfMap[node->getName()]; + std::function &, const NodePtr&)> + getPerfMapFor = [&](std::map &perfMap, const NodePtr& node) { + InferenceEngineProfileInfo &pc = perfMap[node->getName()]; pc.execution_index = i++; // TODO: Why time counter is signed? pc.cpu_uSec = pc.realTime_uSec = (long long) node->PerfCounter().avg(); - pc.status = pc.cpu_uSec > 0 ? InferenceEngine::InferenceEngineProfileInfo::EXECUTED - : InferenceEngine::InferenceEngineProfileInfo::NOT_RUN; + pc.status = pc.cpu_uSec > 0 ? InferenceEngineProfileInfo::EXECUTED + : InferenceEngineProfileInfo::NOT_RUN; std::string pdType = node->getPrimitiveDescriptorType(); size_t typeLen = sizeof(pc.exec_type) / sizeof(pc.exec_type[0]); pdType.copy(pc.exec_type, typeLen, 0); diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index f4e8c3944afb27..2c542b11992019 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -19,7 +19,6 @@ #include "nodes/mvn.h" #include "nodes/transpose.h" #include "nodes/interpolate.h" -#include "nodes/reduce.h" #include "nodes/input.h" #include "nodes/rnn.h" #include "nodes/common/cpu_convert.h" diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index f2aacbd3db42b8..7729ff5b1c2cbc 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -367,7 +367,7 @@ InferRequestBase::normToInputSupportedPrec(const std::pairhasMeanImageFor(input.first) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) { inPrec = InferenceEngine::Precision::FP32; } else { - inPrec = normalizeToSupportedPrecision(inPrec); + inPrec = normalizeToSupportedPrecision(inPrec, graph->getConfig().enableNativeI64); } if (inPrec == InferenceEngine::Precision::UNSPECIFIED) { @@ -583,7 +583,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name) auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory()); if (!data) { InferenceEngine::TensorDesc desc = _networkOutputs[name]->getTensorDesc(); - desc.setPrecision(normalizeToSupportedPrecision(desc.getPrecision())); + desc.setPrecision(normalizeToSupportedPrecision(desc.getPrecision(), graph->getConfig().enableNativeI64)); // WA: need to avoid exception thrown when we compare blocking desc in SetBlob // in situation if we push output blobs as inputs for next network (in Hetero plugin) diff --git a/src/plugins/intel_cpu/src/nodes/eye.cpp b/src/plugins/intel_cpu/src/nodes/eye.cpp index 747e89bdc1ed11..46f3fd28f7de26 100644 --- a/src/plugins/intel_cpu/src/nodes/eye.cpp +++ b/src/plugins/intel_cpu/src/nodes/eye.cpp @@ -55,10 +55,6 @@ Eye::Eye(const std::shared_ptr& op, const GraphContext::CPtr context) } outType = 
op->get_output_element_type(0); withBatchShape = (op->get_input_size() == 4); - if (!one_of(outType, ngraph::element::f32, ngraph::element::bf16, - ngraph::element::i32, ngraph::element::i8, ngraph::element::u8)) { - THROW_ERROR << errorPrefix << "doesn't support demanded output precision"; - } } void Eye::getSupportedDescriptors() { diff --git a/src/plugins/intel_cpu/src/nodes/non_zero.cpp b/src/plugins/intel_cpu/src/nodes/non_zero.cpp index cbb0b134211359..1f9a200fc35e2e 100644 --- a/src/plugins/intel_cpu/src/nodes/non_zero.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_zero.cpp @@ -40,9 +40,6 @@ NonZero::NonZero(const std::shared_ptr& op, const GraphContext::CP } else { IE_THROW(NotImplemented) << errorMessage; } - if (op->get_output_element_type(0) != ngraph::element::i32) { - IE_THROW() << errorPrefix << "doesn't support demanded output precision"; - } } void NonZero::getSupportedDescriptors() { diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 6bab3c95e2c841..c70628b4f7a15f 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -9,7 +9,6 @@ #include "transformations/transformation_pipeline.h" #include "itt.h" -#include "extension_mngr.h" #include "extension.h" #include "serialize.h" #include "threading/ie_executor_manager.hpp" @@ -21,11 +20,9 @@ #include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" -#include #include #include "performance_heuristics.hpp" -#include "openvino/runtime/properties.hpp" #include "weights_cache.hpp" #include "utils/denormals.hpp" @@ -36,7 +33,6 @@ #endif #include -#include #if defined(OV_CPU_WITH_ACL) #include "nodes/executors/acl/acl_ie_scheduler.hpp" @@ -164,7 +160,7 @@ static bool streamsSet(const std::map& config) { config.count(ov::num_streams.name()); } -void Engine::ApplyPerformanceHints(std::map &config, const std::shared_ptr& ngraphFunc) const { +void Engine::ApplyPerformanceHints(std::map &config, const std::shared_ptr& ngraphFunc) const { auto getNumStreamsLatency = [&]() { return std::pair(CONFIG_VALUE(CPU_THROUGHPUT_NUMA), ov::util::to_string(ov::streams::NUMA)); }; @@ -281,7 +277,7 @@ void Engine::ApplyPerformanceHints(std::map &config, c } } -void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr& ngraphFunc) { +void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr& ngraphFunc) { const auto perf_hint_name = config.perfHintsConfig.ovPerfHint; const int latency_streams = get_default_latency_streams(config.latencyThreadingMode); int streams; @@ -462,6 +458,19 @@ static Config::SnippetsMode getSnippetsMode(const std::map& modelConfig, Config& engineConfig) { + engineConfig.enableNativeI64 = false; + const auto i64prop = modelConfig.find(InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64); + if (i64prop != modelConfig.end()) { + if (i64prop->second == PluginConfigParams::YES) { + engineConfig.enableNativeI64 = true; + } else if (i64prop->second != PluginConfigParams::NO) { + IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_CPU_NATIVE_I64 << ": " << i64prop->second + << ". 
Expected only YES or NO values."; + } + } +} + InferenceEngine::IExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std::map &orig_config) { OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Engine::LoadExeNetworkImpl"); @@ -495,6 +504,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std const bool enableLPT = shouldEnableLPT(config, engConfig); ov::element::Type inferencePrecision = getInferencePrecision(config, engConfig); const Config::SnippetsMode snippetsMode = getSnippetsMode(config, engConfig); + setI64Mode(config, engConfig); auto nGraphFunc = clonedNetwork.getFunction(); @@ -770,6 +780,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma const bool enableLPT = (lptProp != config.end() && lptProp->second == PluginConfigParams::YES) /* enabled in the orig_config*/ || Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled */; const Config::SnippetsMode snippetsMode = getSnippetsMode(config, conf); + setI64Mode(config, conf); auto model = network.getFunction(); if (model == nullptr) { @@ -785,7 +796,7 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma transformation.UpToCpuSpecificOpSet(); transformation.CpuSpecificOpSet(); }, - [&](const std::shared_ptr& op) { + [&](const std::shared_ptr& op) { std::unique_ptr ptr; try { ptr.reset(Node::factory().create(op, context)); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index b2cd223db3e7b2..9b5dfa94be7e41 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -2,34 +2,32 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include "ngraph/op/fake_quantize.hpp" -#include "ngraph/pass/manager.hpp" +#pragma once + #include "common/pass/reshape_fc_fusion.hpp" #include "common/pass/align_matmul_input_ranks.hpp" -#include "transformations/common_optimizations/reshape_prelu.hpp" #include "common/pass/convert_broadcast_to_tiles.hpp" #include "common/pass/convert_tile_to_seq_tiles.hpp" #include "common/pass/convert_matmul_to_fc.hpp" #include "common/pass/convert_to_power_static.hpp" #include "common/pass/convert_to_leaky_relu.hpp" #include "common/pass/convert_to_swish_cpu.hpp" -#include "transformations/convert_precision.hpp" -#include "transformations/utils/utils.hpp" #include "common/pass/rnn_sequences_optimization.hpp" -#include "transformations/common_optimizations/reshape_sequence_fusion.hpp" #include "common/pass/ngram_fusion.hpp" -#include "transformations/defs.hpp" +#include +#include "openvino/pass/manager.hpp" +#include "transformations/common_optimizations/reshape_sequence_fusion.hpp" +#include "transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp" #include "itt.hpp" namespace ov { namespace intel_cpu { -inline void ConvertToCPUSpecificOpset(std::shared_ptr &nGraphFunc) { +inline void ConvertToCPUSpecificOpset(std::shared_ptr &model, bool enable_i64) { RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset); - ngraph::pass::Manager manager; + ov::pass::Manager manager; manager.set_per_pass_validation(false); CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC); CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); @@ -38,17 +36,21 @@ inline void 
ConvertToCPUSpecificOpset(std::shared_ptr &nGraphF CPU_REGISTER_PASS_COMMON(manager, ConvertToLeakyRelu); CPU_REGISTER_PASS_COMMON(manager, ConvertToSwishCPU); CPU_REGISTER_PASS_COMMON(manager, OptimizeSequenceTransposes); - if (!ov::op::util::has_op_with_type(nGraphFunc)) { + if (!ov::op::util::has_op_with_type(model)) { CPU_REGISTER_PASS_COMMON(manager, ReshapeFullyConnectedFusion); } // after transformation "MoveEltwiseUpThroughDataMov" there can be reshaped sequences that should be eliminated or fused CPU_REGISTER_PASS_COMMON(manager, ov::pass::ReshapeSequenceFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConstantFolding); - CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertPrecision, precisions_map {{ ngraph::element::i64, ngraph::element::i32 }}); + if (!enable_i64) { + CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertPrecision, precisions_map{{element::i64, element::i32}}); + } else { + CPU_REGISTER_PASS_X64(manager, ConvertPrecisionI64ToI32); + } CPU_REGISTER_PASS_COMMON(manager, NgramFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); - manager.run_passes(nGraphFunc); + manager.run_passes(model); } } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.cpp new file mode 100644 index 00000000000000..358d43b56dea20 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + + +#include "convert_precision_i64_i32.hpp" +#include +#include "transformations/utils/utils.hpp" +#include "cpu_types.h" + +#include + +namespace ov { +namespace intel_cpu { + +// Returns list of operations that support i64. 
+bool ConvertPrecisionI64ToI32::isNativelySupported(const ov::Node::type_info_t& type) const {
+    static const std::unordered_set<ov::Node::type_info_t> i64Ops = {
+        opset12::Parameter::get_type_info_static(),
+        opset12::Result::get_type_info_static()
+    };
+
+    return i64Ops.find(type) != i64Ops.end();
+}
+
+std::shared_ptr<ov::Node> ConvertPrecisionI64ToI32::changeConstantPrecision(std::shared_ptr<opset12::Constant>& constant) const {
+    const auto* srcData = constant->get_data_ptr<int64_t>();
+    const auto size = shape_size(constant->get_shape());
+
+    auto newConstant = std::make_shared<opset12::Constant>(element::i32, constant->get_shape());
+    newConstant->output(0).set_names(constant->output(0).get_names());
+    auto* dstData = const_cast<int32_t*>(reinterpret_cast<const int32_t*>(newConstant->get_data_ptr()));
+    OPENVINO_ASSERT(dstData != nullptr, "Can't get destination data pointer");
+
+    for (size_t i = 0; i < size; ++i) {
+        if (srcData[i] >= std::numeric_limits<int32_t>::max()) {
+            dstData[i] = std::numeric_limits<int32_t>::max();
+        } else if (srcData[i] <= std::numeric_limits<int32_t>::lowest()) {
+            dstData[i] = std::numeric_limits<int32_t>::lowest();
+        } else {
+            dstData[i] = static_cast<int32_t>(srcData[i]);
+        }
+    }
+    return newConstant;
+}
+
+bool ConvertPrecisionI64ToI32::run_on_model(const std::shared_ptr<ov::Model>& model) {
+    const auto orderedOps = model->get_ordered_ops();
+    for (const auto& op : orderedOps) {
+        if (isNativelySupported(op->get_type_info()) || TypeFromName(op->get_type_name()) == Type::Unknown) {
+            continue;
+        }
+
+        bool convertForOutputsRequired = false;
+        for (const auto& input : op->inputs()) {
+            if (input.get_element_type() == element::i64) {
+                auto parentOutput = input.get_source_output();
+                auto parentNode = parentOutput.get_node_shared_ptr();
+                if (is_type<opset12::Convert>(parentNode) &&
+                        parentNode->get_input_element_type(0) == element::i32 &&
+                        parentNode->get_output_element_type(0) == element::i64) {
+                    input.replace_source_output(parentNode->input_value(0));
+                } else if (is_type<opset12::Convert>(op) &&
+                        op->get_input_element_type(0) == element::i64 &&
+                        op->get_output_element_type(0) == element::i32) {
+                    continue;
+                } else if (auto constOp = as_type_ptr<opset12::Constant>(parentNode)) {
+                    auto newConst = changeConstantPrecision(constOp);
+                    input.replace_source_output(newConst);
+                    newConst->set_friendly_name(constOp->get_friendly_name());
+                } else {
+                    auto convert = std::make_shared<opset12::Convert>(input.get_source_output(), element::i32);
+                    convert->output(0).add_names(parentOutput.get_names());
+                    input.replace_source_output(convert);
+                }
+                convertForOutputsRequired = true;
+            }
+        }
+
+        if (convertForOutputsRequired) {
+            // Propagate i32 precision into outputs.
+            op->validate_and_infer_types();
+            for (auto& output : op->outputs()) {
+                if (output.get_element_type() == element::i32) {
+                    auto convert = std::make_shared<opset12::Convert>(output, element::i64);
+                    replace_output_update_name(output, convert->output(0));
+                }
+            }
+        }
+
+        if (auto multisubgraph_op = as_type_ptr<op::util::MultiSubGraphOp>(op)) {
+            for (size_t idx = 0; idx < multisubgraph_op->get_internal_subgraphs_size(); ++idx) {
+                run_on_model(multisubgraph_op->get_function(static_cast<int>(idx)));
+            }
+        }
+    }
+
+    return true;
+}
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp
new file mode 100644
index 00000000000000..15f1a18f480bf3
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/convert_precision_i64_i32.hpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/pass.hpp"
+
+// This transformation inserts a Convert node (i64->i32) before a node that does not support i64 execution.
+// If the Convert i64->i32 was added before the target node, it also inserts a Convert i32->i64 after
+// the target node to leave the child nodes with i64 type.
+
+namespace ov {
+namespace intel_cpu {
+class ConvertPrecisionI64ToI32: public ov::pass::ModelPass {
+public:
+    OPENVINO_RTTI("ConvertPrecisionI64ToI32", "0");
+
+    ConvertPrecisionI64ToI32() = default;
+
+    bool isNativelySupported(const ov::Node::type_info_t& type) const;
+
+    std::shared_ptr<ov::Node> changeConstantPrecision(std::shared_ptr<opset12::Constant>& constant) const;
+
+    bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
+};
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 31b39dc9acf809..d29b4f0cc61085 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2022 Intel Corporation
+// Copyright (C) 2022-2023 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -25,6 +25,7 @@
 #include "transformations/common_optimizations/fq_mul_fusion.hpp"
 #include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp"
 #include "transformations/common_optimizations/nop_elimination.hpp"
+#include "transformations/common_optimizations/reshape_prelu.hpp"
 #include "transformations/common_optimizations/transpose_sinking.hpp"
 #include "transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp"
 #include "transformations/common_optimizations/augru_cell_fusion.hpp"
@@ -53,8 +54,6 @@
 #include "transformations/op_conversions/convert_slice_to_strided_slice.hpp"
 #include "transformations/op_conversions/convert_space_to_batch.hpp"
 #include "transformations/op_conversions/convert_space_to_depth.hpp"
-#include "transformations/op_conversions/convert_subtract.hpp"
-#include "transformations/op_conversions/convert_ti_to_sequences.hpp"
 #include "transformations/op_conversions/detection_output_downgrade.hpp"
 #include "transformations/op_conversions/detection_output_upgrade.hpp"
 #include "transformations/op_conversions/eye_decomposition.hpp"
@@ -98,11 +97,6 @@
 #include "transformations/snippets/x64/pass/snippets_mark_skipped.hpp"
 #include "transformations/cpu_opset/x64/pass/mha_fusion.hpp"
#include "transformations/cpu_opset/x64/pass/convert_to_interaction.hpp" -#include "transformations/cpu_opset/arm/pass/convert_group_conv.hpp" -#include "transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp" -#include "transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp" -#include "transformations/cpu_opset/arm/pass/mish_decomposition.hpp" -#include "transformations/cpu_opset/common/pass/decompose_integer_divide.hpp" #include "transformations/cpu_opset/common/pass/convert_fq_rnn_to_quantized_rnn.hpp" #include "transformations/cpu_opset/common/pass/insert_convert_after_extension.hpp" #include "transformations/cpu_opset/common/pass/move_eltwise_up_data_movement.hpp" @@ -124,12 +118,22 @@ #include "dnnl.hpp" #include +#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) + +#include "transformations/cpu_opset/arm/pass/convert_group_conv.hpp" +#include "transformations/cpu_opset/arm/pass/convert_group_conv1d.hpp" +#include "transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp" +#include "transformations/cpu_opset/arm/pass/mish_decomposition.hpp" +#include "transformations/cpu_opset/common/pass/decompose_integer_divide.hpp" + +#endif // OPENVINO_ARCH_ARM || OPENVINO_ARCH_ARM64 + namespace ov { namespace intel_cpu { using const_node_ptr = const std::shared_ptr; -bool Transformations::fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions) { +bool Transformations::fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions) { const auto& from = node->get_output_element_type(0); auto it = precisions.find(from); if (it == precisions.end()) @@ -141,7 +145,7 @@ bool Transformations::fuse_type_to_convert(const std::shared_ptr& // is converted to be 1 for boolean, but 0 for u8. Thus an Abs and Ceil node should be added before the // Convert node for this scenario. 
if (convert->input(0).get_element_type().is_real() && - convert->get_convert_element_type() == ngraph::element::boolean && to.is_integral_number()) { + convert->get_convert_element_type() == ov::element::boolean && to.is_integral_number()) { auto abs = std::make_shared(convert->input_value(0).get_node_shared_ptr()); auto ceil = std::make_shared(abs); auto new_convert = std::make_shared(ceil, to); @@ -193,7 +197,7 @@ void Transformations::UpToCpuSpecificOpSet() { void Transformations::CpuSpecificOpSet(void) { CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Specific); - ConvertToCPUSpecificOpset(model); + ConvertToCPUSpecificOpset(model, config.enableNativeI64); } void Transformations::PreLpt(const std::vector& defaultPrecisions, const bool isLegacyApi) { @@ -209,11 +213,10 @@ void Transformations::PreLpt(const std::vector& defaultPrecis if (useLpt) { CPU_REGISTER_PASS_COMMON(manager, ov::pass::MarkDequantizationSubgraph, defaultPrecisions); } + const bool enable_i64 = config.enableNativeI64; - auto get_convert_precisions = []() { + auto get_convert_precisions = [&enable_i64]() { precisions_map map = { - {ov::element::i64, ov::element::i32}, - {ov::element::u64, ov::element::i32}, {ov::element::i16, ov::element::i32}, {ov::element::u16, ov::element::i32}, {ov::element::u32, ov::element::i32}, @@ -224,12 +227,21 @@ void Transformations::PreLpt(const std::vector& defaultPrecis {ov::element::u4, ov::element::u8} }; - if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) + if (enable_i64) { + map.insert({ov::element::u64, ov::element::i64}); + } else { + map.insert({ov::element::u64, ov::element::i32}); + map.insert({ov::element::i64, ov::element::i32}); + } + + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) { map.insert({ov::element::bf16, ov::element::f32}); + } return map; }; - static const auto precisions = get_convert_precisions(); + + const auto precisions = get_convert_precisions(); type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}}; CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion); @@ -264,8 +276,13 @@ void Transformations::PreLpt(const std::vector& defaultPrecis // Common ConvertPrecision pass handles only a limited set of opevino operations to match the list of precisions supported by the plugin. // However, if the extension operation produces an output precision that is not natively supported, this may lead to inconsistency during // element type propagation. This transformation is called before the ConvertPrecision pass to align the actual precisions with the list of supported ones. 
- CPU_REGISTER_PASS_COMMON(manager, ov::pass::InsertConvertAfterExtension); + if (!enable_i64) { + CPU_REGISTER_PASS_COMMON(manager, ov::pass::InsertConvertAfterExtension); + } CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertPrecision, precisions, type_to_fuse); + if (enable_i64) { + CPU_REGISTER_PASS_X64(manager, ConvertPrecisionI64ToI32); + } CPU_REGISTER_PASS_COMMON(manager, ov::pass::EliminateConvert); CPU_REGISTER_PASS_COMMON(manager, SwapConvertTranspose); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h index 57ad2e95e122af..290011951aa264 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.h @@ -62,7 +62,7 @@ class Transformations { void Snippets(void); - static bool fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions); + static bool fuse_type_to_convert(const std::shared_ptr& node, const precisions_map& precisions); }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/utils/blob_dump.cpp b/src/plugins/intel_cpu/src/utils/blob_dump.cpp index dce76d115d0908..af4b32babce63e 100644 --- a/src/plugins/intel_cpu/src/utils/blob_dump.cpp +++ b/src/plugins/intel_cpu/src/utils/blob_dump.cpp @@ -166,6 +166,12 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) const { const void *ptr = memory->getData(); switch (desc.getPrecision()) { + case Precision::FP64 : { + auto *blob_ptr = reinterpret_cast(ptr); + for (size_t i = 0; i < data_size; i++) + stream << blob_ptr[desc.getElementOffset(i)] << std::endl; + break; + } case Precision::FP32 : { auto *blob_ptr = reinterpret_cast(ptr); for (size_t i = 0; i < data_size; i++) @@ -180,6 +186,12 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) const { } break; } + case Precision::I64: { + auto *blob_ptr = reinterpret_cast(ptr); + for (size_t i = 0; i < data_size; i++) + stream << blob_ptr[desc.getElementOffset(i)] << std::endl; + break; + } case Precision::I32: { auto *blob_ptr = reinterpret_cast(ptr); for (size_t i = 0; i < data_size; i++) diff --git a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp index 870b081ba277cb..7aaed391c2a7e6 100644 --- a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp +++ b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp @@ -96,7 +96,11 @@ inline bool isEmptyTensorDesc(const InferenceEngine::TensorDesc &td) { * precision for convert * @return plug-in supported precision or UNSPECIFIED if precision unsupported */ -inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine::Precision precision) { +inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine::Precision precision, bool enable_i64 = false) { + if (enable_i64 && one_of(precision, InferenceEngine::Precision::I64, InferenceEngine::Precision::U64)) { + return InferenceEngine::Precision::I64; + } + switch (precision) { case InferenceEngine::Precision::U8: case InferenceEngine::Precision::I8: diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/custom_op_insert_convert_i64.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/custom_op_insert_convert_i64.cpp index afde4960bc3bc0..333adbef08fbfc 100644 --- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/custom_op_insert_convert_i64.cpp +++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/custom_op_insert_convert_i64.cpp @@ -134,8 +134,7 @@ 
class CustomOpConvertI64CPUTest : public testing::WithParamInterface +#include +#include "test_utils/cpu_test_utils.hpp" +#include + +using namespace ov::test; +using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { +using InsertConvertI64I32CPUTestParams = std::tuple; + +class InsertConvertI64I32CPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + ElementType inType; + InputShape inputShape; + std::tie(inType, inputShape) = obj.param; + + std::ostringstream result; + result << "IS=" << inputShape << "_"; + result << "Prc=" << inType; + return result.str(); + } + +protected: + void SetUp() override { + targetDevice = CommonTestUtils::DEVICE_CPU; + configuration[InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64] = InferenceEngine::PluginConfigParams::YES; + + InputShape inputShape; + std::tie(inType, inputShape) = this->GetParam(); + + init_input_shapes({inputShape}); + auto inputParams = ngraph::builder::makeDynamicParams(inType, inputDynamicShapes); + auto nonZero = std::make_shared(inputParams[0]); + + ov::ResultVector results{std::make_shared(nonZero)}; + function = std::make_shared(results, inputParams, "insertConvertI64I32"); + } +}; + +TEST_P(InsertConvertI64I32CPUTest, CompareWithRefs) { + run(); + CheckNumberOfNodesWithType(compiledModel, "Convert", 2); +} + +const InputShape inputShapes = { + {}, {{1, 3, 32, 32}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_CustomOp, + InsertConvertI64I32CPUTest, + ::testing::Combine(::testing::Values(ElementType::i64), ::testing::Values(inputShapes)), + InsertConvertI64I32CPUTest::getTestCaseName); + +} // namespace CPULayerTestsDefinitions diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index 5bf389f616d0b1..5ebeba259e06d2 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit 5bf389f616d0b1551ce468f2712df492a7aab140 +Subproject commit 5ebeba259e06d28b74ad4114074cc3ee1c53fa0a
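
Usage note (not part of the patch): KEY_CPU_NATIVE_I64 is an internal, dev-API-only switch that defaults to NO (Config::enableNativeI64 = false) and, per ConvertToCPUSpecificOpset above, only swaps in the ConvertPrecisionI64ToI32 pass on x64 builds. The sketch below shows one way to turn it on through the legacy InferenceEngine API, mirroring how the functional tests in this patch set the key; it assumes a developer build that exposes the internal dev-API header, and the "model.xml"/"model.bin" paths are placeholders for an IR that actually carries i64 tensors.

```cpp
// Minimal sketch: enable native i64 execution in the CPU plugin via the internal config key.
#include <map>
#include <string>

#include <ie_core.hpp>
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>  // dev API, not a public header

int main() {
    InferenceEngine::Core core;
    // Placeholder IR paths; any model with i64 tensors is enough for the experiment.
    auto network = core.ReadNetwork("model.xml", "model.bin");

    // YES keeps i64 tensors native in the CPU plugin; any value other than YES/NO throws
    // (see Config::readProperties in this patch).
    const std::map<std::string, std::string> config{
        {InferenceEngine::PluginConfigInternalParams::KEY_CPU_NATIVE_I64,
         InferenceEngine::PluginConfigParams::YES}};

    auto execNet = core.LoadNetwork(network, "CPU", config);
    return 0;
}
```

With the key absent or set to NO, the pipeline keeps the old behaviour and ConvertPrecision folds i64/u64 down to i32 for the whole model; with YES, only nodes without native i64 support get i64<->i32 Convert pairs inserted around them, which is what InsertConvertI64I32CPUTest above checks (two Convert nodes around the NonZero).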