diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index 48401ba5ea42a9..b702eece37002b 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -511,12 +511,10 @@ async def main() -> None: ) invocation.append("-list-checks") invocation.append("-") - if args.quiet: - # Even with -quiet we still want to check if we can call clang-tidy. - with open(os.devnull, "w") as dev_null: - subprocess.check_call(invocation, stdout=dev_null) - else: - subprocess.check_call(invocation) + # Even with -quiet we still want to check if we can call clang-tidy. + subprocess.check_call( + invocation, stdout=subprocess.DEVNULL if args.quiet else None + ) except: print("Unable to run clang-tidy.", file=sys.stderr) sys.exit(1) diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index 92074bd4dae8ba..d5303418b859b2 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -127,14 +127,15 @@ Writing a clang-tidy Check So you have an idea of a useful check for :program:`clang-tidy`. -First, if you're not familiar with LLVM development, read through the `Getting -Started with LLVM`_ document for instructions on setting up your workflow and +First, if you're not familiar with LLVM development, read through the `Getting Started +with the LLVM System`_ document for instructions on setting up your workflow and the `LLVM Coding Standards`_ document to familiarize yourself with the coding -style used in the project. For code reviews we mostly use `LLVM Phabricator`_. +style used in the project. For code reviews we currently use `LLVM GitHub`_, +though historically we used Phabricator. -.. _Getting Started with LLVM: https://llvm.org/docs/GettingStarted.html +.. _Getting Started with the LLVM System: https://llvm.org/docs/GettingStarted.html .. _LLVM Coding Standards: https://llvm.org/docs/CodingStandards.html -.. _LLVM Phabricator: https://llvm.org/docs/Phabricator.html +.. _LLVM GitHub: https://github.com/llvm/llvm-project Next, you need to decide which module the check belongs to. Modules are located in subdirectories of `clang-tidy/ @@ -336,13 +337,25 @@ a starting point for your test cases. A rough outline of the process looks like The quickest way to prototype your matcher is to use :program:`clang-query` to interactively build up your matcher. For complicated matchers, build up a matching expression incrementally and use :program:`clang-query`'s ``let`` command to save named -matching expressions to simplify your matcher. Just like breaking up a huge function -into smaller chunks with intention-revealing names can help you understand a complex -algorithm, breaking up a matcher into smaller matchers with intention-revealing names -can help you understand a complicated matcher. Once you have a working matcher, the -C++ API will be virtually identical to your interactively constructed matcher. You can -use local variables to preserve your intention-revealing names that you applied to -nested matchers. +matching expressions to simplify your matcher. + +.. code-block:: console + + clang-query> let c1 cxxRecordDecl() + clang-query> match c1 + +Alternatively, pressing the tab key after a matcher's opening parenthesis will also +show which matchers can be chained with it, though some matchers that work +may not be listed.
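Once the interactively built matcher behaves as intended, it carries over almost verbatim into a check's ``registerMatchers``. A minimal sketch of that translation (the ``MyCheck`` class and the ``"record"`` binding are hypothetical, invented here for illustration):

.. code-block:: c++

  #include "clang/ASTMatchers/ASTMatchFinder.h"

  using namespace clang::ast_matchers;

  // The clang-query expression "let c1 cxxRecordDecl()" becomes an AST
  // matcher; a local variable keeps the intention-revealing name, and the
  // bound name lets the check() callback retrieve the matched node.
  void MyCheck::registerMatchers(MatchFinder *Finder) {
    auto C1 = cxxRecordDecl().bind("record");
    Finder->addMatcher(C1, this);
  }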
+ +Just like breaking up a huge function into smaller chunks with intention-revealing names +can help you understand a complex algorithm, breaking up a matcher into smaller matchers +with intention-revealing names can help you understand a complicated matcher. + +Once you have a working clang-query matcher, the C++ API matcher will be the same as, or +similar to, your interactively constructed matcher (in some cases the two differ slightly). +You can use local variables to preserve the intention-revealing names that you applied +to nested matchers. Creating private matchers ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -646,10 +659,13 @@ directory. The path to this directory is available in a lit test with the varia Out-of-tree check plugins ------------------------- + Developing an out-of-tree check as a plugin largely follows the steps -outlined above. The plugin is a shared library whose code lives outside +outlined above, including creating a new module and performing the steps needed to +register the module. The plugin is a shared library whose code lives outside the clang-tidy build system. Build and link this shared library against -LLVM as done for other kinds of Clang plugins. +LLVM as done for other kinds of Clang plugins. If using CMake, use the keyword +``MODULE`` when invoking ``add_library`` or ``llvm_add_library``. The plugin can be loaded by passing `-load` to `clang-tidy` in addition to the names of the checks to enable. @@ -664,6 +680,19 @@ compiled against the version of clang-tidy that will be loading the plugin. The plugins can use threads, TLS, or any other facilities available to in-tree code which is accessible from the external headers. +Note that testing out-of-tree checks might involve getting ``llvm-lit`` from an LLVM +installation compiled from source. See `Getting Started with the LLVM System`_ for ways +to do so. + +Alternatively, obtain `lit`_ by following the `test-suite guide`_, get the `FileCheck`_ binary, +and write a version of `check_clang_tidy.py`_ suited to your needs. + +.. _Getting Started with the LLVM System: https://llvm.org/docs/GettingStarted.html .. _test-suite guide: https://llvm.org/docs/TestSuiteGuide.html .. _lit: https://llvm.org/docs/CommandGuide/lit.html .. _FileCheck: https://llvm.org/docs/CommandGuide/FileCheck.html .. _check_clang_tidy.py: https://github.com/llvm/llvm-project/blob/main/clang-tools-extra/test/clang-tidy/check_clang_tidy.py + Running clang-tidy on LLVM -------------------------- @@ -688,10 +717,10 @@ warnings and errors. The script provides multiple configuration flags. * To restrict the files examined you can provide one or more regex arguments that the file names are matched against. - ``run-clang-tidy.py clang-tidy/.*Check\.cpp`` will only analyze clang-tidy + ``run-clang-tidy.py clang-tidy/.*Check\.cpp`` will only analyze `clang-tidy` checks. It may also be necessary to restrict the header files that warnings - are displayed from using the ``-header-filter`` flag. It has the same behavior - as the corresponding :program:`clang-tidy` flag. + are displayed from by using the ``-header-filter`` and ``-exclude-header-filter`` flags. + They have the same behavior as the corresponding :program:`clang-tidy` flags. * To apply suggested fixes ``-fix`` can be passed as an argument. This gathers all changes in a temporary directory and applies them.
Passing ``-format`` @@ -758,4 +787,4 @@ There is only one argument that controls profile storage: * If you run :program:`clang-tidy` from within ``/foo`` directory, and specify ``-store-check-profile=.``, then the profile will still be saved to - ``/foo/-example.cpp.json`` + ``/foo/-example.cpp.json`` \ No newline at end of file diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst index 4782eb3cda754a..e143c5b71575aa 100644 --- a/clang/docs/HLSL/ExpectedDifferences.rst +++ b/clang/docs/HLSL/ExpectedDifferences.rst @@ -54,6 +54,19 @@ HLSL 202x based on proposal and `0008 `_. +The largest difference between Clang's and DXC's overload resolution is the +algorithm used for identifying best-match overloads. There are more details +about the algorithmic differences in the :ref:`multi_argument_overloads` section +below. There are three high-level differences that should be highlighted: + +* **There should be no cases** where DXC and Clang both successfully + resolve an overload but resolve it to different candidates. +* There are cases where Clang will successfully resolve an overload that DXC + wouldn't, because we've trimmed the overload set in Clang to remove ambiguity. +* There are cases where DXC will successfully resolve an overload that Clang + will not, for two reasons: (1) DXC only generates partial overload sets for + builtin functions, and (2) DXC resolves cases that probably should be ambiguous. + Clang's implementation extends standard overload resolution rules to HLSL library functionality. This causes subtle changes in overload resolution behavior between Clang and DXC. Some examples include: @@ -71,18 +84,23 @@ behavior between Clang and DXC. Some examples include: uint U; int I; float X, Y, Z; - double3 A, B; + double3 R, G; } - void twoParams(int, int); - void twoParams(float, float); + void takesSingleDouble(double); + void takesSingleDouble(vector<double, 1>); + + void scalarOrVector(double); + void scalarOrVector(vector<double, 2>); export void call() { - halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads - // Clang: Resolves to halfOrInt16(uint16_t). - halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). half H; + halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). + #ifndef IGNORE_ERRORS + halfOrInt16(U); // All: Fails with call ambiguous between int16_t and uint16_t + // overloads + // asfloat16 is a builtin with overloads for half, int16_t, and uint16_t. H = asfloat16(I); // DXC: Fails to resolve overload for int. // Clang: Resolves to asfloat16(int16_t). @@ -94,21 +112,28 @@ behavior between Clang and DXC. Some examples include: takesDoubles(X, Y, Z); // Works on all compilers #ifndef IGNORE_ERRORS - fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double. + fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to + // double. // Clang: Resolves to fma(double,double,double). - #endif - double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. + double D = dot(R, G); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. // FXC: Expands to compute double dot product with fmul/fadd - // Clang: Resolves to dot(float3, float3), emits conversion warnings. + // Clang: Fails to resolve as ambiguous against + // dot(half, half) or dot(float, float) + #endif #ifndef IGNORE_ERRORS tan(B); // DXC: resolves to tan(float). // Clang: Fails to resolve, ambiguous between integer types.
- twoParams(I, X); // DXC: resolves twoParams(int, int). - // Clang: Fails to resolve ambiguous conversions. #endif + + double D; + takesSingleDouble(D); // All: Fails to resolve ambiguous conversions. + takesSingleDouble(R); // All: Fails to resolve ambiguous conversions. + + scalarOrVector(D); // All: Resolves to scalarOrVector(double). + scalarOrVector(R); // All: Fails to resolve ambiguous conversions. } .. note:: @@ -119,3 +144,75 @@ behavior between Clang and DXC. Some examples include: diagnostic notifying the user of the conversion rather than silently altering precision relative to the other overloads (as FXC does) or generating code that will fail validation (as DXC does). + +.. _multi_argument_overloads: + +Multi-Argument Overloads +------------------------ + +In addition to the differences in single-element conversions, Clang and DXC +differ dramatically in multi-argument overload resolution. C++ multi-argument +overload resolution behavior (or something very similar) is required to +implement +`non-member operator overloading `_. + +Clang adopts the C++-inspired language from the +`draft HLSL specification `_, +where an overload ``f1`` is a better candidate than ``f2`` if, for every argument, its +conversion sequence is not worse than the corresponding conversion sequence of ``f2``, and +for at least one argument it is better. + +.. code-block:: c++ + + cbuffer CB { + int I; + float X; + float4 V; + } + + void twoParams(int, int); + void twoParams(float, float); + void threeParams(float, float, float); + void threeParams(float4, float4, float4); + + export void call() { + twoParams(I, X); // DXC: resolves twoParams(int, int). + // Clang: Fails to resolve ambiguous conversions. + + threeParams(X, V, V); // DXC: resolves threeParams(float4, float4, float4). + // Clang: Fails to resolve ambiguous conversions. + } + +In the examples above, ``twoParams`` called with mixed arguments produces the +implicit conversion sequences { ExactMatch, FloatingIntegral } and { +FloatingIntegral, ExactMatch }. In each case an argument has a worse conversion +than in the other sequence, so the call is ambiguous. + +In the ``threeParams`` example the sequences are { ExactMatch, VectorTruncation, +VectorTruncation } and { VectorSplat, ExactMatch, ExactMatch }. Again, each +sequence has at least one argument with a worse conversion than in the other, so +the call is ambiguous. + +.. note:: + + The behavior of DXC described below is undocumented, so this was gleaned from + observation and a bit of reading the source. + +DXC's approach for determining the best overload produces an integer score value +for each implicit conversion sequence for each argument expression. Scores for +casts are based on a bitmask construction that is complicated to reverse +engineer. It seems that: + +* Exact match is 0 +* Dimension increase is 1 +* Promotion is 2 +* Integral -> Float conversion is 4 +* Float -> Integral conversion is 8 +* Cast is 16 + +The masks are or'd together to produce a score for the cast. + +The scores of each conversion sequence are then summed to generate a score for +the overload candidate. The overload candidate with the lowest score is the best +candidate. If more than one candidate shares the lowest score, the call +is ambiguous.
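To make the scoring model concrete, here is a minimal sketch of the mechanism described above, assuming the bit values listed are accurate; the names are invented, and this is a reconstruction of observed behavior rather than DXC's actual code.

.. code-block:: c++

  #include <cstdint>
  #include <numeric>
  #include <vector>

  // Bit values as listed above. A single conversion may set several bits;
  // for example, a promotion that also increases dimension scores 2 | 1 == 3.
  enum CastFlag : uint32_t {
    ExactMatch = 0,
    DimensionIncrease = 1,
    Promotion = 2,
    IntegralToFloat = 4,
    FloatToIntegral = 8,
    Cast = 16,
  };

  // The flags describing one argument's conversion are or'd together.
  uint32_t scoreConversion(const std::vector<CastFlag> &Flags) {
    uint32_t Score = 0;
    for (CastFlag F : Flags)
      Score |= F;
    return Score;
  }

  // Per-argument scores are summed into the candidate's score; the lowest
  // total wins, and a tie for the lowest total makes the call ambiguous.
  uint32_t scoreCandidate(const std::vector<uint32_t> &ArgScores) {
    return std::accumulate(ArgScores.begin(), ArgScores.end(), 0u);
  }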
diff --git a/clang/docs/tools/generate_formatted_state.py b/clang/docs/tools/generate_formatted_state.py index 66cebbf7af33a4..2de43dc383f557 100755 --- a/clang/docs/tools/generate_formatted_state.py +++ b/clang/docs/tools/generate_formatted_state.py @@ -78,8 +78,6 @@ def get_style(count, passed): - {style2}`{percent}%` """ -FNULL = open(os.devnull, "w") - with open(DOC_FILE, "wb") as output: cleanfiles = open(CLEAN_FILE, "wb") @@ -101,8 +99,8 @@ def get_style(count, passed): # interested in it, just the return code. git_check = subprocess.Popen( ["git", "ls-files", "--error-unmatch", act_sub_dir], - stdout=FNULL, - stderr=FNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) if git_check.wait() != 0: print("Skipping directory: ", act_sub_dir) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index 034d32c6291b3d..2e80eef2c8b9bc 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -124,6 +124,7 @@ TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "UiV8s", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_bitmask_i32x4, "UiV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_bitmask_i64x2, "UiV2LLi", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_abs_f16x8, "V8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_abs_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128") @@ -140,6 +141,10 @@ TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_ceil_f16x8, "V8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_floor_f16x8, "V8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_trunc_f16x8, "V8hV8h", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_nearest_f16x8, "V8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_f32x4, "V4fV4f", "nc", "simd128") @@ -151,9 +156,13 @@ TARGET_BUILTIN(__builtin_wasm_nearest_f64x2, "V2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_dot_s_i32x4_i16x8, "V4iV8sV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_sqrt_f16x8, "V8hV8h", "nc", "fp16") TARGET_BUILTIN(__builtin_wasm_sqrt_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_sqrt_f64x2, "V2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i16x8_f16x8, "V8sV8h", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i16x8_f16x8, "V8sV8h", "nc", "simd128") + TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i32x4_f32x4, "V4iV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i32x4_f32x4, "V4iV4f", "nc", "simd128") diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 6a77323d939791..9bd77edb0a550f 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -5324,11 +5324,11 @@ bool Compiler::VisitVectorUnaryOperator(const UnaryOperator *E) { auto UnaryOp = E->getOpcode(); if (UnaryOp != UO_Plus && UnaryOp != UO_Minus && UnaryOp != UO_LNot && - UnaryOp != UO_Not) + UnaryOp != UO_Not && UnaryOp != UO_AddrOf) return this->emitInvalid(E); // Nothing to do here. 
- if (UnaryOp == UO_Plus) + if (UnaryOp == UO_Plus || UnaryOp == UO_AddrOf) return this->delegate(SubExpr); if (!Initializing) { diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b0256a8ce9ed04..d6ec26af80aadd 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -350,6 +350,7 @@ void SourceManager::clearIDTables() { LastLineNoContentCache = nullptr; LastFileIDLookup = FileID(); + IncludedLocMap.clear(); if (LineTable) LineTable->clear(); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 4204c8ff276ab1..c9f21f9ded24f4 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -21211,6 +21211,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64: case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32: case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64: + case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i16x8_f16x8: case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: { Value *Src = EmitScalarExpr(E->getArg(0)); llvm::Type *ResT = ConvertType(E->getType()); @@ -21222,6 +21223,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64: case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32: case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64: + case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i16x8_f16x8: case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: { Value *Src = EmitScalarExpr(E->getArg(0)); llvm::Type *ResT = ConvertType(E->getType()); @@ -21269,6 +21271,10 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_ceil_f16x8: + case WebAssembly::BI__builtin_wasm_floor_f16x8: + case WebAssembly::BI__builtin_wasm_trunc_f16x8: + case WebAssembly::BI__builtin_wasm_nearest_f16x8: case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f32x4: @@ -21279,18 +21285,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, case WebAssembly::BI__builtin_wasm_nearest_f64x2: { unsigned IntNo; switch (BuiltinID) { + case WebAssembly::BI__builtin_wasm_ceil_f16x8: case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_ceil_f64x2: IntNo = Intrinsic::ceil; break; + case WebAssembly::BI__builtin_wasm_floor_f16x8: case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_floor_f64x2: IntNo = Intrinsic::floor; break; + case WebAssembly::BI__builtin_wasm_trunc_f16x8: case WebAssembly::BI__builtin_wasm_trunc_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f64x2: IntNo = Intrinsic::trunc; break; + case WebAssembly::BI__builtin_wasm_nearest_f16x8: case WebAssembly::BI__builtin_wasm_nearest_f32x4: case WebAssembly::BI__builtin_wasm_nearest_f64x2: IntNo = Intrinsic::nearbyint; @@ -21489,12 +21499,14 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType()); return Builder.CreateCall(Callee, {Vec}); } + case WebAssembly::BI__builtin_wasm_abs_f16x8: case WebAssembly::BI__builtin_wasm_abs_f32x4: case WebAssembly::BI__builtin_wasm_abs_f64x2: { Value *Vec = 
EmitScalarExpr(E->getArg(0)); Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType()); return Builder.CreateCall(Callee, {Vec}); } + case WebAssembly::BI__builtin_wasm_sqrt_f16x8: case WebAssembly::BI__builtin_wasm_sqrt_f32x4: case WebAssembly::BI__builtin_wasm_sqrt_f64x2: { Value *Vec = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index 2327bec52522d2..67d12f6f2cf419 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -33,6 +33,7 @@ typedef unsigned long long __u64x2 __attribute__((__vector_size__(16), __aligned__(16))); typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16))); typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16))); +typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16))); typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8))); typedef unsigned char __u8x8 @@ -1878,6 +1879,152 @@ wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t __a, v128_t __b, v128_t __c) { (__i8x16)__a, (__i8x16)__b, (__i32x4)__c); } +// FP16 intrinsics +#define __FP16_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("fp16"), \ + __min_vector_width__(128))) + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) { + return (v128_t)__builtin_wasm_splat_f16x8(__a); +} + +static __inline__ float __FP16_FN_ATTRS wasm_f16x8_extract_lane(v128_t __a, + int __i) + __REQUIRE_CONSTANT(__i) { + return __builtin_wasm_extract_lane_f16x8((__f16x8)__a, __i); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_replace_lane(v128_t __a, + int __i, + float __b) + __REQUIRE_CONSTANT(__i) { + return (v128_t)__builtin_wasm_replace_lane_f16x8((__f16x8)__a, __i, __b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) { + return (v128_t)__builtin_wasm_abs_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_neg(v128_t __a) { + return (v128_t)(-(__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sqrt(v128_t __a) { + return (v128_t)__builtin_wasm_sqrt_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ceil(v128_t __a) { + return (v128_t)__builtin_wasm_ceil_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_floor(v128_t __a) { + return (v128_t)__builtin_wasm_floor_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_trunc(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_nearest(v128_t __a) { + return (v128_t)__builtin_wasm_nearest_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_eq(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a == (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ne(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a != (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_lt(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a < (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_gt(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a > (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_le(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a <= (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ge(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a >= (__f16x8)__b); +} + +static 
__inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_add(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a + (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sub(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a - (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_mul(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a * (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_div(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a / (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_min(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_min_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_max(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_max_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmin(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_pmin_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmax(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_pmax_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS +wasm_i16x8_trunc_sat_f16x8(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_saturate_s_i16x8_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS +wasm_u16x8_trunc_sat_f16x8(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_saturate_u_i16x8_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_i16x8(v128_t __a) { + return (v128_t) __builtin_convertvector((__i16x8)__a, __f16x8); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) { + return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a, + v128_t __b, + v128_t __c) { + return (v128_t)__builtin_wasm_relaxed_madd_f16x8((__f16x8)__a, (__f16x8)__b, + (__f16x8)__c); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_nmadd(v128_t __a, + v128_t __b, + v128_t __c) { + return (v128_t)__builtin_wasm_relaxed_nmadd_f16x8((__f16x8)__a, (__f16x8)__b, + (__f16x8)__c); +} + // Deprecated intrinsics static __inline__ v128_t __DEPRECATED_FN_ATTRS("wasm_i8x16_swizzle") diff --git a/clang/test/CodeGen/address-safety-attr-flavors.cpp b/clang/test/CodeGen/address-safety-attr-flavors.cpp index 04d540d471dc8f..ef815555059db8 100644 --- a/clang/test/CodeGen/address-safety-attr-flavors.cpp +++ b/clang/test/CodeGen/address-safety-attr-flavors.cpp @@ -28,8 +28,8 @@ int HasSanitizeAddress() { return 1; } // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-ASAN: Function Attrs: mustprogress noinline nounwind sanitize_address // CHECK-KASAN: Function Attrs: mustprogress noinline nounwind sanitize_address -// CHECK-HWASAN: Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress -// CHECK-KHWASAN: Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress +// CHECK-HWASAN: Function Attrs: mustprogress noinline nounwind sanitize_hwaddress +// CHECK-KHWASAN: Function Attrs: mustprogress noinline nounwind sanitize_hwaddress __attribute__((no_sanitize("address"))) int NoSanitizeQuoteAddress() { return 0; @@ -37,15 +37,15 @@ __attribute__((no_sanitize("address"))) int NoSanitizeQuoteAddress() { // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-ASAN: {{Function Attrs: mustprogress noinline 
nounwind$}} // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind$}} -// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}} -// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}} +// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}} +// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}} __attribute__((no_sanitize_address)) int NoSanitizeAddress() { return 0; } // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind$}} -// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}} -// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}} +// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}} +// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}} __attribute__((no_sanitize("kernel-address"))) int NoSanitizeKernelAddress() { return 0; @@ -53,8 +53,8 @@ __attribute__((no_sanitize("kernel-address"))) int NoSanitizeKernelAddress() { // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind$}} -// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}} -// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind sanitize_hwaddress$}} +// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}} +// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_hwaddress$}} __attribute__((no_sanitize("hwaddress"))) int NoSanitizeHWAddress() { return 0; @@ -62,8 +62,8 @@ __attribute__((no_sanitize("hwaddress"))) int NoSanitizeHWAddress() { // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}} // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}} -// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}} -// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}} +// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind$}} +// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind$}} __attribute__((no_sanitize("kernel-hwaddress"))) int NoSanitizeKernelHWAddress() { return 0; @@ -71,8 +71,8 @@ __attribute__((no_sanitize("kernel-hwaddress"))) int NoSanitizeKernelHWAddress() // CHECK-NOASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-ASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}} // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}} -// CHECK-HWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}} -// CHECK-KHWASAN: {{Function Attrs: mustprogress nobuiltin noinline nounwind$}} +// CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind$}} +// CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind$}} __attribute__((disable_sanitizer_instrumentation)) int DisableSanitizerInstrumentation() { return 0; @@ -80,5 +80,5 @@ __attribute__((disable_sanitizer_instrumentation)) int DisableSanitizerInstrumen // CHECK-NOASAN: {{Function 
Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} // CHECK-ASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} // CHECK-KASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} -// CHECK-HWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress nobuiltin noinline nounwind$}} -// CHECK-KHWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress nobuiltin noinline nounwind$}} +// CHECK-HWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} +// CHECK-KHWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} diff --git a/clang/test/CodeGen/compound-literal.c b/clang/test/CodeGen/compound-literal.c index 5b3cebb7c6ae6a..5fe9594c0f954f 100644 --- a/clang/test/CodeGen/compound-literal.c +++ b/clang/test/CodeGen/compound-literal.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -fexperimental-new-constant-interpreter -emit-llvm %s -o - | FileCheck %s // Capture the type and name so matching later is cleaner. struct CompoundTy { int a; }; diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 6ee10976f12079..9d202e0d046822 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -2361,198 +2361,395 @@ extern "C" __device__ double test_modf(double x, double* y) { return modf(x, y); } -// CHECK-LABEL: @test_nanf( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 -// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ -// CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] -// CHECK-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] -// CHECK-NEXT: ] -// CHECK: while.cond.i30.i.i.preheader: -// CHECK-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] -// CHECK: while.cond.i30.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] -// CHECK: while.body.i34.i.i: -// CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 -// CHECK-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] -// CHECK: if.else.i.i.i: -// CHECK-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 -// CHECK-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 -// CHECK-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// CHECK: if.else17.i.i.i: -// CHECK-NEXT: [[TMP5:%.*]] = add i8 
[[TMP2]], -65 -// CHECK-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 -// CHECK-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] -// CHECK: if.end31.i.i.i: -// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 -// CHECK-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 -// CHECK-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] -// CHECK-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I36_I_I]] -// CHECK: cleanup.i36.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] -// CHECK: while.cond.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// CHECK: while.body.i.i.i: -// CHECK-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 -// CHECK-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] -// CHECK: if.then.i.i.i: -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 -// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 -// CHECK-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I_I_I]] -// CHECK: cleanup.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP7]] -// CHECK: while.cond.i14.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] -// CHECK-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] -// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] -// CHECK: while.body.i18.i.i: -// CHECK-NEXT: 
[[TMP9:%.*]] = add i8 [[TMP8]], -48 -// CHECK-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] -// CHECK: if.then.i24.i.i: -// CHECK-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 -// CHECK-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 -// CHECK-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 -// CHECK-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I20_I_I]] -// CHECK: cleanup.i20.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP10]] -// CHECK: _ZL4nanfPKc.exit: -// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] -// CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 -// CHECK-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 -// CHECK-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float -// CHECK-NEXT: ret float [[TMP10]] +// DEFAULT-LABEL: @test_nanf( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// DEFAULT: if.then.i.i: +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// DEFAULT-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// DEFAULT-NEXT: ] +// DEFAULT: while.cond.i30.i.i.preheader: +// DEFAULT-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// DEFAULT: while.cond.i30.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL4NANFPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] +// DEFAULT: while.body.i34.i.i: +// DEFAULT-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// DEFAULT-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// DEFAULT: if.else.i.i.i: +// DEFAULT-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// DEFAULT-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult 
i8 [[TMP4]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// DEFAULT: if.else17.i.i.i: +// DEFAULT-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// DEFAULT-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// DEFAULT: if.end31.i.i.i: +// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] +// DEFAULT: cleanup.i36.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] +// DEFAULT: while.cond.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// DEFAULT: while.body.i.i.i: +// DEFAULT-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// DEFAULT-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// DEFAULT: if.then.i.i.i: +// DEFAULT-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] +// DEFAULT: cleanup.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP7]] +// DEFAULT: while.cond.i14.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// DEFAULT-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// DEFAULT-NEXT: [[TMP8:%.*]] = load i8, ptr 
[[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] +// DEFAULT: while.body.i18.i.i: +// DEFAULT-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// DEFAULT-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// DEFAULT: if.then.i24.i.i: +// DEFAULT-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] +// DEFAULT: cleanup.i20.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP10]] +// DEFAULT: _ZL4nanfPKc.exit: +// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// DEFAULT-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 +// DEFAULT-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 +// DEFAULT-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 +// DEFAULT-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float +// DEFAULT-NEXT: ret float [[TMP10]] +// +// FINITEONLY-LABEL: @test_nanf( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: ret float poison +// +// APPROX-LABEL: @test_nanf( +// APPROX-NEXT: entry: +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// APPROX: if.then.i.i: +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// APPROX-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// APPROX-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// APPROX-NEXT: ] +// APPROX: while.cond.i30.i.i.preheader: +// APPROX-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// APPROX: while.cond.i30.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL4NANFPKC_EXIT:%.*]], label 
[[WHILE_BODY_I34_I_I:%.*]] +// APPROX: while.body.i34.i.i: +// APPROX-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// APPROX-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// APPROX: if.else.i.i.i: +// APPROX-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// APPROX-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// APPROX: if.else17.i.i.i: +// APPROX-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// APPROX-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// APPROX: if.end31.i.i.i: +// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I36_I_I]] +// APPROX: cleanup.i36.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] +// APPROX: while.cond.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// APPROX: while.body.i.i.i: +// APPROX-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// APPROX-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// APPROX: if.then.i.i.i: +// APPROX-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I_I_I]] +// APPROX: cleanup.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label 
[[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP7]] +// APPROX: while.cond.i14.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// APPROX-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// APPROX-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL4NANFPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] +// APPROX: while.body.i18.i.i: +// APPROX-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// APPROX-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// APPROX: if.then.i24.i.i: +// APPROX-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I20_I_I]] +// APPROX: cleanup.i20.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP10]] +// APPROX: _ZL4nanfPKc.exit: +// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// APPROX-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 +// APPROX-NEXT: [[BF_VALUE_I:%.*]] = and i32 [[CONV_I]], 4194303 +// APPROX-NEXT: [[BF_SET9_I:%.*]] = or disjoint i32 [[BF_VALUE_I]], 2143289344 +// APPROX-NEXT: [[TMP10:%.*]] = bitcast i32 [[BF_SET9_I]] to float +// APPROX-NEXT: ret float [[TMP10]] // extern "C" __device__ float test_nanf(const char *tag) { return nanf(tag); } -// CHECK-LABEL: @test_nan( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 -// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ -// CHECK-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] -// CHECK-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] -// CHECK-NEXT: ] -// CHECK: while.cond.i30.i.i.preheader: -// CHECK-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] -// CHECK: while.cond.i30.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], 
[[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] -// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] -// CHECK: while.body.i34.i.i: -// CHECK-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 -// CHECK-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] -// CHECK: if.else.i.i.i: -// CHECK-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 -// CHECK-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 -// CHECK-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] -// CHECK: if.else17.i.i.i: -// CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 -// CHECK-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 -// CHECK-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] -// CHECK: if.end31.i.i.i: -// CHECK-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 -// CHECK-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 -// CHECK-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] -// CHECK-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I36_I_I]] -// CHECK: cleanup.i36.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] -// CHECK-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] -// CHECK: while.cond.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] -// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] -// CHECK: while.body.i.i.i: -// CHECK-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 -// CHECK-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] -// CHECK: if.then.i.i.i: -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 -// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 -// CHECK-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I_I_I]] -// CHECK: cleanup.i.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ 
[[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP7]] -// CHECK: while.cond.i14.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] -// CHECK-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] -// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] -// CHECK-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] -// CHECK: while.body.i18.i.i: -// CHECK-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 -// CHECK-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] -// CHECK: if.then.i24.i.i: -// CHECK-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 -// CHECK-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 -// CHECK-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 -// CHECK-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] -// CHECK-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 -// CHECK-NEXT: br label [[CLEANUP_I20_I_I]] -// CHECK: cleanup.i20.i.i: -// CHECK-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] -// CHECK-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP10]] -// CHECK: _ZL3nanPKc.exit: -// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] -// CHECK-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 -// CHECK-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 -// CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double -// CHECK-NEXT: ret double [[TMP10]] +// DEFAULT-LABEL: @test_nan( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// DEFAULT-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// DEFAULT: if.then.i.i: +// DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// DEFAULT-NEXT: i8 120, label [[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// DEFAULT-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// DEFAULT-NEXT: ] +// DEFAULT: while.cond.i30.i.i.preheader: +// DEFAULT-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// DEFAULT: while.cond.i30.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], 
[[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// DEFAULT-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] +// DEFAULT: while.body.i34.i.i: +// DEFAULT-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// DEFAULT-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// DEFAULT: if.else.i.i.i: +// DEFAULT-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// DEFAULT-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// DEFAULT: if.else17.i.i.i: +// DEFAULT-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// DEFAULT-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// DEFAULT-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// DEFAULT: if.end31.i.i.i: +// DEFAULT-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// DEFAULT-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// DEFAULT-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// DEFAULT-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I36_I_I]] +// DEFAULT: cleanup.i36.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] +// DEFAULT: while.cond.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// DEFAULT-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// DEFAULT: while.body.i.i.i: +// DEFAULT-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// DEFAULT-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// DEFAULT: if.then.i.i.i: +// DEFAULT-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// DEFAULT-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// DEFAULT-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr 
[[__TAGP_ADDR_0_I_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I_I_I]] +// DEFAULT: cleanup.i.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP7]] +// DEFAULT: while.cond.i14.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// DEFAULT-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// DEFAULT-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// DEFAULT-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// DEFAULT-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] +// DEFAULT: while.body.i18.i.i: +// DEFAULT-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// DEFAULT-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// DEFAULT: if.then.i24.i.i: +// DEFAULT-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// DEFAULT-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// DEFAULT-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// DEFAULT-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// DEFAULT-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// DEFAULT-NEXT: br label [[CLEANUP_I20_I_I]] +// DEFAULT: cleanup.i20.i.i: +// DEFAULT-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// DEFAULT-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP10]] +// DEFAULT: _ZL3nanPKc.exit: +// DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// DEFAULT-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 +// DEFAULT-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 +// DEFAULT-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double +// DEFAULT-NEXT: ret double [[TMP10]] +// +// FINITEONLY-LABEL: @test_nan( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: ret double poison +// +// APPROX-LABEL: @test_nan( +// APPROX-NEXT: entry: +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG:%.*]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 +// APPROX-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[WHILE_COND_I14_I_I:%.*]] +// APPROX: if.then.i.i: +// APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds i8, ptr [[TAG]], i64 1 +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: switch i8 [[TMP1]], label [[WHILE_COND_I_I_I:%.*]] [ +// APPROX-NEXT: i8 120, label 
[[WHILE_COND_I30_I_I_PREHEADER:%.*]] +// APPROX-NEXT: i8 88, label [[WHILE_COND_I30_I_I_PREHEADER]] +// APPROX-NEXT: ] +// APPROX: while.cond.i30.i.i.preheader: +// APPROX-NEXT: br label [[WHILE_COND_I30_I_I:%.*]] +// APPROX: while.cond.i30.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I31_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I37_I_I:%.*]], [[CLEANUP_I36_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[__R_0_I32_I_I:%.*]] = phi i64 [ [[__R_2_I_I_I:%.*]], [[CLEANUP_I36_I_I]] ], [ 0, [[WHILE_COND_I30_I_I_PREHEADER]] ] +// APPROX-NEXT: [[TMP2:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I31_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I33_I_I:%.*]] = icmp eq i8 [[TMP2]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I33_I_I]], label [[_ZL3NANPKC_EXIT:%.*]], label [[WHILE_BODY_I34_I_I:%.*]] +// APPROX: while.body.i34.i.i: +// APPROX-NEXT: [[TMP3:%.*]] = add i8 [[TMP2]], -48 +// APPROX-NEXT: [[OR_COND_I35_I_I:%.*]] = icmp ult i8 [[TMP3]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I35_I_I]], label [[IF_END31_I_I_I:%.*]], label [[IF_ELSE_I_I_I:%.*]] +// APPROX: if.else.i.i.i: +// APPROX-NEXT: [[TMP4:%.*]] = add i8 [[TMP2]], -97 +// APPROX-NEXT: [[OR_COND33_I_I_I:%.*]] = icmp ult i8 [[TMP4]], 6 +// APPROX-NEXT: br i1 [[OR_COND33_I_I_I]], label [[IF_END31_I_I_I]], label [[IF_ELSE17_I_I_I:%.*]] +// APPROX: if.else17.i.i.i: +// APPROX-NEXT: [[TMP5:%.*]] = add i8 [[TMP2]], -65 +// APPROX-NEXT: [[OR_COND34_I_I_I:%.*]] = icmp ult i8 [[TMP5]], 6 +// APPROX-NEXT: br i1 [[OR_COND34_I_I_I]], label [[IF_END31_I_I_I]], label [[CLEANUP_I36_I_I]] +// APPROX: if.end31.i.i.i: +// APPROX-NEXT: [[DOTSINK:%.*]] = phi i64 [ -48, [[WHILE_BODY_I34_I_I]] ], [ -87, [[IF_ELSE_I_I_I]] ], [ -55, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[MUL24_I_I_I:%.*]] = shl i64 [[__R_0_I32_I_I]], 4 +// APPROX-NEXT: [[CONV25_I_I_I:%.*]] = zext nneg i8 [[TMP2]] to i64 +// APPROX-NEXT: [[ADD26_I_I_I:%.*]] = add i64 [[MUL24_I_I_I]], [[DOTSINK]] +// APPROX-NEXT: [[ADD28_I_I_I:%.*]] = add i64 [[ADD26_I_I_I]], [[CONV25_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I40_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I31_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I36_I_I]] +// APPROX: cleanup.i36.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I37_I_I]] = phi ptr [ [[INCDEC_PTR_I40_I_I]], [[IF_END31_I_I_I]] ], [ [[__TAGP_ADDR_0_I31_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[__R_2_I_I_I]] = phi i64 [ [[ADD28_I_I_I]], [[IF_END31_I_I_I]] ], [ [[__R_0_I32_I_I]], [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: [[COND_I_I_I:%.*]] = phi i1 [ true, [[IF_END31_I_I_I]] ], [ false, [[IF_ELSE17_I_I_I]] ] +// APPROX-NEXT: br i1 [[COND_I_I_I]], label [[WHILE_COND_I30_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] +// APPROX: while.cond.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I_I_I:%.*]], [[CLEANUP_I_I_I:%.*]] ], [ [[INCDEC_PTR_I_I]], [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I_I_I:%.*]], [[CLEANUP_I_I_I]] ], [ 0, [[IF_THEN_I_I]] ] +// APPROX-NEXT: [[TMP6:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I_I_I:%.*]] +// APPROX: while.body.i.i.i: +// APPROX-NEXT: [[TMP7:%.*]] = and i8 [[TMP6]], -8 +// APPROX-NEXT: [[OR_COND_I_I_I:%.*]] = icmp eq i8 [[TMP7]], 48 +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[CLEANUP_I_I_I]] +// APPROX: if.then.i.i.i: +// 
APPROX-NEXT: [[MUL_I_I_I:%.*]] = shl i64 [[__R_0_I_I_I]], 3 +// APPROX-NEXT: [[CONV5_I_I_I:%.*]] = zext nneg i8 [[TMP6]] to i64 +// APPROX-NEXT: [[ADD_I_I_I:%.*]] = add i64 [[MUL_I_I_I]], -48 +// APPROX-NEXT: [[SUB_I_I_I:%.*]] = add i64 [[ADD_I_I_I]], [[CONV5_I_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I_I_I]] +// APPROX: cleanup.i.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I_I_I]] = phi ptr [ [[INCDEC_PTR_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__TAGP_ADDR_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], [[IF_THEN_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_BODY_I_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I_I_I]], label [[WHILE_COND_I_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP7]] +// APPROX: while.cond.i14.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_0_I15_I_I:%.*]] = phi ptr [ [[__TAGP_ADDR_1_I21_I_I:%.*]], [[CLEANUP_I20_I_I:%.*]] ], [ [[TAG]], [[ENTRY:%.*]] ] +// APPROX-NEXT: [[__R_0_I16_I_I:%.*]] = phi i64 [ [[__R_1_I22_I_I:%.*]], [[CLEANUP_I20_I_I]] ], [ 0, [[ENTRY]] ] +// APPROX-NEXT: [[TMP8:%.*]] = load i8, ptr [[__TAGP_ADDR_0_I15_I_I]], align 1, !tbaa [[TBAA4]] +// APPROX-NEXT: [[CMP_NOT_I17_I_I:%.*]] = icmp eq i8 [[TMP8]], 0 +// APPROX-NEXT: br i1 [[CMP_NOT_I17_I_I]], label [[_ZL3NANPKC_EXIT]], label [[WHILE_BODY_I18_I_I:%.*]] +// APPROX: while.body.i18.i.i: +// APPROX-NEXT: [[TMP9:%.*]] = add i8 [[TMP8]], -48 +// APPROX-NEXT: [[OR_COND_I19_I_I:%.*]] = icmp ult i8 [[TMP9]], 10 +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[IF_THEN_I24_I_I:%.*]], label [[CLEANUP_I20_I_I]] +// APPROX: if.then.i24.i.i: +// APPROX-NEXT: [[MUL_I25_I_I:%.*]] = mul i64 [[__R_0_I16_I_I]], 10 +// APPROX-NEXT: [[CONV5_I26_I_I:%.*]] = zext nneg i8 [[TMP8]] to i64 +// APPROX-NEXT: [[ADD_I27_I_I:%.*]] = add i64 [[MUL_I25_I_I]], -48 +// APPROX-NEXT: [[SUB_I28_I_I:%.*]] = add i64 [[ADD_I27_I_I]], [[CONV5_I26_I_I]] +// APPROX-NEXT: [[INCDEC_PTR_I29_I_I:%.*]] = getelementptr inbounds i8, ptr [[__TAGP_ADDR_0_I15_I_I]], i64 1 +// APPROX-NEXT: br label [[CLEANUP_I20_I_I]] +// APPROX: cleanup.i20.i.i: +// APPROX-NEXT: [[__TAGP_ADDR_1_I21_I_I]] = phi ptr [ [[INCDEC_PTR_I29_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__TAGP_ADDR_0_I15_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: [[__R_1_I22_I_I]] = phi i64 [ [[SUB_I28_I_I]], [[IF_THEN_I24_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_BODY_I18_I_I]] ] +// APPROX-NEXT: br i1 [[OR_COND_I19_I_I]], label [[WHILE_COND_I14_I_I]], label [[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP10]] +// APPROX: _ZL3nanPKc.exit: +// APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, [[CLEANUP_I_I_I]] ], [ [[__R_0_I_I_I]], [[WHILE_COND_I_I_I]] ], [ 0, [[CLEANUP_I36_I_I]] ], [ [[__R_0_I32_I_I]], [[WHILE_COND_I30_I_I]] ], [ 0, [[CLEANUP_I20_I_I]] ], [ [[__R_0_I16_I_I]], [[WHILE_COND_I14_I_I]] ] +// APPROX-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 +// APPROX-NEXT: [[BF_SET9_I:%.*]] = or disjoint i64 [[BF_VALUE_I]], 9221120237041090560 +// APPROX-NEXT: [[TMP10:%.*]] = bitcast i64 [[BF_SET9_I]] to double +// APPROX-NEXT: ret double [[TMP10]] // extern "C" __device__ double test_nan(const char *tag) { return nan(tag); diff --git a/clang/test/SemaTemplate/default-arguments.cpp b/clang/test/SemaTemplate/default-arguments.cpp index d5d9687cc90f49..3b1fbda414c12b 100644 --- a/clang/test/SemaTemplate/default-arguments.cpp +++ b/clang/test/SemaTemplate/default-arguments.cpp @@ -229,3 +229,55 @@ namespace unevaluated { template int f(int = a); // expected-warning 
0-1{{extension}}
 int k = sizeof(f());
 }
+
+#if __cplusplus >= 201103L
+namespace GH68490 {
+
+template <typename T> struct S {
+  template <typename U>
+  constexpr int SizeOfU(int param = sizeof(U)) const;
+
+  template <typename U>
+  constexpr int SizeOfT(int param = sizeof(T)) const;
+};
+
+template <typename T> struct S<T *> {
+  template <typename U>
+  constexpr int SizeOfU(int param = sizeof(U)) const;
+
+  template <typename U>
+  constexpr int SizeOfT(int param = sizeof(T *)) const;
+};
+
+template <typename T>
+template <typename U>
+constexpr int S<T *>::SizeOfU(int param) const {
+  return param;
+}
+
+template <typename T>
+template <typename U>
+constexpr int S<T *>::SizeOfT(int param) const {
+  return param;
+}
+
+template <>
+template <typename U>
+constexpr int S<int>::SizeOfU(int param) const {
+  return param;
+}
+
+template <>
+template <typename U>
+constexpr int S<int>::SizeOfT(int param) const {
+  return param;
+}
+
+static_assert(S<int>().SizeOfU<char>() == sizeof(char), "");
+static_assert(S<int>().SizeOfT<char>() == sizeof(int), "");
+static_assert(S<short *>().SizeOfU<char>() == sizeof(char), "");
+static_assert(S<short *>().SizeOfT<char>() == sizeof(short *), "");
+
+} // namespace GH68490
+
+#endif
diff --git a/clang/test/SemaTemplate/default-parm-init.cpp b/clang/test/SemaTemplate/default-parm-init.cpp
deleted file mode 100644
index 73ba8998df6a98..00000000000000
--- a/clang/test/SemaTemplate/default-parm-init.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++17 -verify %s
-// RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify %s
-// expected-no-diagnostics
-
-namespace std {
-
-template <typename T> class function;
-
-template <typename R, typename... Args> class invoker_base {
-public:
-  virtual ~invoker_base() { }
-  virtual R invoke(Args...) = 0;
-  virtual invoker_base* clone() = 0;
-};
-
-template <typename F, typename R, typename... Args>
-class functor_invoker : public invoker_base<R, Args...> {
-public:
-  explicit functor_invoker(const F& f) : f(f) { }
-  R invoke(Args... args) { return f(args...); }
-  functor_invoker* clone() { return new functor_invoker(f); }
-
-private:
-  F f;
-};
-
-template <typename R, typename... Args>
-class function<R (Args...)> {
-public:
-  typedef R result_type;
-  function() : invoker (0) { }
-  function(const function& other) : invoker(0) {
-    if (other.invoker)
-      invoker = other.invoker->clone();
-  }
-
-  template <typename F> function(const F& f) : invoker(0) {
-    invoker = new functor_invoker<F, R, Args...>(f);
-  }
-
-  ~function() {
-    if (invoker)
-      delete invoker;
-  }
-
-  function& operator=(const function& other) {
-    function(other).swap(*this);
-    return *this;
-  }
-
-  template <typename F>
-  function& operator=(const F& f) {
-    function(f).swap(*this);
-    return *this;
-  }
-
-  void swap(function& other) {
-    invoker_base<R, Args...>* tmp = invoker;
-    invoker = other.invoker;
-    other.invoker = tmp;
-  }
-
-  result_type operator()(Args...
args) const { - return invoker->invoke(args...); - } - -private: - invoker_base* invoker; -}; - -} - -template -struct Problem { - template - constexpr int FuncAlign(int param = alignof(FunctionTemplateParam)); - - template - constexpr int FuncSizeof(int param = sizeof(FunctionTemplateParam)); - - template - constexpr int FuncAlign2(int param = alignof(TemplateParam)); - - template - constexpr int FuncSizeof2(int param = sizeof(TemplateParam)); -}; - -template -struct Problem { - template - constexpr int FuncAlign(int param = alignof(FunctionTemplateParam)); - - template - constexpr int FuncSizeof(int param = sizeof(FunctionTemplateParam)); - - template - constexpr int FuncAlign2(int param = alignof(TemplateParam)); - - template - constexpr int FuncSizeof2(int param = sizeof(TemplateParam)); -}; - -template -template -constexpr int Problem::FuncAlign(int param) { - return 2U*param; -} - -template -template -constexpr int Problem::FuncSizeof(int param) { - return 2U*param; -} - -template -template -constexpr int Problem::FuncAlign2(int param) { - return 2U*param; -} - -template -template -constexpr int Problem::FuncSizeof2(int param) { - return 2U*param; -} - -template <> -template -constexpr int Problem::FuncAlign(int param) { - return param; -} - -template <> -template -constexpr int Problem::FuncSizeof(int param) { - return param; -} - -template <> -template -constexpr int Problem::FuncAlign2(int param) { - return param; -} - -template <> -template -constexpr int Problem::FuncSizeof2(int param) { - return param; -} - -void foo() { - Problem p = {}; - static_assert(p.FuncAlign() == alignof(char)); - static_assert(p.FuncSizeof() == sizeof(char)); - static_assert(p.FuncAlign2() == alignof(int)); - static_assert(p.FuncSizeof2() == sizeof(int)); - Problem q = {}; - static_assert(q.FuncAlign() == 2U * alignof(char)); - static_assert(q.FuncSizeof() == 2U * sizeof(char)); - static_assert(q.FuncAlign2() == 2U *alignof(short)); - static_assert(q.FuncSizeof2() == 2U * sizeof(short)); -} - -template -class A { - public: - void run( - std::function f1 = [](auto&&) {}, - std::function f2 = [](auto&&) {}); - private: - class Helper { - public: - explicit Helper(std::function f2) : f2_(f2) {} - std::function f2_; - }; -}; - -template -void A::run(std::function f1, - std::function f2) { - Helper h(f2); -} - -struct B {}; - -int main() { - A a; - a.run([&](auto& l) {}); - return 0; -} diff --git a/clang/tools/scan-build-py/CMakeLists.txt b/clang/tools/scan-build-py/CMakeLists.txt index 3aca22c0b0a8d3..9273eb5ed977e4 100644 --- a/clang/tools/scan-build-py/CMakeLists.txt +++ b/clang/tools/scan-build-py/CMakeLists.txt @@ -88,7 +88,7 @@ foreach(lib ${LibScanbuild}) DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/lib/libscanbuild/${lib}) list(APPEND Depends ${CMAKE_BINARY_DIR}/lib/libscanbuild/${lib}) install(FILES lib/libscanbuild/${lib} - DESTINATION lib${CLANG_LIBDIR_SUFFIX}/libscanbuild + DESTINATION lib/libscanbuild COMPONENT scan-build-py) endforeach() @@ -106,7 +106,7 @@ foreach(resource ${LibScanbuildResources}) DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/lib/libscanbuild/resources/${resource}) list(APPEND Depends ${CMAKE_BINARY_DIR}/lib/libscanbuild/resources/${resource}) install(FILES lib/libscanbuild/resources/${resource} - DESTINATION lib${CLANG_LIBDIR_SUFFIX}/libscanbuild/resources + DESTINATION lib/libscanbuild/resources COMPONENT scan-build-py) endforeach() @@ -122,7 +122,7 @@ foreach(lib ${LibEar}) DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/lib/libear/${lib}) list(APPEND Depends ${CMAKE_BINARY_DIR}/lib/libear/${lib}) 
install(FILES lib/libear/${lib}
-          DESTINATION lib${CLANG_LIBDIR_SUFFIX}/libear
+          DESTINATION lib/libear
           COMPONENT scan-build-py)
 endforeach()
diff --git a/clang/tools/scan-view/share/startfile.py b/clang/tools/scan-view/share/startfile.py
index d63e69280e90dd..c72475e8b6212e 100644
--- a/clang/tools/scan-view/share/startfile.py
+++ b/clang/tools/scan-view/share/startfile.py
@@ -48,7 +48,7 @@ def _invoke(self, cmdline):
             or sys.platform[:3] == "win"
             or sys.platform == "darwin"
         ):
-            inout = file(os.devnull, "r+")
+            inout = subprocess.DEVNULL
         else:
             # for TTY programs, we need stdin/out
             inout = None
diff --git a/clang/unittests/Basic/SourceManagerTest.cpp b/clang/unittests/Basic/SourceManagerTest.cpp
index 45840f5188cdcd..0f2476bd8b0612 100644
--- a/clang/unittests/Basic/SourceManagerTest.cpp
+++ b/clang/unittests/Basic/SourceManagerTest.cpp
@@ -20,6 +20,7 @@
 #include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Process.h"
 #include "gtest/gtest.h"
 #include
@@ -453,6 +454,65 @@ TEST_F(SourceManagerTest, loadedSLocEntryIsInTheSameTranslationUnit) {
 #if defined(LLVM_ON_UNIX)
 
+// A single SourceManager instance is sometimes reused across multiple
+// compilations. This test makes sure we're resetting caches built for tracking
+// include locations that are based on FileIDs, to make sure we don't report
+// wrong include locations when FileIDs coincide between two different runs.
+TEST_F(SourceManagerTest, ResetsIncludeLocMap) {
+  auto ParseFile = [&] {
+    TrivialModuleLoader ModLoader;
+    HeaderSearch HeaderInfo(std::make_shared<HeaderSearchOptions>(), SourceMgr,
+                            Diags, LangOpts, &*Target);
+    Preprocessor PP(std::make_shared<PreprocessorOptions>(), Diags, LangOpts,
+                    SourceMgr, HeaderInfo, ModLoader,
+                    /*IILookup =*/nullptr,
+                    /*OwnsHeaderSearch =*/false);
+    PP.Initialize(*Target);
+    PP.EnterMainSourceFile();
+    PP.LexTokensUntilEOF();
+    EXPECT_FALSE(Diags.hasErrorOccurred());
+  };
+
+  auto Buf = llvm::MemoryBuffer::getMemBuffer("");
+  FileEntryRef HeaderFile =
+      FileMgr.getVirtualFileRef("/foo.h", Buf->getBufferSize(), 0);
+  SourceMgr.overrideFileContents(HeaderFile, std::move(Buf));
+
+  Buf = llvm::MemoryBuffer::getMemBuffer(R"cpp(#include "/foo.h")cpp");
+  FileEntryRef BarFile =
+      FileMgr.getVirtualFileRef("/bar.h", Buf->getBufferSize(), 0);
+  SourceMgr.overrideFileContents(BarFile, std::move(Buf));
+  SourceMgr.createFileID(BarFile, {}, clang::SrcMgr::C_User);
+
+  Buf = llvm::MemoryBuffer::getMemBuffer(R"cpp(#include "/foo.h")cpp");
+  FileID MFID = SourceMgr.createFileID(std::move(Buf));
+  SourceMgr.setMainFileID(MFID);
+
+  ParseFile();
+  auto FooFID = SourceMgr.getOrCreateFileID(HeaderFile, clang::SrcMgr::C_User);
+  auto IncFID = SourceMgr.getDecomposedIncludedLoc(FooFID).first;
+  EXPECT_EQ(IncFID, MFID);
+
+  // Clean up source-manager state before we start next parse.
+  SourceMgr.clearIDTables();
+
+  // Set up a new main file.
+  Buf = llvm::MemoryBuffer::getMemBuffer(R"cpp(
+  // silly comment 42
+  #include "/bar.h")cpp");
+  MFID = SourceMgr.createFileID(std::move(Buf));
+  SourceMgr.setMainFileID(MFID);
+
+  ParseFile();
+  // Make sure foo.h got the same file-id in both runs.
+  EXPECT_EQ(FooFID,
+            SourceMgr.getOrCreateFileID(HeaderFile, clang::SrcMgr::C_User));
+  auto BarFID = SourceMgr.getOrCreateFileID(BarFile, clang::SrcMgr::C_User);
+  IncFID = SourceMgr.getDecomposedIncludedLoc(FooFID).first;
+  // Check that includer is bar.h during this run.
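+  // If clearIDTables() did not also reset the FileID-keyed include-location
+  // cache, this lookup could still see the includer recorded during the
+  // first parse.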
+ EXPECT_EQ(IncFID, BarFID); +} + TEST_F(SourceManagerTest, getMacroArgExpandedLocation) { const char *header = "#define FM(x,y) x\n"; diff --git a/clang/utils/creduce-clang-crash.py b/clang/utils/creduce-clang-crash.py index db4a3435a3aef7..180dfbeab224e9 100755 --- a/clang/utils/creduce-clang-crash.py +++ b/clang/utils/creduce-clang-crash.py @@ -8,7 +8,6 @@ *.test.sh -- interestingness test for C-Reduce """ -from __future__ import print_function from argparse import ArgumentParser, RawTextHelpFormatter import os import re @@ -228,8 +227,7 @@ def check_interestingness(self): testfile = os.path.abspath(self.testfile) # Check that the test considers the original file interesting - with open(os.devnull, "w") as devnull: - returncode = subprocess.call(testfile, stdout=devnull) + returncode = subprocess.call(testfile, stdout=subprocess.DEVNULL) if returncode: sys.exit("The interestingness test does not pass for the original file.") diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 13adbd6c4d57d9..2c3b0fa84a4782 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -868,10 +868,12 @@ else () endif() endif() endif() - check_c_source_compiles("_Float16 foo(_Float16 x) { return x; }" + check_c_source_compiles("_Float16 foo(_Float16 x) { return x; } + int main(void) { return 0; }" COMPILER_RT_HAS_${arch}_FLOAT16) append_list_if(COMPILER_RT_HAS_${arch}_FLOAT16 -DCOMPILER_RT_HAS_FLOAT16 BUILTIN_CFLAGS_${arch}) - check_c_source_compiles("__bf16 foo(__bf16 x) { return x; }" + check_c_source_compiles("__bf16 foo(__bf16 x) { return x; } + int main(void) { return 0; }" COMPILER_RT_HAS_${arch}_BFLOAT16) # Build BF16 files only when "__bf16" is available. if(COMPILER_RT_HAS_${arch}_BFLOAT16) diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp index 8a7ff03c611c65..b2c4616b5fd0dc 100644 --- a/compiler-rt/lib/rtsan/rtsan.cpp +++ b/compiler-rt/lib/rtsan/rtsan.cpp @@ -58,11 +58,11 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_exit() { __rtsan::GetContextForThisThread().RealtimePop(); } -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_off() { +SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_disable() { __rtsan::GetContextForThisThread().BypassPush(); } -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_on() { +SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_enable() { __rtsan::GetContextForThisThread().BypassPop(); } diff --git a/compiler-rt/lib/rtsan/rtsan.h b/compiler-rt/lib/rtsan/rtsan.h index 3d665c98aed184..ae23609f97d2dc 100644 --- a/compiler-rt/lib/rtsan/rtsan.h +++ b/compiler-rt/lib/rtsan/rtsan.h @@ -38,11 +38,11 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_exit(); // Disable all RTSan error reporting. // Injected into the code if "nosanitize(realtime)" is on a function. -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_off(); +SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_disable(); // Re-enable all RTSan error reporting. -// The counterpart to `__rtsan_off`. -SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_on(); +// The counterpart to `__rtsan_disable`. 
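+// Typical usage brackets a deliberately unchecked region, e.g.:
+//
+//   __rtsan_disable();
+//   // ... calls that are not real-time-safe ...
+//   __rtsan_enable();
+//
+// The bypass is maintained as a push/pop pair (see BypassPush/BypassPop),
+// so disable/enable pairs may nest.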
+SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_enable(); SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_expect_not_realtime(const char *intercepted_function_name); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp index 6e7ab016a4c6b2..5a86957170dcec 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_functional.cpp @@ -204,10 +204,10 @@ TEST(TestRtsan, ThrowingAnExceptionDiesWhenRealtime) { TEST(TestRtsan, DoesNotDieIfTurnedOff) { std::mutex mutex; auto RealtimeUnsafeFunc = [&]() { - __rtsan_off(); + __rtsan_disable(); mutex.lock(); mutex.unlock(); - __rtsan_on(); + __rtsan_enable(); }; RealtimeInvoke(RealtimeUnsafeFunc); } diff --git a/compiler-rt/test/nsan/vec_sqrt.cpp b/compiler-rt/test/nsan/vec_sqrt.cpp index d1ef0487858506..64a7130322873c 100644 --- a/compiler-rt/test/nsan/vec_sqrt.cpp +++ b/compiler-rt/test/nsan/vec_sqrt.cpp @@ -1,7 +1,7 @@ // RUN: %clangxx_nsan -O0 -g -mavx %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +// RUN: env NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s // RUN: %clangxx_nsan -O3 -g -mavx %s -o %t -// RUN: NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s +// RUN: env NSAN_OPTIONS=check_nan=true,halt_on_error=0 %run %t 2>&1 | FileCheck %s #include #include @@ -31,4 +31,4 @@ int main() { // CHECK: WARNING: NumericalStabilitySanitizer: NaN detected } return 0; -} \ No newline at end of file +} diff --git a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c index fb15e0143d3653..b601d90cfcc927 100644 --- a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c +++ b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c @@ -2,7 +2,7 @@ // expected-no-diagnostics // RUN: %clang %s -O2 -S -o - -target wasm32-unknown-unknown \ -// RUN: -msimd128 -mrelaxed-simd -Wcast-qual -Werror | FileCheck %s +// RUN: -msimd128 -mrelaxed-simd -mfp16 -Wcast-qual -Werror | FileCheck %s #include @@ -1385,3 +1385,139 @@ v128_t test_i16x8_relaxed_dot_i8x16_i7x16(v128_t a, v128_t b) { v128_t test_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t a, v128_t b, v128_t c) { return wasm_i32x4_relaxed_dot_i8x16_i7x16_add(a, b, c); } + +// CHECK-LABEL: test_f16x8_splat: +// CHECK: f16x8.splat{{$}} +v128_t test_f16x8_splat(float a) { return wasm_f16x8_splat(a); } + +// CHECK-LABEL: test_f16x8_extract_lane: +// CHECK: f16x8.extract_lane 7{{$}} +int16_t test_f16x8_extract_lane(v128_t a) { + return wasm_f16x8_extract_lane(a, 7); +} + +// CHECK-LABEL: test_f16x8_replace_lane: +// CHECK: f16x8.replace_lane 7{{$}} +v128_t test_f16x8_replace_lane(v128_t a, float b) { + return wasm_f16x8_replace_lane(a, 7, b); +} + +// CHECK-LABEL: test_f16x8_abs: +// CHECK: f16x8.abs{{$}} +v128_t test_f16x8_abs(v128_t a) { return wasm_f16x8_abs(a); } + +// CHECK-LABEL: test_f16x8_neg: +// CHECK: f16x8.neg{{$}} +v128_t test_f16x8_neg(v128_t a) { return wasm_f16x8_neg(a); } + +// CHECK-LABEL: test_f16x8_sqrt: +// CHECK: f16x8.sqrt{{$}} +v128_t test_f16x8_sqrt(v128_t a) { return wasm_f16x8_sqrt(a); } + +// CHECK-LABEL: test_f16x8_ceil: +// CHECK: f16x8.ceil{{$}} +v128_t test_f16x8_ceil(v128_t a) { return wasm_f16x8_ceil(a); } + +// CHECK-LABEL: test_f16x8_floor: +// CHECK: f16x8.floor{{$}} +v128_t test_f16x8_floor(v128_t a) { return wasm_f16x8_floor(a); } + +// CHECK-LABEL: test_f16x8_trunc: +// CHECK: f16x8.trunc{{$}} +v128_t 
test_f16x8_trunc(v128_t a) { return wasm_f16x8_trunc(a); } + +// CHECK-LABEL: test_f16x8_nearest: +// CHECK: f16x8.nearest{{$}} +v128_t test_f16x8_nearest(v128_t a) { return wasm_f16x8_nearest(a); } + +// CHECK-LABEL: test_f16x8_add: +// CHECK: f16x8.add{{$}} +v128_t test_f16x8_add(v128_t a, v128_t b) { return wasm_f16x8_add(a, b); } + +// CHECK-LABEL: test_f16x8_sub: +// CHECK: f16x8.sub{{$}} +v128_t test_f16x8_sub(v128_t a, v128_t b) { return wasm_f16x8_sub(a, b); } + +// CHECK-LABEL: test_f16x8_mul: +// CHECK: f16x8.mul{{$}} +v128_t test_f16x8_mul(v128_t a, v128_t b) { return wasm_f16x8_mul(a, b); } + +// CHECK-LABEL: test_f16x8_div: +// CHECK: f16x8.div{{$}} +v128_t test_f16x8_div(v128_t a, v128_t b) { return wasm_f16x8_div(a, b); } + +// CHECK-LABEL: test_f16x8_min: +// CHECK: f16x8.min{{$}} +v128_t test_f16x8_min(v128_t a, v128_t b) { return wasm_f16x8_min(a, b); } + +// CHECK-LABEL: test_f16x8_max: +// CHECK: f16x8.max{{$}} +v128_t test_f16x8_max(v128_t a, v128_t b) { return wasm_f16x8_max(a, b); } + +// CHECK-LABEL: test_f16x8_pmin: +// CHECK: f16x8.pmin{{$}} +v128_t test_f16x8_pmin(v128_t a, v128_t b) { return wasm_f16x8_pmin(a, b); } + +// CHECK-LABEL: test_f16x8_pmax: +// CHECK: f16x8.pmax{{$}} +v128_t test_f16x8_pmax(v128_t a, v128_t b) { return wasm_f16x8_pmax(a, b); } + +// CHECK-LABEL: test_f16x8_eq: +// CHECK: f16x8.eq{{$}} +v128_t test_f16x8_eq(v128_t a, v128_t b) { return wasm_f16x8_eq(a, b); } + +// CHECK-LABEL: test_f16x8_ne: +// CHECK: f16x8.ne{{$}} +v128_t test_f16x8_ne(v128_t a, v128_t b) { return wasm_f16x8_ne(a, b); } + +// CHECK-LABEL: test_f16x8_lt: +// CHECK: f16x8.lt{{$}} +v128_t test_f16x8_lt(v128_t a, v128_t b) { return wasm_f16x8_lt(a, b); } + +// CHECK-LABEL: test_f16x8_gt: +// CHECK: f16x8.gt{{$}} +v128_t test_f16x8_gt(v128_t a, v128_t b) { return wasm_f16x8_gt(a, b); } + +// CHECK-LABEL: test_f16x8_le: +// CHECK: f16x8.le{{$}} +v128_t test_f16x8_le(v128_t a, v128_t b) { return wasm_f16x8_le(a, b); } + +// CHECK-LABEL: test_f16x8_ge: +// CHECK: f16x8.ge{{$}} +v128_t test_f16x8_ge(v128_t a, v128_t b) { return wasm_f16x8_ge(a, b); } + +// CHECK-LABEL: test_i16x8_trunc_sat_f16x8: +// CHECK: i16x8.trunc_sat_f16x8_s{{$}} +v128_t test_i16x8_trunc_sat_f16x8(v128_t a) { + return wasm_i16x8_trunc_sat_f16x8(a); +} + +// CHECK-LABEL: test_u16x8_trunc_sat_f16x8: +// CHECK: i16x8.trunc_sat_f16x8_u{{$}} +v128_t test_u16x8_trunc_sat_f16x8(v128_t a) { + return wasm_u16x8_trunc_sat_f16x8(a); +} + +// CHECK-LABEL: test_f16x8_convert_i16x8: +// CHECK: f16x8.convert_i16x8_s{{$}} +v128_t test_f16x8_convert_i16x8(v128_t a) { + return wasm_f16x8_convert_i16x8(a); +} + +// CHECK-LABEL: test_f16x8_convert_u16x8: +// CHECK: f16x8.convert_i16x8_u{{$}} +v128_t test_f16x8_convert_u16x8(v128_t a) { + return wasm_f16x8_convert_u16x8(a); +} + +// CHECK-LABEL: test_f16x8_relaxed_madd: +// CHECK: f16x8.relaxed_madd{{$}} +v128_t test_f16x8_relaxed_madd(v128_t a, v128_t b, v128_t c) { + return wasm_f16x8_relaxed_madd(a, b, c); +} + +// CHECK-LABEL: test_f16x8_relaxed_nmadd: +// CHECK: f16x8.relaxed_nmadd{{$}} +v128_t test_f16x8_relaxed_nmadd(v128_t a, v128_t b, v128_t c) { + return wasm_f16x8_relaxed_nmadd(a, b, c); +} diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp index 65636b9956e780..ed28d8130808fa 100644 --- a/flang/lib/Evaluate/intrinsics-library.cpp +++ b/flang/lib/Evaluate/intrinsics-library.cpp @@ -255,6 +255,25 @@ struct HostRuntimeLibrary { static constexpr HostRuntimeMap map{table}; static_assert(map.Verify(), "map must be 
sorted"); }; + +// Helpers to map complex std::pow whose resolution in F2{std::pow} is +// ambiguous as of clang++ 20. +template +static std::complex StdPowF2( + const std::complex &x, const std::complex &y) { + return std::pow(x, y); +} +template +static std::complex StdPowF2A( + const HostT &x, const std::complex &y) { + return std::pow(x, y); +} +template +static std::complex StdPowF2B( + const std::complex &x, const HostT &y) { + return std::pow(x, y); +} + template struct HostRuntimeLibrary, LibraryVersion::Libm> { using F = FuncPointer, const std::complex &>; @@ -275,9 +294,9 @@ struct HostRuntimeLibrary, LibraryVersion::Libm> { FolderFactory::Create("cosh"), FolderFactory::Create("exp"), FolderFactory::Create("log"), - FolderFactory::Create("pow"), - FolderFactory::Create("pow"), - FolderFactory::Create("pow"), + FolderFactory::Create("pow"), + FolderFactory::Create("pow"), + FolderFactory::Create("pow"), FolderFactory::Create("sin"), FolderFactory::Create("sinh"), FolderFactory::Create("sqrt"), diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 90943fa92493ce..e5ccf659c3f8ed 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2349,8 +2349,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::IfOp topIfOp, currentIfOp; for (Fortran::lower::pft::Evaluation &e : eval.getNestedEvaluations()) { auto genIfOp = [&](mlir::Value cond) { - auto ifOp = - builder->create(toLocation(), cond, /*withElse=*/true); + Fortran::lower::pft::Evaluation &succ = *e.controlSuccessor; + bool hasElse = succ.isA() || + succ.isA(); + auto ifOp = builder->create(toLocation(), cond, + /*withElseRegion=*/hasElse); builder->setInsertionPointToStart(&ifOp.getThenRegion().front()); return ifOp; }; diff --git a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 index 197efc08422c6e..208f22badda28d 100644 --- a/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 +++ b/flang/test/HLFIR/assumed_shape_with_value_keyword.f90 @@ -102,7 +102,6 @@ subroutine test_optional1(x) ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box>) -> !fir.ref> ! CHECK: fir.call @_QPinternal_call7(%[[VAL_3]]) fastmath : (!fir.ref>) -> () ! CHECK: hlfir.copy_out %[[TMP_BOX]], %[[VAL_2]]#1 to %[[VAL_0]]#0 : (!fir.ref>>>, i1, !fir.box>) -> () -! CHECK: } else { ! CHECK: } ! CHECK: return ! CHECK: } @@ -122,7 +121,6 @@ subroutine test_optional2(x) ! CHECK: %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.box>) -> !fir.ref> ! CHECK: fir.call @_QPinternal_call8(%[[VAL_3]]) fastmath : (!fir.ref>) -> () ! CHECK: hlfir.copy_out %[[TMP_BOX]], %[[VAL_2]]#1 to %[[VAL_0]]#0 : (!fir.ref>>>, i1, !fir.box>) -> () -! CHECK: } else { ! CHECK: } ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90 index 211b7565bab8a3..d27a6d732ffc71 100644 --- a/flang/test/Lower/HLFIR/select-rank.f90 +++ b/flang/test/Lower/HLFIR/select-rank.f90 @@ -796,7 +796,6 @@ subroutine test_branching(x) ! CHECK: %[[VAL_10:.*]] = arith.xori %[[VAL_8]], %[[VAL_9]] : i1 ! CHECK: fir.if %[[VAL_10]] { ! CHECK: fir.call @_QPone() fastmath : () -> () -! CHECK: } else { ! CHECK: } ! CHECK: fir.call @_QPrdefault(%[[VAL_6]]#0) fastmath : (!fir.box>) -> () ! 
CHECK: cf.br ^bb7 diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 index ca36920c04eb3b..9eae3a58884faf 100644 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -104,7 +104,6 @@ subroutine ss(count) ! CHECK: fir.if %[[V_17]] { ! CHECK: %[[C_0:c[0-9a-z_]+]] = arith.constant 0 : i64 ! CHECK: fir.store %[[C_0]] to %arg0 : !fir.ref - ! CHECK: } else { ! CHECK: } ! CHECK: %[[V_18:[0-9]+]] = fir.zero_bits !fir.ptr ! CHECK: fir.store %[[V_18]] to %[[V_4]] : !fir.ref> @@ -137,7 +136,6 @@ subroutine ss(count) ! CHECK: %[[V_32]] = fir.load %arg0 : !fir.ref ! CHECK: %[[V_33]] = fir.call @_FortranAioOutputInteger64(%[[V_31]], %[[V_32]]) {{.*}}: (!fir.ref, i64) -> i1 ! CHECK: %[[V_34]] = fir.call @_FortranAioEndIoStatement(%[[V_31]]) {{.*}}: (!fir.ref) -> i32 - ! CHECK: } else { ! CHECK: } ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/OpenMP/master.f90 b/flang/test/Lower/OpenMP/master.f90 index 7db1be4f005b57..9f98ac89fb1fd9 100644 --- a/flang/test/Lower/OpenMP/master.f90 +++ b/flang/test/Lower/OpenMP/master.f90 @@ -91,7 +91,7 @@ subroutine omp_master_parallel() !CHECK: hlfir.assign %{{.*}} to %{{.*}}#0 : i32, !fir.ref beta = alpha + gama end if - !CHECK: else + !CHECK: } !CHECK: omp.terminator !$omp end master diff --git a/flang/test/Lower/OpenMP/unstructured.f90 b/flang/test/Lower/OpenMP/unstructured.f90 index 9c3527eda5bb43..bd030b918033e6 100644 --- a/flang/test/Lower/OpenMP/unstructured.f90 +++ b/flang/test/Lower/OpenMP/unstructured.f90 @@ -141,7 +141,6 @@ subroutine ss3(n) ! nested unstructured OpenMP constructs ! CHECK: @_FortranAioBeginExternalListOutput ! CHECK: %[[LOAD:.*]] = fir.load %[[OMP_LOOP_J_DECL]]#0 : !fir.ref ! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD]]) -! CHECK: } else { ! CHECK: } ! CHECK-NEXT: omp.yield ! CHECK-NEXT: } diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 index 7e4890dd00fea3..56a43abca42a76 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 @@ -118,7 +118,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 index 9a93c75f5bd1a8..775554fd3dcca1 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 @@ -108,7 +108,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 index 41fcc979cdc9d9..d16de4a867a24c 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 @@ -120,7 +120,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! 
CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 index 50b2db9463d23d..04957c7287eae4 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 @@ -110,7 +110,6 @@ ! CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]]) : (!fir.box>, i64) -> !fir.ref ! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref ! CHECK: hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref -! CHECK: } else { ! CHECK: } ! CHECK: omp.yield ! CHECK: omp.terminator diff --git a/flang/test/Lower/OpenMP/wsloop-variable.f90 b/flang/test/Lower/OpenMP/wsloop-variable.f90 index dc2acf881f482a..7bfb9274f389a3 100644 --- a/flang/test/Lower/OpenMP/wsloop-variable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-variable.f90 @@ -190,7 +190,6 @@ subroutine wsloop_variable_sub !CHECK: %[[VAL_56:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref !CHECK: %[[VAL_57:.*]] = arith.cmpi eq, %[[VAL_55]], %[[VAL_56]] : i8 !CHECK: fir.if %[[VAL_57]] { -!CHECK: } else { !CHECK: } !CHECK: omp.yield !CHECK: } diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 9c65ff9a536407..e5d2498473ecde 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -172,7 +172,7 @@ "`LWG3221 `__","Result of ``year_month``\ arithmetic with ``months``\ is ambiguous","2019-11 (Belfast)","|Complete|","8.0","" "`LWG3235 `__","``parse``\ manipulator without abbreviation is not callable","2019-11 (Belfast)","","","" "`LWG3246 `__","LWG3246: What are the constraints on the template parameter of `basic_format_arg`?","2019-11 (Belfast)","|Nothing To Do|","","" -"`LWG3253 `__","``basic_syncbuf::basic_syncbuf()``\ should not be explicit","2019-11 (Belfast)","","","" +"`LWG3253 `__","``basic_syncbuf::basic_syncbuf()``\ should not be explicit","2019-11 (Belfast)","|Complete|","20.0","" "`LWG3245 `__","Unnecessary restriction on ``'%p'``\ parse specifier","2019-11 (Belfast)","","","" "`LWG3244 `__","Constraints for ``Source``\ in |sect|\ [fs.path.req] insufficiently constrainty","2019-11 (Belfast)","","","" "`LWG3241 `__","``chrono-spec``\ grammar ambiguity in |sect|\ [time.format]","2019-11 (Belfast)","|Complete|","16.0","" diff --git a/libcxx/include/__chrono/leap_second.h b/libcxx/include/__chrono/leap_second.h index d79111ed8eecfc..be3ab4235da3ca 100644 --- a/libcxx/include/__chrono/leap_second.h +++ b/libcxx/include/__chrono/leap_second.h @@ -122,7 +122,7 @@ class leap_second { } // namespace chrono -# endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/parser_std_format_spec.h b/libcxx/include/__chrono/parser_std_format_spec.h index 785bbae198e464..6803d03ad882fd 100644 --- a/libcxx/include/__chrono/parser_std_format_spec.h +++ b/libcxx/include/__chrono/parser_std_format_spec.h @@ -409,7 +409,7 @@ class _LIBCPP_TEMPLATE_VIS __parser_chrono { } // namespace __format_spec -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/statically_widen.h b/libcxx/include/__chrono/statically_widen.h index a18c46f057a819..680483a59ac2c4 100644 --- a/libcxx/include/__chrono/statically_widen.h +++ b/libcxx/include/__chrono/statically_widen.h @@ -45,7 
+45,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __statically_widen(const char* __s # define _LIBCPP_STATICALLY_WIDEN(_CharT, __str) ::std::__statically_widen<_CharT>(__str) # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/time_zone_link.h b/libcxx/include/__chrono/time_zone_link.h index b2d365c5fd0820..7b15f6ae39278e 100644 --- a/libcxx/include/__chrono/time_zone_link.h +++ b/libcxx/include/__chrono/time_zone_link.h @@ -68,7 +68,7 @@ operator<=>(const time_zone_link& __x, const time_zone_link& __y) noexcept { } // namespace chrono -# endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index e47ec2f28844b6..8661d5d6e9b939 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -503,25 +503,24 @@ class expected : private __expected_base<_Tp, _Err> { private: template - using __can_convert = - _And< is_constructible<_Tp, _UfQual>, - is_constructible<_Err, _OtherErrQual>, - _If<_Not, bool>>::value, - _And< - _Not<_And, is_same<_Err, _OtherErr>>>, // use the copy constructor instead, see #92676 - _Not&>>, - _Not>>, - _Not&>>, - _Not>>, - _Not&, _Tp>>, - _Not&&, _Tp>>, - _Not&, _Tp>>, - _Not&&, _Tp>>>, - true_type>, - _Not, expected<_Up, _OtherErr>&>>, - _Not, expected<_Up, _OtherErr>>>, - _Not, const expected<_Up, _OtherErr>&>>, - _Not, const expected<_Up, _OtherErr>>> >; + using __can_convert = _And< + is_constructible<_Tp, _UfQual>, + is_constructible<_Err, _OtherErrQual>, + _If<_Not, bool>>::value, + _And< _Not<_And, is_same<_Err, _OtherErr>>>, // use the copy constructor instead, see #92676 + _Not&>>, + _Not>>, + _Not&>>, + _Not>>, + _Not&, _Tp>>, + _Not&&, _Tp>>, + _Not&, _Tp>>, + _Not&&, _Tp>>>, + true_type>, + _Not, expected<_Up, _OtherErr>&>>, + _Not, expected<_Up, _OtherErr>>>, + _Not, const expected<_Up, _OtherErr>&>>, + _Not, const expected<_Up, _OtherErr>>> >; template _LIBCPP_HIDE_FROM_ABI constexpr explicit expected( diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h index 8598f0a1c03957..ce9ac0c81e315a 100644 --- a/libcxx/include/__format/buffer.h +++ b/libcxx/include/__format/buffer.h @@ -646,7 +646,7 @@ class _LIBCPP_TEMPLATE_VIS __retarget_buffer { } // namespace __format -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/concepts.h b/libcxx/include/__format/concepts.h index 13380e9b91aff8..737783ed4bdeca 100644 --- a/libcxx/include/__format/concepts.h +++ b/libcxx/include/__format/concepts.h @@ -75,8 +75,8 @@ template concept __fmt_pair_like = __is_specialization_v<_Tp, pair> || (__is_specialization_v<_Tp, tuple> && tuple_size_v<_Tp> == 2); -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/container_adaptor.h b/libcxx/include/__format/container_adaptor.h index 9f49ca03bf4f50..d3be2e18956046 100644 --- a/libcxx/include/__format/container_adaptor.h +++ b/libcxx/include/__format/container_adaptor.h @@ -66,7 +66,7 @@ template _Container> struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_container_adaptor, _CharT> {}; -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git 
a/libcxx/include/__format/enable_insertable.h b/libcxx/include/__format/enable_insertable.h index 86ef94a325b192..29fe566ff06a3f 100644 --- a/libcxx/include/__format/enable_insertable.h +++ b/libcxx/include/__format/enable_insertable.h @@ -28,7 +28,7 @@ inline constexpr bool __enable_insertable = false; } // namespace __format -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h index f7be2dc61f21a3..bdf86cb6f99ccb 100644 --- a/libcxx/include/__format/escaped_output_table.h +++ b/libcxx/include/__format/escaped_output_table.h @@ -856,7 +856,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[711] = { // clang-format on } // namespace __escaped_output_table -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/extended_grapheme_cluster_table.h b/libcxx/include/__format/extended_grapheme_cluster_table.h index 48581d8a5dde3d..7dbc239f5f5cd6 100644 --- a/libcxx/include/__format/extended_grapheme_cluster_table.h +++ b/libcxx/include/__format/extended_grapheme_cluster_table.h @@ -1656,7 +1656,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[1496] = { } // namespace __extended_grapheme_custer_property_boundary -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index aa02f81dc40e2d..d1ce055874413e 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -392,7 +392,7 @@ _LIBCPP_DEPRECATED_IN_CXX26 } } -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h index 23a599e9957599..00de1c30b8733b 100644 --- a/libcxx/include/__format/format_arg_store.h +++ b/libcxx/include/__format/format_arg_store.h @@ -259,7 +259,7 @@ struct _LIBCPP_TEMPLATE_VIS __format_arg_store { _Storage __storage; }; -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_args.h b/libcxx/include/__format/format_args.h index 07923570f38930..e19b4458e41a5b 100644 --- a/libcxx/include/__format/format_args.h +++ b/libcxx/include/__format/format_args.h @@ -71,7 +71,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_args { template basic_format_args(__format_arg_store<_Context, _Args...>) -> basic_format_args<_Context>; -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index 71783c55d72540..a9be17b855837d 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -212,7 +212,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_context= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_error.h b/libcxx/include/__format/format_error.h index ed40e395d6af72..35a39ee82f3daf 100644 --- a/libcxx/include/__format/format_error.h +++ b/libcxx/include/__format/format_error.h @@ -43,7 +43,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const ch # endif } -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git 
a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index d14b49aff14957..1518ab5768d243 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -360,7 +360,7 @@ _LIBCPP_HIDE_FROM_ABI inline __runtime_format_string runtime_format(wst return __fmt; } # endif -# endif //_LIBCPP_STD_VER >= 26 +# endif // _LIBCPP_STD_VER >= 26 template struct _LIBCPP_TEMPLATE_VIS basic_format_string { @@ -671,7 +671,7 @@ formatted_size(locale __loc, wformat_string<_Args...> __fmt, _Args&&... __args) # endif // _LIBCPP_HAS_NO_LOCALIZATION -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_parse_context.h b/libcxx/include/__format/format_parse_context.h index aefcd5497f3b9b..54c23014e7dc60 100644 --- a/libcxx/include/__format/format_parse_context.h +++ b/libcxx/include/__format/format_parse_context.h @@ -98,7 +98,7 @@ using format_parse_context = basic_format_parse_context; using wformat_parse_context = basic_format_parse_context; # endif -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_string.h b/libcxx/include/__format/format_string.h index bdf3cff7f49b18..a499afee8874a5 100644 --- a/libcxx/include/__format/format_string.h +++ b/libcxx/include/__format/format_string.h @@ -153,7 +153,7 @@ __parse_arg_id(_Iterator __begin, _Iterator __end, auto& __parse_ctx) { } // namespace __format -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_to_n_result.h b/libcxx/include/__format/format_to_n_result.h index 6f30546dec081c..344299e32f0ee6 100644 --- a/libcxx/include/__format/format_to_n_result.h +++ b/libcxx/include/__format/format_to_n_result.h @@ -28,7 +28,7 @@ struct _LIBCPP_TEMPLATE_VIS format_to_n_result { }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(format_to_n_result); -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_bool.h b/libcxx/include/__format/formatter_bool.h index 63aa815efbe9b3..a43eba53c93701 100644 --- a/libcxx/include/__format/formatter_bool.h +++ b/libcxx/include/__format/formatter_bool.h @@ -72,8 +72,8 @@ struct _LIBCPP_TEMPLATE_VIS formatter { # if _LIBCPP_STD_VER >= 23 template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_char.h b/libcxx/include/__format/formatter_char.h index abfd65a4282989..a96acba08d5ca5 100644 --- a/libcxx/include/__format/formatter_char.h +++ b/libcxx/include/__format/formatter_char.h @@ -92,9 +92,9 @@ inline constexpr bool enable_nonlocking_formatter_optimization = true; template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -# endif //_LIBCPP_STD_VER >= 23 +# endif // _LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h index 334755f4e8143b..fc95dd3f22bbe7 100644 --- a/libcxx/include/__format/formatter_floating_point.h +++ b/libcxx/include/__format/formatter_floating_point.h @@ -781,8 
+781,8 @@ template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_integer.h b/libcxx/include/__format/formatter_integer.h index 2c2e7995053671..b7f46014c57231 100644 --- a/libcxx/include/__format/formatter_integer.h +++ b/libcxx/include/__format/formatter_integer.h @@ -118,8 +118,8 @@ inline constexpr bool enable_nonlocking_formatter_optimization inline constexpr bool enable_nonlocking_formatter_optimization<__uint128_t> = true; # endif -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_integral.h b/libcxx/include/__format/formatter_integral.h index eca966f8886f84..beed3ab8d93df1 100644 --- a/libcxx/include/__format/formatter_integral.h +++ b/libcxx/include/__format/formatter_integral.h @@ -436,7 +436,7 @@ __format_bool(bool __value, _FormatContext& __ctx, __format_spec::__parsed_speci } // namespace __formatter -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index 1498f64c4aeff7..34c4c87313a450 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -326,7 +326,7 @@ _LIBCPP_HIDE_FROM_ABI int __truncate(basic_string_view<_CharT>& __str, int __pre } // namespace __formatter -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_pointer.h b/libcxx/include/__format/formatter_pointer.h index e1c062cec6ed2b..6e0fa9a1b4f196 100644 --- a/libcxx/include/__format/formatter_pointer.h +++ b/libcxx/include/__format/formatter_pointer.h @@ -72,8 +72,8 @@ template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; template <> inline constexpr bool enable_nonlocking_formatter_optimization = true; -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_string.h b/libcxx/include/__format/formatter_string.h index dee2b3ad073a51..b29e97847f0ba1 100644 --- a/libcxx/include/__format/formatter_string.h +++ b/libcxx/include/__format/formatter_string.h @@ -167,8 +167,8 @@ inline constexpr bool enable_nonlocking_formatter_optimization inline constexpr bool enable_nonlocking_formatter_optimization> = true; # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS -# endif //_LIBCPP_STD_VER >= 23 -#endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/formatter_tuple.h b/libcxx/include/__format/formatter_tuple.h index 030097a8797dae..bb841ef11440dd 100644 --- a/libcxx/include/__format/formatter_tuple.h +++ b/libcxx/include/__format/formatter_tuple.h @@ -143,7 +143,7 @@ template <__fmt_char_type _CharT, formattable<_CharT>... 
_Args> struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_tuple<_CharT, tuple<_Args...>, _Args...> {}; -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/indic_conjunct_break_table.h b/libcxx/include/__format/indic_conjunct_break_table.h index 44521d27498c3c..39dd45da771fc2 100644 --- a/libcxx/include/__format/indic_conjunct_break_table.h +++ b/libcxx/include/__format/indic_conjunct_break_table.h @@ -343,7 +343,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[201] = { } // namespace __indic_conjunct_break -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h index 150bdde89f3b39..28891e5d2876cd 100644 --- a/libcxx/include/__format/parser_std_format_spec.h +++ b/libcxx/include/__format/parser_std_format_spec.h @@ -1163,7 +1163,7 @@ __estimate_column_width(basic_string_view<_CharT> __str, size_t __maximum, __col } // namespace __format_spec -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h index b35223ae933291..fb21b0f8beb3a1 100644 --- a/libcxx/include/__format/range_default_formatter.h +++ b/libcxx/include/__format/range_default_formatter.h @@ -207,7 +207,7 @@ template requires(format_kind<_Rp> != range_format::disabled && formattable, _CharT>) struct _LIBCPP_TEMPLATE_VIS formatter<_Rp, _CharT> : __range_default_formatter, _Rp, _CharT> {}; -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/range_formatter.h b/libcxx/include/__format/range_formatter.h index 69156307434937..def55c86ce51cd 100644 --- a/libcxx/include/__format/range_formatter.h +++ b/libcxx/include/__format/range_formatter.h @@ -257,7 +257,7 @@ struct _LIBCPP_TEMPLATE_VIS range_formatter { basic_string_view<_CharT> __closing_bracket_ = _LIBCPP_STATICALLY_WIDEN(_CharT, "]"); }; -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/unicode.h b/libcxx/include/__format/unicode.h index de7d0fea1df56a..ce6d55ae346a3f 100644 --- a/libcxx/include/__format/unicode.h +++ b/libcxx/include/__format/unicode.h @@ -595,7 +595,7 @@ class __code_point_view { } // namespace __unicode -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/width_estimation_table.h b/libcxx/include/__format/width_estimation_table.h index 11f61dea18d696..23a08746b91031 100644 --- a/libcxx/include/__format/width_estimation_table.h +++ b/libcxx/include/__format/width_estimation_table.h @@ -263,7 +263,7 @@ inline constexpr uint32_t __table_upper_bound = 0x0003fffd; } // namespace __width_estimation_table -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__fwd/format.h b/libcxx/include/__fwd/format.h index b30c220f8a0435..815e3e1922c62d 100644 --- a/libcxx/include/__fwd/format.h +++ b/libcxx/include/__fwd/format.h @@ -31,7 +31,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_context; template struct _LIBCPP_TEMPLATE_VIS formatter; -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__memory/allocator.h 
b/libcxx/include/__memory/allocator.h index 0dbdc41d3c3d14..6a9eed926e05f4 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -47,7 +47,7 @@ class _LIBCPP_TEMPLATE_VIS allocator { typedef allocator<_Up> other; }; }; -#endif // _LIBCPP_STD_VER <= 17 +#endif // _LIBCPP_STD_VER <= 17 // This class provides a non-trivial default constructor to the class that derives from it // if the condition is satisfied. diff --git a/libcxx/include/__type_traits/is_member_pointer.h b/libcxx/include/__type_traits/is_member_pointer.h index cc125e318cf919..3e2753ac4228c2 100644 --- a/libcxx/include/__type_traits/is_member_pointer.h +++ b/libcxx/include/__type_traits/is_member_pointer.h @@ -27,7 +27,7 @@ struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer : _BoolConstant<__is_member template struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer : _BoolConstant<__is_member_function_pointer(_Tp)> {}; -# if _LIBCPP_STD_VER >= 17 +#if _LIBCPP_STD_VER >= 17 template inline constexpr bool is_member_pointer_v = __is_member_pointer(_Tp); @@ -36,7 +36,7 @@ inline constexpr bool is_member_object_pointer_v = __is_member_object_pointer(_T template inline constexpr bool is_member_function_pointer_v = __is_member_function_pointer(_Tp); -# endif +#endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__type_traits/is_void.h b/libcxx/include/__type_traits/is_void.h index 46316b0d3a534e..562faae9fba2cd 100644 --- a/libcxx/include/__type_traits/is_void.h +++ b/libcxx/include/__type_traits/is_void.h @@ -21,10 +21,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct _LIBCPP_TEMPLATE_VIS is_void : _BoolConstant<__is_same(__remove_cv(_Tp), void)> {}; -# if _LIBCPP_STD_VER >= 17 +#if _LIBCPP_STD_VER >= 17 template inline constexpr bool is_void_v = __is_same(__remove_cv(_Tp), void); -# endif +#endif _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/array b/libcxx/include/array index 4db0cb7bd7e3b5..588664ace0162a 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -427,8 +427,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const array<_Tp, _Size>& __x, const template _LIBCPP_HIDE_FROM_ABI constexpr __synth_three_way_result<_Tp> operator<=>(const array<_Tp, _Size>& __x, const array<_Tp, _Size>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/cstddef b/libcxx/include/cstddef index 1a4049e4d34f2d..592f6261a6de3f 100644 --- a/libcxx/include/cstddef +++ b/libcxx/include/cstddef @@ -66,8 +66,8 @@ using ::max_align_t _LIBCPP_USING_IF_EXISTS; _LIBCPP_END_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 -namespace std // purposefully not versioned -{ +namespace std { // purposefully not versioned + enum class byte : unsigned char {}; _LIBCPP_HIDE_FROM_ABI inline constexpr byte operator|(byte __lhs, byte __rhs) noexcept { @@ -127,7 +127,6 @@ template ::value, int> = 0> } } // namespace std - -#endif +#endif // _LIBCPP_STD_VER >= 17 #endif // _LIBCPP_CSTDDEF diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index b8e3d05588f96e..6c0dc5f96a5d5e 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -1517,8 +1517,7 @@ operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc> template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> operator<=>(const 
forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // #if _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/iosfwd b/libcxx/include/iosfwd index 051c73995e98b4..eeafcc37c598ef 100644 --- a/libcxx/include/iosfwd +++ b/libcxx/include/iosfwd @@ -170,8 +170,8 @@ class __save_flags { _CharT __fill_; public: - __save_flags(const __save_flags&) = delete; - __save_flags& operator=(const __save_flags&) = delete; + __save_flags(const __save_flags&) = delete; + __save_flags& operator=(const __save_flags&) = delete; _LIBCPP_HIDE_FROM_ABI explicit __save_flags(__stream_type& __stream) : __stream_(__stream), __fmtflags_(__stream.flags()), __fill_(__stream.fill()) {} diff --git a/libcxx/include/list b/libcxx/include/list index 929c84de7be449..76b1d9241b41ca 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -466,7 +466,7 @@ public: template class __list_imp { public: - __list_imp(const __list_imp&) = delete; + __list_imp(const __list_imp&) = delete; __list_imp& operator=(const __list_imp&) = delete; typedef _Alloc allocator_type; @@ -1679,8 +1679,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const list<_Tp, _Alloc>& __x, const template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> operator<=>(const list<_Tp, _Allocator>& __x, const list<_Tp, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 13d0dce34d97e3..f193b5d95f49f5 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -245,8 +245,15 @@ module std_stdexcept [system] { header "stdexcept" export * } -module std_stop_token { +module std_stop_token [system] { header "stop_token" + private header "__stop_token/atomic_unique_lock.h" + private header "__stop_token/intrusive_list_view.h" + private header "__stop_token/intrusive_shared_ptr.h" + private header "__stop_token/stop_callback.h" + private header "__stop_token/stop_source.h" + private header "__stop_token/stop_state.h" + private header "__stop_token/stop_token.h" export * } module std_streambuf [system] { @@ -1592,41 +1599,25 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } -module std_private_pstl_backend [system] { +module std_private_pstl [system] { header "__pstl/backend.h" - export * -} -module std_private_pstl_backend_fwd [system] { header "__pstl/backend_fwd.h" - export * -} -module std_private_pstl_backends_default [system] { header "__pstl/backends/default.h" - export * -} -module std_private_pstl_backends_libdispatch [system] { header "__pstl/backends/libdispatch.h" - export * -} -module std_private_pstl_backends_serial [system] { header "__pstl/backends/serial.h" - export * -} -module std_private_pstl_backends_std_thread [system] { header "__pstl/backends/std_thread.h" - export * + header 
"__pstl/cpu_algos/any_of.h" + header "__pstl/cpu_algos/cpu_traits.h" + header "__pstl/cpu_algos/fill.h" + header "__pstl/cpu_algos/find_if.h" + header "__pstl/cpu_algos/for_each.h" + header "__pstl/cpu_algos/merge.h" + header "__pstl/cpu_algos/stable_sort.h" + header "__pstl/cpu_algos/transform.h" + header "__pstl/cpu_algos/transform_reduce.h" + header "__pstl/dispatch.h" + header "__pstl/handle_exception.h" } -module std_private_pstl_cpu_algos_any_of [system] { header "__pstl/cpu_algos/any_of.h" } -module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } -module std_private_pstl_cpu_algos_fill [system] { header "__pstl/cpu_algos/fill.h" } -module std_private_pstl_cpu_algos_find_if [system] { header "__pstl/cpu_algos/find_if.h" } -module std_private_pstl_cpu_algos_for_each [system] { header "__pstl/cpu_algos/for_each.h" } -module std_private_pstl_cpu_algos_merge [system] { header "__pstl/cpu_algos/merge.h" } -module std_private_pstl_cpu_algos_stable_sort [system] { header "__pstl/cpu_algos/stable_sort.h" } -module std_private_pstl_cpu_algos_transform [system] { header "__pstl/cpu_algos/transform.h" } -module std_private_pstl_cpu_algos_transform_reduce [system] { header "__pstl/cpu_algos/transform_reduce.h" } -module std_private_pstl_dispatch [system] { header "__pstl/dispatch.h" } -module std_private_pstl_handle_exception [system] { header "__pstl/handle_exception.h" } module std_private_queue_fwd [system] { header "__fwd/queue.h" } @@ -1781,23 +1772,6 @@ module std_private_span_span_fwd [system] { header "__fwd/span.h" } module std_private_stack_fwd [system] { header "__fwd/stack.h" } -module std_private_stop_token_atomic_unique_lock [system] { header "__stop_token/atomic_unique_lock.h" } -module std_private_stop_token_intrusive_list_view [system] { header "__stop_token/intrusive_list_view.h" } -module std_private_stop_token_intrusive_shared_ptr [system] { header "__stop_token/intrusive_shared_ptr.h" } -module std_private_stop_token_stop_callback [system] { header "__stop_token/stop_callback.h" } -module std_private_stop_token_stop_source [system] { - header "__stop_token/stop_source.h" - export * -} -module std_private_stop_token_stop_state [system] { - header "__stop_token/stop_state.h" - export * -} -module std_private_stop_token_stop_token [system] { - header "__stop_token/stop_token.h" - export * -} - module std_private_string_char_traits [system] { header "__string/char_traits.h" export * diff --git a/libcxx/include/set b/libcxx/include/set index 94533583798699..7e9661a0149ab9 100644 --- a/libcxx/include/set +++ b/libcxx/include/set @@ -1452,8 +1452,7 @@ operator<=(const multiset<_Key, _Compare, _Allocator>& __x, const multiset<_Key, template _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Key> operator<=>(const multiset<_Key, _Allocator>& __x, const multiset<_Key, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), __synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/include/string b/libcxx/include/string index 45be4050304125..15c7a2f6b988b4 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2014,11 +2014,11 @@ private: (void)__old_mid; (void)__new_mid; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - #if defined(__APPLE__) +# if defined(__APPLE__) // TODO: remove after addressing issue #96099 
(https://github.com/llvm/llvm-project/issues/96099) - if(!__is_long()) + if (!__is_long()) return; - #endif +# endif std::__annotate_contiguous_container<_Allocator>(data(), data() + capacity() + 1, __old_mid, __new_mid); #endif } diff --git a/libcxx/include/syncstream b/libcxx/include/syncstream index e6f35b6f428eda..fea4c66b8e118f 100644 --- a/libcxx/include/syncstream +++ b/libcxx/include/syncstream @@ -46,7 +46,9 @@ namespace std { using streambuf_type = basic_streambuf; // [syncstream.syncbuf.cons], construction and destruction - explicit basic_syncbuf(streambuf_type* obuf = nullptr) + basic_syncbuf() + : basic_syncbuf(nullptr) {} + explicit basic_syncbuf(streambuf_type* obuf) : basic_syncbuf(obuf, Allocator()) {} basic_syncbuf(streambuf_type*, const Allocator&); basic_syncbuf(basic_syncbuf&&); @@ -253,8 +255,9 @@ public: // [syncstream.syncbuf.cons], construction and destruction - _LIBCPP_HIDE_FROM_ABI explicit basic_syncbuf(streambuf_type* __obuf = nullptr) - : basic_syncbuf(__obuf, _Allocator()) {} + _LIBCPP_HIDE_FROM_ABI basic_syncbuf() : basic_syncbuf(nullptr) {} + + _LIBCPP_HIDE_FROM_ABI explicit basic_syncbuf(streambuf_type* __obuf) : basic_syncbuf(__obuf, _Allocator()) {} _LIBCPP_HIDE_FROM_ABI basic_syncbuf(streambuf_type* __obuf, _Allocator const& __alloc) : __wrapped_(__obuf), __str_(__alloc) { diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 081b90c7bbec54..5161c2aa97c2ba 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -833,8 +833,8 @@ public: // [tuple.assign] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& - operator=(_If<_And...>::value, tuple, __nat> const& __tuple) - noexcept(_And...>::value) { + operator=(_If<_And...>::value, tuple, __nat> const& __tuple) noexcept( + _And...>::value) { std::__memberwise_copy_assign(*this, __tuple, typename __make_tuple_indices::type()); return *this; } @@ -857,8 +857,8 @@ public: # endif // _LIBCPP_STD_VER >= 23 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& - operator=(_If<_And...>::value, tuple, __nat>&& __tuple) - noexcept(_And...>::value) { + operator=(_If<_And...>::value, tuple, __nat>&& __tuple) noexcept( + _And...>::value) { std::__memberwise_forward_assign( *this, std::move(__tuple), __tuple_types<_Tp...>(), typename __make_tuple_indices::type()); return *this; @@ -868,8 +868,8 @@ public: class... _Up, __enable_if_t< _And< _BoolConstant, is_assignable<_Tp&, _Up const&>... >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(tuple<_Up...> const& __tuple) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(tuple<_Up...> const& __tuple) noexcept(_And...>::value) { std::__memberwise_copy_assign(*this, __tuple, typename __make_tuple_indices::type()); return *this; } @@ -877,8 +877,8 @@ public: template , is_assignable<_Tp&, _Up>... 
>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(tuple<_Up...>&& __tuple) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(tuple<_Up...>&& __tuple) noexcept(_And...>::value) { std::__memberwise_forward_assign( *this, std::move(__tuple), __tuple_types<_Up...>(), typename __make_tuple_indices::type()); return *this; @@ -942,16 +942,16 @@ public: template const&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(pair<_Up1, _Up2> const& __pair) - noexcept(_NothrowAssignFromPair const&>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(pair<_Up1, _Up2> const& __pair) noexcept(_NothrowAssignFromPair const&>::value) { std::get<0>(*this) = __pair.first; std::get<1>(*this) = __pair.second; return *this; } template &&>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(pair<_Up1, _Up2>&& __pair) - noexcept(_NothrowAssignFromPair&&>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(pair<_Up1, _Up2>&& __pair) noexcept(_NothrowAssignFromPair&&>::value) { std::get<0>(*this) = std::forward<_Up1>(__pair.first); std::get<1>(*this) = std::forward<_Up2>(__pair.second); return *this; @@ -962,8 +962,8 @@ public: class _Up, size_t _Np, __enable_if_t< _And< _BoolConstant<_Np == sizeof...(_Tp)>, is_assignable<_Tp&, _Up const&>... >::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(array<_Up, _Np> const& __array) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(array<_Up, _Np> const& __array) noexcept(_And...>::value) { std::__memberwise_copy_assign(*this, __array, typename __make_tuple_indices::type()); return *this; } @@ -973,8 +973,8 @@ public: size_t _Np, class = void, __enable_if_t< _And< _BoolConstant<_Np == sizeof...(_Tp)>, is_assignable<_Tp&, _Up>... 
>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(array<_Up, _Np>&& __array) - noexcept(_And...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& + operator=(array<_Up, _Np>&& __array) noexcept(_And...>::value) { std::__memberwise_forward_assign( *this, std::move(__array), @@ -984,8 +984,8 @@ public: } // [tuple.swap] - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(tuple& __t) - noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void + swap(tuple& __t) noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { __base_.swap(__t.__base_); } @@ -1043,8 +1043,8 @@ tuple(allocator_arg_t, _Alloc, tuple<_Tp...>) -> tuple<_Tp...>; # endif template ...>::value, int> = 0> -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(tuple<_Tp...>& __t, tuple<_Tp...>& __u) - noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void +swap(tuple<_Tp...>& __t, tuple<_Tp...>& __u) noexcept(__all<__is_nothrow_swappable_v<_Tp>...>::value) { __t.swap(__u); } diff --git a/libcxx/include/vector b/libcxx/include/vector index 81aab9407714cc..0f852e7f36c29c 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -2938,8 +2938,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const vector<_Tp, _Allocator>& __x, template _LIBCPP_HIDE_FROM_ABI constexpr __synth_three_way_result<_Tp> operator<=>(const vector<_Tp, _Allocator>& __x, const vector<_Tp, _Allocator>& __y) { - return std::lexicographical_compare_three_way( - __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } #endif // _LIBCPP_STD_VER <= 17 diff --git a/libcxx/modules/std/format.inc b/libcxx/modules/std/format.inc index 09aa03ad73e388..8daf0de85cc412 100644 --- a/libcxx/modules/std/format.inc +++ b/libcxx/modules/std/format.inc @@ -30,7 +30,7 @@ export namespace std { #endif #if _LIBCPP_STD_VER >= 26 using std::runtime_format; -#endif //_LIBCPP_STD_VER >= 26 +#endif // _LIBCPP_STD_VER >= 26 // [format.functions], formatting functions using std::format; diff --git a/libcxx/src/include/refstring.h b/libcxx/src/include/refstring.h index 78452249f4fecf..3e0ec7a97c7bec 100644 --- a/libcxx/src/include/refstring.h +++ b/libcxx/src/include/refstring.h @@ -124,4 +124,4 @@ inline bool __libcpp_refstring::__uses_refcount() const { _LIBCPP_END_NAMESPACE_STD -#endif //_LIBCPP_REFSTRING_H +#endif // _LIBCPP_REFSTRING_H diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp index 2a9b828f4389ce..44d51921ac74ad 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/atomic_unique_lock.pass.cpp @@ -5,12 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// UNSUPPORTED: no-threads - -// XFAIL: availability-synchronization_library-missing +// UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11, c++14, c++17 +// XFAIL: availability-synchronization_library-missing +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/atomic_unique_lock.h> #include diff --git 
a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp index 85cd9786258955..d8cd2fb68e132e 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_list_view.pass.cpp @@ -8,6 +8,7 @@ // // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/intrusive_list_view.h> #include diff --git a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp index 47440015f2c50c..99d4226662a0b7 100644 --- a/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.stoptoken/intrusive_shared_ptr.pass.cpp @@ -8,6 +8,7 @@ // // UNSUPPORTED: c++03, c++11, c++14, c++17 +// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header #include <__stop_token/intrusive_shared_ptr.h> #include diff --git a/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp b/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp index aa0eb2d41e0f01..beebc36c76758e 100644 --- a/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp +++ b/libcxx/test/std/input.output/syncstream/syncbuf/syncstream.syncbuf.cons/cons.default.pass.cpp @@ -25,8 +25,15 @@ #include "constexpr_char_traits.h" #include "test_allocator.h" +template +std::basic_syncbuf lwg3253_default_constructor_is_not_explicit() { + return {}; +} + template void test() { + lwg3253_default_constructor_is_not_explicit(); + { using Buf = std::basic_syncbuf; static_assert(std::default_initializable); diff --git a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp index c9c2ba20021491..cd032d48648953 100644 --- a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp +++ b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp @@ -8,21 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// The following platforms have sizeof(long double) == sizeof(double), so this test doesn't apply to them. -// This test does apply to aarch64 where Arm's AAPCS64 is followed. There they are different sizes. -// XFAIL: target={{arm64|arm64e|armv(7|8)(l|m)?|powerpc|powerpc64}}-{{.+}} - -// MSVC configurations have long double equal to regular double on all -// architectures. -// XFAIL: target={{.+}}-pc-windows-msvc - -// ARM/AArch64 MinGW also has got long double equal to regular double, just -// like MSVC (thus match both MinGW and MSVC here, for those architectures). -// XFAIL: target={{aarch64|armv7}}-{{.*}}-windows-{{.+}} - -// Android's 32-bit x86 target has long double equal to regular double. 
-// XFAIL: target=i686-{{.+}}-android{{.*}} - // // template constexpr strong_ordering strong_order(const T& a, const T& b); @@ -37,5 +22,9 @@ void f() { long double ld = 3.14; +#ifdef TEST_LONG_DOUBLE_IS_DOUBLE + (void)ld; // expected-no-diagnostics +#else (void)std::strong_order(ld, ld); // expected-error@*:* {{std::strong_order is unimplemented for this floating-point type}} +#endif } diff --git a/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp b/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp index f73877416a7170..044589298439c1 100644 --- a/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp +++ b/libcxx/test/std/numerics/bit/bit.cast/bit_cast.pass.cpp @@ -229,7 +229,7 @@ bool tests() { test_roundtrip_through_nested_T(i); test_roundtrip_through_buffer(i); -#if __SIZEOF_LONG_DOUBLE__ == __SIZEOF_DOUBLE__ +#ifdef TEST_LONG_DOUBLE_IS_DOUBLE test_roundtrip_through(i); #endif #if defined(__SIZEOF_INT128__) && __SIZEOF_LONG_DOUBLE__ == __SIZEOF_INT128__ && \ diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 6f7ec3aa0c1f9f..5d4c1a65cfafb2 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -511,4 +511,8 @@ inline Tp const& DoNotOptimize(Tp const& value) { # define TEST_CONSTEXPR_OPERATOR_NEW #endif +#if __SIZEOF_LONG_DOUBLE__ == __SIZEOF_DOUBLE__ +# define TEST_LONG_DOUBLE_IS_DOUBLE +#endif + #endif // SUPPORT_TEST_MACROS_HPP diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py index 9dcecaa5575cdd..41524e8fe7186c 100755 --- a/libcxx/utils/generate_escaped_output_table.py +++ b/libcxx/utils/generate_escaped_output_table.py @@ -235,7 +235,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: // clang-format on }} // namespace __escaped_output_table -#endif //_LIBCPP_STD_VER >= 23 +#endif // _LIBCPP_STD_VER >= 23 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/utils/generate_extended_grapheme_cluster_table.py b/libcxx/utils/generate_extended_grapheme_cluster_table.py index 76d1e78e9239c6..558b606186130f 100755 --- a/libcxx/utils/generate_extended_grapheme_cluster_table.py +++ b/libcxx/utils/generate_extended_grapheme_cluster_table.py @@ -230,7 +230,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: {content} }} // namespace __extended_grapheme_custer_property_boundary -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/utils/generate_indic_conjunct_break_table.py b/libcxx/utils/generate_indic_conjunct_break_table.py index 762dfa73b51f7b..e41f6e9be233d7 100755 --- a/libcxx/utils/generate_indic_conjunct_break_table.py +++ b/libcxx/utils/generate_indic_conjunct_break_table.py @@ -223,7 +223,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: {content} }} // namespace __indic_conjunct_break -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/utils/generate_width_estimation_table.py b/libcxx/utils/generate_width_estimation_table.py index f4cce1071d1f15..d8c036f34e8353 100644 --- a/libcxx/utils/generate_width_estimation_table.py +++ b/libcxx/utils/generate_width_estimation_table.py @@ -261,7 +261,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: {content} }} // namespace __width_estimation_table -#endif //_LIBCPP_STD_VER >= 20 +#endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git 
a/lld/test/ELF/avr-reloc.s b/lld/test/ELF/avr-reloc.s index ec088eaa149d01..41c32580f63a1c 100644 --- a/lld/test/ELF/avr-reloc.s +++ b/lld/test/ELF/avr-reloc.s @@ -76,32 +76,6 @@ adiw r24, b ; R_AVR_6_ADIW in r20, b ; R_AVR_PORT6 sbic b, 1 ; R_AVR_PORT5 -.section .PCREL,"ax",@progbits -; CHECK-LABEL: section .PCREL -; CHECK: rjmp .+30 -; CHECK-NEXT: rjmp .-36 -; CHECK-NEXT: breq .+26 -; CHECK-NEXT: breq .-40 -; CHECK-NEXT: rjmp .-4096 -; CHECK-NEXT: rjmp .+4094 -; CHECK-NEXT: rjmp .+4094 -; CHECK-NEXT: rjmp .-4096 -; CHECK-NEXT: breq .-128 -; CHECK-NEXT: breq .+126 -; HEX-LABEL: section .PCREL: -; HEX-NEXT: 0fc0eecf 69f061f3 -foo: -rjmp foo + 32 ; R_AVR_13_PCREL -rjmp foo - 32 ; R_AVR_13_PCREL -breq foo + 32 ; R_AVR_7_PCREL -breq foo - 32 ; R_AVR_7_PCREL -rjmp 1f - 4096 $ 1: ; R_AVR_13_PCREL -rjmp 1f + 4094 $ 1: ; R_AVR_13_PCREL -rjmp 1f - 4098 $ 1: ; R_AVR_13_PCREL (overflow) -rjmp 1f + 4096 $ 1: ; R_AVR_13_PCREL (overflow) -breq 1f - 128 $ 1: ; R_AVR_7_PCREL -breq 1f + 126 $ 1: ; R_AVR_7_PCREL - .section .LDSSTS,"ax",@progbits ; CHECK-LABEL: section .LDSSTS: ; CHECK: lds r20, 0x1e diff --git a/lldb/bindings/interface/SBErrorDocstrings.i b/lldb/bindings/interface/SBErrorDocstrings.i index b64c3d64c6c77b..c272ffb7605ffb 100644 --- a/lldb/bindings/interface/SBErrorDocstrings.i +++ b/lldb/bindings/interface/SBErrorDocstrings.i @@ -10,8 +10,10 @@ For example (from test/python_api/hello_world/TestHelloWorld.py), :: # Spawn a new process and don't display the stdout if not in TraceOn() mode. import subprocess - popen = subprocess.Popen([self.exe, 'abc', 'xyz'], - stdout = open(os.devnull, 'w') if not self.TraceOn() else None) + popen = subprocess.Popen( + [self.exe, 'abc', 'xyz'], + stdout=subprocess.DEVNULL if not self.TraceOn() else None, + ) listener = lldb.SBListener('my.attach.listener') error = lldb.SBError() diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index 5239ac6f4055f5..172824dc78a6bc 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -9,6 +9,7 @@ #ifndef LLDB_CORE_SOURCEMANAGER_H #define LLDB_CORE_SOURCEMANAGER_H +#include "lldb/Utility/Checksum.h" #include "lldb/Utility/FileSpec.h" #include "lldb/lldb-defines.h" #include "lldb/lldb-forward.h" @@ -37,8 +38,8 @@ class SourceManager { const SourceManager::File &rhs); public: - File(const FileSpec &file_spec, lldb::TargetSP target_sp); - File(const FileSpec &file_spec, lldb::DebuggerSP debugger_sp); + File(lldb::SupportFileSP support_file_sp, lldb::TargetSP target_sp); + File(lldb::SupportFileSP support_file_sp, lldb::DebuggerSP debugger_sp); bool ModificationTimeIsStale() const; bool PathRemappingIsStale() const; @@ -56,7 +57,10 @@ class SourceManager { bool LineIsValid(uint32_t line); - const FileSpec &GetFileSpec() { return m_file_spec; } + lldb::SupportFileSP GetSupportFile() const { + assert(m_support_file_sp && "SupportFileSP must always be valid"); + return m_support_file_sp; + } uint32_t GetSourceMapModificationID() const { return m_source_map_mod_id; } @@ -68,17 +72,20 @@ class SourceManager { llvm::sys::TimePoint<> GetTimestamp() const { return m_mod_time; } + const Checksum &GetChecksum() const { return m_checksum; } + protected: /// Set file and update modification time. 
- void SetFileSpec(FileSpec file_spec); + void SetSupportFile(lldb::SupportFileSP support_file_sp); bool CalculateLineOffsets(uint32_t line = UINT32_MAX); - FileSpec m_file_spec_orig; // The original file spec that was used (can be - // different from m_file_spec) - FileSpec m_file_spec; // The actually file spec being used (if the target - // has source mappings, this might be different from - // m_file_spec_orig) + /// The support file. If the target has source mappings, this might be + /// different from the original support file passed to the constructor. + lldb::SupportFileSP m_support_file_sp; + + /// Keep track of the on-disk checksum. + Checksum m_checksum; // Keep the modification time that this file data is valid for llvm::sys::TimePoint<> m_mod_time; @@ -93,7 +100,8 @@ class SourceManager { lldb::TargetWP m_target_wp; private: - void CommonInitializer(const FileSpec &file_spec, lldb::TargetSP target_sp); + void CommonInitializer(lldb::SupportFileSP support_file_sp, + lldb::TargetSP target_sp); }; typedef std::shared_ptr FileSP; @@ -139,14 +147,13 @@ class SourceManager { ~SourceManager(); - FileSP GetLastFile() { return GetFile(m_last_file_spec); } + FileSP GetLastFile() { return GetFile(m_last_support_file_sp); } - size_t - DisplaySourceLinesWithLineNumbers(const FileSpec &file, uint32_t line, - uint32_t column, uint32_t context_before, - uint32_t context_after, - const char *current_line_cstr, Stream *s, - const SymbolContextList *bp_locs = nullptr); + size_t DisplaySourceLinesWithLineNumbers( + lldb::SupportFileSP support_file_sp, uint32_t line, uint32_t column, + uint32_t context_before, uint32_t context_after, + const char *current_line_cstr, Stream *s, + const SymbolContextList *bp_locs = nullptr); // This variant uses the last file we visited. 
size_t DisplaySourceLinesWithLineNumbersUsingLastFile( @@ -157,22 +164,31 @@ class SourceManager { size_t DisplayMoreWithLineNumbers(Stream *s, uint32_t count, bool reverse, const SymbolContextList *bp_locs = nullptr); - bool SetDefaultFileAndLine(const FileSpec &file_spec, uint32_t line); + bool SetDefaultFileAndLine(lldb::SupportFileSP support_file_sp, + uint32_t line); + + struct SupportFileAndLine { + lldb::SupportFileSP support_file_sp; + uint32_t line; + SupportFileAndLine(lldb::SupportFileSP support_file_sp, uint32_t line) + : support_file_sp(support_file_sp), line(line) {} + }; - bool GetDefaultFileAndLine(FileSpec &file_spec, uint32_t &line); + std::optional<SupportFileAndLine> GetDefaultFileAndLine(); bool DefaultFileAndLineSet() { - return (GetFile(m_last_file_spec).get() != nullptr); + return (GetFile(m_last_support_file_sp).get() != nullptr); } - void FindLinesMatchingRegex(FileSpec &file_spec, RegularExpression &regex, - uint32_t start_line, uint32_t end_line, + void FindLinesMatchingRegex(lldb::SupportFileSP support_file_sp, + RegularExpression &regex, uint32_t start_line, + uint32_t end_line, std::vector<uint32_t> &match_lines); - FileSP GetFile(const FileSpec &file_spec); + FileSP GetFile(lldb::SupportFileSP support_file_sp); protected: - FileSpec m_last_file_spec; + lldb::SupportFileSP m_last_support_file_sp; uint32_t m_last_line; uint32_t m_last_count; bool m_default_set; diff --git a/lldb/include/lldb/Utility/SupportFile.h b/lldb/include/lldb/Utility/SupportFile.h index 334a0aaac2c27e..6a091bb84ada35 100644 --- a/lldb/include/lldb/Utility/SupportFile.h +++ b/lldb/include/lldb/Utility/SupportFile.h @@ -14,10 +14,10 @@ namespace lldb_private { -/// Wraps either a FileSpec that represents a local file or a source -/// file whose contents is known (for example because it can be -/// reconstructed from debug info), but that hasn't been written to a -/// file yet. This also stores an optional checksum of the on-disk content. +/// Wraps a FileSpec and an optional Checksum. The FileSpec represents either a +/// path to a file or a source file whose contents are known (for example +/// because they can be reconstructed from debug info), but that hasn't been +/// written to a file yet. class SupportFile { public: SupportFile() : m_file_spec(), m_checksum() {} diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 0e8ca159efd55d..834f01aaa61e6b 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -467,9 +467,8 @@ def should_skip_simulator_test(): if lldbplatformutil.getHostPlatform() not in ["darwin", "macosx"]: return "simulator tests are run only on darwin hosts." try: - DEVNULL = open(os.devnull, "w") output = subprocess.check_output( - ["xcodebuild", "-showsdks"], stderr=DEVNULL + ["xcodebuild", "-showsdks"], stderr=subprocess.DEVNULL ).decode("utf-8") if re.search("%ssimulator" % platform, output): return None @@ -1094,9 +1093,8 @@ def skipUnlessFeature(feature): def is_feature_enabled(): if platform.system() == "Darwin": try: - DEVNULL = open(os.devnull, "w") output = subprocess.check_output( - ["/usr/sbin/sysctl", feature], stderr=DEVNULL + ["/usr/sbin/sysctl", feature], stderr=subprocess.DEVNULL ).decode("utf-8") # If 'feature: 1' was output, then this feature is available and # the test should not be skipped.
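The lldb hunks above and below all follow one pattern: APIs that used to pass a bare FileSpec (just a path) now pass lldb::SupportFileSP, a std::shared_ptr<SupportFile> bundling the path with an optional checksum of the on-disk content. The sketch below illustrates the shape of that pattern in isolation; Path, Checksum, and the member functions here are simplified stand-ins invented for the example, not the real lldb classes.

#include <iostream>
#include <memory>
#include <optional>
#include <string>
#include <utility>

// Hypothetical stand-ins for lldb_private::FileSpec and lldb_private::Checksum.
using Path = std::string;
using Checksum = std::optional<std::string>; // e.g. an MD5 digest, if known

// Minimal analogue of lldb_private::SupportFile: a path plus an optional
// checksum that travels with it through every API that handles the file.
class SupportFile {
public:
  explicit SupportFile(Path path, Checksum checksum = std::nullopt)
      : m_path(std::move(path)), m_checksum(std::move(checksum)) {}

  const Path &GetSpecOnly() const { return m_path; }
  const Checksum &GetChecksum() const { return m_checksum; }

private:
  Path m_path;
  Checksum m_checksum;
};

// Analogue of lldb::SupportFileSP: shared ownership lets the source manager,
// its cached File objects, and line entries all point at the same instance.
using SupportFileSP = std::shared_ptr<SupportFile>;

// Before the patch an API like this took `const Path &`; afterwards it takes
// the shared wrapper, so the checksum is never dropped at a hand-off.
void DisplayFile(SupportFileSP support_file_sp) {
  std::cout << "file: " << support_file_sp->GetSpecOnly() << '\n';
  if (support_file_sp->GetChecksum())
    std::cout << "checksum: " << *support_file_sp->GetChecksum() << '\n';
}

int main() {
  auto file = std::make_shared<SupportFile>("main.cpp", "d41d8cd98f00b204");
  DisplayFile(file);
}

Shared ownership is the point of the wrapper: several components can reference the same source file, and passing a shared pointer lets the checksum accompany the path instead of being recomputed or lost at each API boundary.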
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index b57c3bdd87c83c..e0da7cbd1ddd6e 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -31,7 +31,6 @@ import abc from functools import wraps import gc -import glob import io import json import os.path @@ -416,7 +415,7 @@ def launch(self, executable, args, extra_env): self._proc = Popen( [executable] + args, - stdout=open(os.devnull) if not self._trace_on else None, + stdout=DEVNULL if not self._trace_on else None, stdin=PIPE, env=env, ) diff --git a/lldb/source/API/SBSourceManager.cpp b/lldb/source/API/SBSourceManager.cpp index e46f990698d826..4b96f1222bc88f 100644 --- a/lldb/source/API/SBSourceManager.cpp +++ b/lldb/source/API/SBSourceManager.cpp @@ -46,15 +46,15 @@ class SourceManagerImpl { lldb::TargetSP target_sp(m_target_wp.lock()); if (target_sp) { return target_sp->GetSourceManager().DisplaySourceLinesWithLineNumbers( - file, line, column, context_before, context_after, current_line_cstr, - s); + std::make_shared(file), line, column, context_before, + context_after, current_line_cstr, s); } else { lldb::DebuggerSP debugger_sp(m_debugger_wp.lock()); if (debugger_sp) { return debugger_sp->GetSourceManager() - .DisplaySourceLinesWithLineNumbers(file, line, column, - context_before, context_after, - current_line_cstr, s); + .DisplaySourceLinesWithLineNumbers( + std::make_shared(file), line, column, + context_before, context_after, current_line_cstr, s); } } return 0; diff --git a/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp b/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp index 0509924e6300be..05fa7b93096889 100644 --- a/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp +++ b/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp @@ -102,7 +102,8 @@ Searcher::CallbackReturn BreakpointResolverFileRegex::SearchCallback( FileSpec cu_file_spec = cu->GetPrimaryFile(); std::vector line_matches; context.target_sp->GetSourceManager().FindLinesMatchingRegex( - cu_file_spec, m_regex, 1, UINT32_MAX, line_matches); + std::make_shared(cu_file_spec), m_regex, 1, UINT32_MAX, + line_matches); uint32_t num_matches = line_matches.size(); for (uint32_t i = 0; i < num_matches; i++) { diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index abde27b2b53ad8..ede3dd2f2a864c 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -769,20 +769,26 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { private: bool GetDefaultFile(Target &target, FileSpec &file, CommandReturnObject &result) { - uint32_t default_line; // First use the Source Manager's default file. Then use the current stack // frame's file. 
- if (!target.GetSourceManager().GetDefaultFileAndLine(file, default_line)) { + if (auto maybe_file_and_line = + target.GetSourceManager().GetDefaultFileAndLine()) { + file = maybe_file_and_line->support_file_sp->GetSpecOnly(); + return true; + } + StackFrame *cur_frame = m_exe_ctx.GetFramePtr(); if (cur_frame == nullptr) { result.AppendError( "No selected frame to use to find the default file."); return false; - } else if (!cur_frame->HasDebugInformation()) { + } + if (!cur_frame->HasDebugInformation()) { result.AppendError("Cannot use the selected frame to find the default " "file, it has no debug info."); return false; - } else { + } + const SymbolContext &sc = cur_frame->GetSymbolContext(eSymbolContextLineEntry); if (sc.line_entry.GetFile()) { @@ -791,8 +797,6 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { result.AppendError("Can't find the file for the selected frame to " "use as the default file."); return false; - } - } } return true; } diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index 5ddd46ac5fdc07..1fc122420388d8 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -777,14 +777,16 @@ class CommandObjectSourceList : public CommandObjectParsed { if (sc.function) { Target &target = GetTarget(); - FileSpec start_file; + SupportFileSP start_file = std::make_shared(); uint32_t start_line; uint32_t end_line; FileSpec end_file; if (sc.block == nullptr) { // Not an inlined function - sc.function->GetStartLineSourceInfo(start_file, start_line); + FileSpec function_file_spec; + sc.function->GetStartLineSourceInfo(function_file_spec, start_line); + start_file = std::make_shared(function_file_spec); if (start_line == 0) { result.AppendErrorWithFormat("Could not find line information for " "start of function: \"%s\".\n", @@ -794,7 +796,7 @@ class CommandObjectSourceList : public CommandObjectParsed { sc.function->GetEndLineSourceInfo(end_file, end_line); } else { // We have an inlined function - start_file = source_info.line_entry.GetFile(); + start_file = source_info.line_entry.file_sp; start_line = source_info.line_entry.line; end_line = start_line + m_options.num_lines; } @@ -825,14 +827,15 @@ class CommandObjectSourceList : public CommandObjectParsed { if (m_options.show_bp_locs) { const bool show_inlines = true; - m_breakpoint_locations.Reset(start_file, 0, show_inlines); + m_breakpoint_locations.Reset(start_file->GetSpecOnly(), 0, + show_inlines); SearchFilterForUnconstrainedSearches target_search_filter( m_exe_ctx.GetTargetSP()); target_search_filter.Search(m_breakpoint_locations); } - result.AppendMessageWithFormat("File: %s\n", - start_file.GetPath().c_str()); + result.AppendMessageWithFormat( + "File: %s\n", start_file->GetSpecOnly().GetPath().c_str()); // We don't care about the column here. const uint32_t column = 0; return target.GetSourceManager().DisplaySourceLinesWithLineNumbers( @@ -1050,8 +1053,9 @@ class CommandObjectSourceList : public CommandObjectParsed { ? 
sc.line_entry.column : 0; target.GetSourceManager().DisplaySourceLinesWithLineNumbers( - sc.comp_unit->GetPrimaryFile(), sc.line_entry.line, column, - lines_to_back_up, m_options.num_lines - lines_to_back_up, "->", + std::make_shared(sc.comp_unit->GetPrimaryFile()), + sc.line_entry.line, column, lines_to_back_up, + m_options.num_lines - lines_to_back_up, "->", &result.GetOutputStream(), GetBreakpointLocations()); result.SetStatus(eReturnStatusSuccessFinishResult); } @@ -1076,8 +1080,8 @@ class CommandObjectSourceList : public CommandObjectParsed { target.GetSourceManager().GetLastFile()); if (last_file_sp) { const bool show_inlines = true; - m_breakpoint_locations.Reset(last_file_sp->GetFileSpec(), 0, - show_inlines); + m_breakpoint_locations.Reset( + last_file_sp->GetSupportFile()->GetSpecOnly(), 0, show_inlines); SearchFilterForUnconstrainedSearches target_search_filter( target.shared_from_this()); target_search_filter.Search(m_breakpoint_locations); @@ -1170,9 +1174,9 @@ class CommandObjectSourceList : public CommandObjectParsed { m_options.num_lines = 10; const uint32_t column = 0; target.GetSourceManager().DisplaySourceLinesWithLineNumbers( - sc.comp_unit->GetPrimaryFile(), m_options.start_line, column, 0, - m_options.num_lines, "", &result.GetOutputStream(), - GetBreakpointLocations()); + std::make_shared(sc.comp_unit->GetPrimaryFile()), + m_options.start_line, column, 0, m_options.num_lines, "", + &result.GetOutputStream(), GetBreakpointLocations()); result.SetStatus(eReturnStatusSuccessFinishResult); } else { diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp index 9286f62058bc8d..d071e3bfe4f77d 100644 --- a/lldb/source/Core/Disassembler.cpp +++ b/lldb/source/Core/Disassembler.cpp @@ -517,7 +517,8 @@ void Disassembler::PrintInstructions(Debugger &debugger, const ArchSpec &arch, line_highlight = "**"; } source_manager.DisplaySourceLinesWithLineNumbers( - ln.file, ln.line, ln.column, 0, 0, line_highlight, &strm); + std::make_shared(ln.file), ln.line, ln.column, 0, 0, + line_highlight, &strm); } if (source_lines_to_display.print_source_context_end_eol) strm.EOL(); diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp index d922d32f910583..3d69aedb6b13ee 100644 --- a/lldb/source/Core/IOHandlerCursesGUI.cpp +++ b/lldb/source/Core/IOHandlerCursesGUI.cpp @@ -6894,8 +6894,8 @@ class SourceFileWindowDelegate : public WindowDelegate { if (context_changed) m_selected_line = m_pc_line; - if (m_file_sp && - m_file_sp->GetFileSpec() == m_sc.line_entry.GetFile()) { + if (m_file_sp && m_file_sp->GetSupportFile()->GetSpecOnly() == + m_sc.line_entry.GetFile()) { // Same file, nothing to do, we should either have the lines or // not (source file missing) if (m_selected_line >= static_cast(m_first_visible_line)) { @@ -6910,8 +6910,8 @@ class SourceFileWindowDelegate : public WindowDelegate { } else { // File changed, set selected line to the line with the PC m_selected_line = m_pc_line; - m_file_sp = m_debugger.GetSourceManager().GetFile( - m_sc.line_entry.GetFile()); + m_file_sp = + m_debugger.GetSourceManager().GetFile(m_sc.line_entry.file_sp); if (m_file_sp) { const size_t num_lines = m_file_sp->GetNumLines(); m_line_width = 1; @@ -7001,7 +7001,8 @@ class SourceFileWindowDelegate : public WindowDelegate { LineEntry bp_loc_line_entry; if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry( bp_loc_line_entry)) { - if (m_file_sp->GetFileSpec() == bp_loc_line_entry.GetFile()) { + if (m_file_sp->GetSupportFile()->GetSpecOnly() 
== + bp_loc_line_entry.GetFile()) { bp_lines.insert(bp_loc_line_entry.line); } } @@ -7332,7 +7333,7 @@ class SourceFileWindowDelegate : public WindowDelegate { if (exe_ctx.HasProcessScope() && exe_ctx.GetProcessRef().IsAlive()) { BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( nullptr, // Don't limit the breakpoint to certain modules - m_file_sp->GetFileSpec(), // Source file + m_file_sp->GetSupportFile()->GetSpecOnly(), // Source file m_selected_line + 1, // Source line number (m_selected_line is zero based) 0, // Unspecified column. @@ -7478,7 +7479,8 @@ class SourceFileWindowDelegate : public WindowDelegate { LineEntry bp_loc_line_entry; if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry( bp_loc_line_entry)) { - if (m_file_sp->GetFileSpec() == bp_loc_line_entry.GetFile() && + if (m_file_sp->GetSupportFile()->GetSpecOnly() == + bp_loc_line_entry.GetFile() && m_selected_line + 1 == bp_loc_line_entry.line) { bool removed = exe_ctx.GetTargetRef().RemoveBreakpointByID(bp_sp->GetID()); @@ -7492,7 +7494,7 @@ class SourceFileWindowDelegate : public WindowDelegate { // No breakpoint found on the location, add it. BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( nullptr, // Don't limit the breakpoint to certain modules - m_file_sp->GetFileSpec(), // Source file + m_file_sp->GetSupportFile()->GetSpecOnly(), // Source file m_selected_line + 1, // Source line number (m_selected_line is zero based) 0, // No column specified. diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index 0d70c554e5342b..f6e59ce731a573 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -63,18 +63,22 @@ static void resolve_tilde(FileSpec &file_spec) { // SourceManager constructor SourceManager::SourceManager(const TargetSP &target_sp) - : m_last_line(0), m_last_count(0), m_default_set(false), - m_target_wp(target_sp), + : m_last_support_file_sp(std::make_shared()), m_last_line(0), + m_last_count(0), m_default_set(false), m_target_wp(target_sp), m_debugger_wp(target_sp->GetDebugger().shared_from_this()) {} SourceManager::SourceManager(const DebuggerSP &debugger_sp) - : m_last_line(0), m_last_count(0), m_default_set(false), m_target_wp(), + : m_last_support_file_sp(std::make_shared()), m_last_line(0), + m_last_count(0), m_default_set(false), m_target_wp(), m_debugger_wp(debugger_sp) {} // Destructor SourceManager::~SourceManager() = default; -SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { +SourceManager::FileSP SourceManager::GetFile(SupportFileSP support_file_sp) { + assert(support_file_sp && "SupportFileSP must be valid"); + + FileSpec file_spec = support_file_sp->GetSpecOnly(); if (!file_spec) return {}; @@ -87,8 +91,8 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { LLDB_LOG(log, "Source file caching disabled: creating new source file: {0}", file_spec); if (target_sp) - return std::make_shared(file_spec, target_sp); - return std::make_shared(file_spec, debugger_sp); + return std::make_shared(support_file_sp, target_sp); + return std::make_shared(support_file_sp, debugger_sp); } ProcessSP process_sp = target_sp ? target_sp->GetProcessSP() : ProcessSP(); @@ -136,7 +140,8 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { } // Check if the file exists on disk. 
- if (file_sp && !FileSystem::Instance().Exists(file_sp->GetFileSpec())) { + if (file_sp && !FileSystem::Instance().Exists( + file_sp->GetSupportFile()->GetSpecOnly())) { LLDB_LOG(log, "File doesn't exist on disk: {0}", file_spec); file_sp.reset(); } @@ -148,9 +153,9 @@ SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { // (Re)create the file. if (target_sp) - file_sp = std::make_shared(file_spec, target_sp); + file_sp = std::make_shared(support_file_sp, target_sp); else - file_sp = std::make_shared(file_spec, debugger_sp); + file_sp = std::make_shared(support_file_sp, debugger_sp); // Add the file to the debugger and process cache. If the file was // invalidated, this will overwrite it. @@ -230,11 +235,8 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( start_line = 1; } - if (!m_default_set) { - FileSpec tmp_spec; - uint32_t tmp_line; - GetDefaultFileAndLine(tmp_spec, tmp_line); - } + if (!m_default_set) + GetDefaultFileAndLine(); m_last_line = start_line; m_last_count = count; @@ -305,11 +307,12 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile( } size_t SourceManager::DisplaySourceLinesWithLineNumbers( - const FileSpec &file_spec, uint32_t line, uint32_t column, + lldb::SupportFileSP support_file_sp, uint32_t line, uint32_t column, uint32_t context_before, uint32_t context_after, const char *current_line_cstr, Stream *s, const SymbolContextList *bp_locs) { - FileSP file_sp(GetFile(file_spec)); + assert(support_file_sp && "SupportFile must be valid"); + FileSP file_sp(GetFile(support_file_sp)); uint32_t start_line; uint32_t count = context_before + context_after + 1; @@ -322,8 +325,9 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbers( if (last_file_sp.get() != file_sp.get()) { if (line == 0) m_last_line = 0; - m_last_file_spec = file_spec; + m_last_support_file_sp = support_file_sp; } + return DisplaySourceLinesWithLineNumbersUsingLastFile( start_line, count, line, column, current_line_cstr, s, bp_locs); } @@ -334,11 +338,8 @@ size_t SourceManager::DisplayMoreWithLineNumbers( // to figure it out here. 
FileSP last_file_sp(GetLastFile()); const bool have_default_file_line = last_file_sp && m_last_line > 0; - if (!m_default_set) { - FileSpec tmp_spec; - uint32_t tmp_line; - GetDefaultFileAndLine(tmp_spec, tmp_line); - } + if (!m_default_set) + GetDefaultFileAndLine(); if (last_file_sp) { if (m_last_line == UINT32_MAX) @@ -373,26 +374,27 @@ size_t SourceManager::DisplayMoreWithLineNumbers( return 0; } -bool SourceManager::SetDefaultFileAndLine(const FileSpec &file_spec, +bool SourceManager::SetDefaultFileAndLine(lldb::SupportFileSP support_file_sp, uint32_t line) { + assert(support_file_sp && "SupportFile must be valid"); + m_default_set = true; - FileSP file_sp(GetFile(file_spec)); - if (file_sp) { + if (FileSP file_sp = GetFile(support_file_sp)) { m_last_line = line; - m_last_file_spec = file_spec; + m_last_support_file_sp = support_file_sp; return true; - } else { - return false; } + + return false; } -bool SourceManager::GetDefaultFileAndLine(FileSpec &file_spec, uint32_t &line) { - if (FileSP last_file_sp = GetLastFile()) { - file_spec = m_last_file_spec; - line = m_last_line; - return true; - } else if (!m_default_set) { +std::optional<SourceManager::SupportFileAndLine> +SourceManager::GetDefaultFileAndLine() { + if (FileSP last_file_sp = GetLastFile()) + return SupportFileAndLine(m_last_support_file_sp, m_last_line); + + if (!m_default_set) { TargetSP target_sp(m_target_wp.lock()); if (target_sp) { @@ -418,51 +420,51 @@ bool SourceManager::GetDefaultFileAndLine(FileSpec &file_spec, uint32_t &line) { if (sc.function->GetAddressRange() .GetBaseAddress() .CalculateSymbolContextLineEntry(line_entry)) { - SetDefaultFileAndLine(line_entry.GetFile(), line_entry.line); - file_spec = m_last_file_spec; - line = m_last_line; - return true; + SetDefaultFileAndLine(line_entry.file_sp, line_entry.line); + return SupportFileAndLine(line_entry.file_sp, m_last_line); } } } } } } - return false; + + return std::nullopt; } -void SourceManager::FindLinesMatchingRegex(FileSpec &file_spec, +void SourceManager::FindLinesMatchingRegex(SupportFileSP support_file_sp, RegularExpression &regex, uint32_t start_line, uint32_t end_line, std::vector<uint32_t> &match_lines) { match_lines.clear(); - FileSP file_sp = GetFile(file_spec); + FileSP file_sp = GetFile(support_file_sp); if (!file_sp) return; return file_sp->FindLinesMatchingRegex(regex, start_line, end_line, match_lines); } -SourceManager::File::File(const FileSpec &file_spec, +SourceManager::File::File(SupportFileSP support_file_sp, lldb::DebuggerSP debugger_sp) - : m_file_spec_orig(file_spec), m_file_spec(), m_mod_time(), - m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { - CommonInitializer(file_spec, {}); + : m_support_file_sp(std::make_shared<SupportFile>()), m_checksum(), + m_mod_time(), m_debugger_wp(debugger_sp), m_target_wp(TargetSP()) { + CommonInitializer(support_file_sp, {}); } -SourceManager::File::File(const FileSpec &file_spec, TargetSP target_sp) - : m_file_spec_orig(file_spec), m_file_spec(), m_mod_time(), +SourceManager::File::File(SupportFileSP support_file_sp, TargetSP target_sp) + : m_support_file_sp(std::make_shared<SupportFile>()), m_checksum(), + m_mod_time(), m_debugger_wp(target_sp ? target_sp->GetDebugger().shared_from_this() : DebuggerSP()), m_target_wp(target_sp) { - CommonInitializer(file_spec, target_sp); + CommonInitializer(support_file_sp, target_sp); } -void SourceManager::File::CommonInitializer(const FileSpec &file_spec, +void SourceManager::File::CommonInitializer(SupportFileSP support_file_sp, TargetSP target_sp) { // Set the file and update the modification time.
- SetFileSpec(file_spec); + SetSupportFile(support_file_sp); // Always update the source map modification ID if we have a target. if (target_sp) @@ -472,65 +474,78 @@ void SourceManager::File::CommonInitializer(const FileSpec &file_spec, if (m_mod_time == llvm::sys::TimePoint<>()) { if (target_sp) { // If this is just a file name, try finding it in the target. - if (!file_spec.GetDirectory() && file_spec.GetFilename()) { - bool check_inlines = false; - SymbolContextList sc_list; - size_t num_matches = - target_sp->GetImages().ResolveSymbolContextForFilePath( - file_spec.GetFilename().AsCString(), 0, check_inlines, - SymbolContextItem(eSymbolContextModule | - eSymbolContextCompUnit), - sc_list); - bool got_multiple = false; - if (num_matches != 0) { - if (num_matches > 1) { - CompileUnit *test_cu = nullptr; - for (const SymbolContext &sc : sc_list) { - if (sc.comp_unit) { - if (test_cu) { - if (test_cu != sc.comp_unit) - got_multiple = true; - break; - } else - test_cu = sc.comp_unit; + { + FileSpec file_spec = support_file_sp->GetSpecOnly(); + if (!file_spec.GetDirectory() && file_spec.GetFilename()) { + bool check_inlines = false; + SymbolContextList sc_list; + size_t num_matches = + target_sp->GetImages().ResolveSymbolContextForFilePath( + file_spec.GetFilename().AsCString(), 0, check_inlines, + SymbolContextItem(eSymbolContextModule | + eSymbolContextCompUnit), + sc_list); + bool got_multiple = false; + if (num_matches != 0) { + if (num_matches > 1) { + CompileUnit *test_cu = nullptr; + for (const SymbolContext &sc : sc_list) { + if (sc.comp_unit) { + if (test_cu) { + if (test_cu != sc.comp_unit) + got_multiple = true; + break; + } else + test_cu = sc.comp_unit; + } } } - } - if (!got_multiple) { - SymbolContext sc; - sc_list.GetContextAtIndex(0, sc); - if (sc.comp_unit) - SetFileSpec(sc.comp_unit->GetPrimaryFile()); + if (!got_multiple) { + SymbolContext sc; + sc_list.GetContextAtIndex(0, sc); + if (sc.comp_unit) + SetSupportFile(std::make_shared( + sc.comp_unit->GetPrimaryFile())); + } } } } // Try remapping the file if it doesn't exist. - if (!FileSystem::Instance().Exists(m_file_spec)) { - // Check target specific source remappings (i.e., the - // target.source-map setting), then fall back to the module - // specific remapping (i.e., the .dSYM remapping dictionary). - auto remapped = target_sp->GetSourcePathMap().FindFile(m_file_spec); - if (!remapped) { - FileSpec new_spec; - if (target_sp->GetImages().FindSourceFile(m_file_spec, new_spec)) - remapped = new_spec; + { + FileSpec file_spec = support_file_sp->GetSpecOnly(); + if (!FileSystem::Instance().Exists(file_spec)) { + // Check target specific source remappings (i.e., the + // target.source-map setting), then fall back to the module + // specific remapping (i.e., the .dSYM remapping dictionary). + auto remapped = target_sp->GetSourcePathMap().FindFile(file_spec); + if (!remapped) { + FileSpec new_spec; + if (target_sp->GetImages().FindSourceFile(file_spec, new_spec)) + remapped = new_spec; + } + if (remapped) + SetSupportFile(std::make_shared( + *remapped, support_file_sp->GetChecksum())); } - if (remapped) - SetFileSpec(*remapped); } } } // If the file exists, read in the data. 
- if (m_mod_time != llvm::sys::TimePoint<>()) - m_data_sp = FileSystem::Instance().CreateDataBuffer(m_file_spec); + if (m_mod_time != llvm::sys::TimePoint<>()) { + m_data_sp = FileSystem::Instance().CreateDataBuffer( + m_support_file_sp->GetSpecOnly()); + m_checksum = llvm::MD5::hash(m_data_sp->GetData()); + } } -void SourceManager::File::SetFileSpec(FileSpec file_spec) { +void SourceManager::File::SetSupportFile(lldb::SupportFileSP support_file_sp) { + FileSpec file_spec = support_file_sp->GetSpecOnly(); resolve_tilde(file_spec); - m_file_spec = std::move(file_spec); - m_mod_time = FileSystem::Instance().GetModificationTime(m_file_spec); + m_support_file_sp = + std::make_shared(file_spec, support_file_sp->GetChecksum()); + m_mod_time = FileSystem::Instance().GetModificationTime(file_spec); } uint32_t SourceManager::File::GetLineOffset(uint32_t line) { @@ -603,7 +618,8 @@ bool SourceManager::File::ModificationTimeIsStale() const { // TODO: use host API to sign up for file modifications to anything in our // source cache and only update when we determine a file has been updated. // For now we check each time we want to display info for the file. - auto curr_mod_time = FileSystem::Instance().GetModificationTime(m_file_spec); + auto curr_mod_time = FileSystem::Instance().GetModificationTime( + m_support_file_sp->GetSpecOnly()); return curr_mod_time != llvm::sys::TimePoint<>() && m_mod_time != curr_mod_time; } @@ -644,7 +660,8 @@ size_t SourceManager::File::DisplaySourceLines(uint32_t line, debugger_sp->GetStopShowColumnAnsiSuffix()); HighlighterManager mgr; - std::string path = GetFileSpec().GetPath(/*denormalize*/ false); + std::string path = + GetSupportFile()->GetSpecOnly().GetPath(/*denormalize*/ false); // FIXME: Find a way to get the definitive language this file was written in // and pass it to the highlighter. 
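// Editor's sketch (not part of this patch): the checksum step introduced in
// CommonInitializer above uses LLVM's one-shot MD5 helper directly. A
// standalone equivalent, assuming only llvm/Support headers; hashSourceBuffer
// is a hypothetical name.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/MD5.h"

static llvm::MD5::MD5Result hashSourceBuffer(llvm::ArrayRef<uint8_t> bytes) {
  // Same call the patch applies to the freshly created data buffer,
  // m_data_sp->GetData(); MD5::hash digests an arbitrary byte buffer.
  return llvm::MD5::hash(bytes);
}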
 
 uint32_t SourceManager::File::GetLineOffset(uint32_t line) {
@@ -603,7 +618,8 @@ bool SourceManager::File::ModificationTimeIsStale() const {
   // TODO: use host API to sign up for file modifications to anything in our
   // source cache and only update when we determine a file has been updated.
   // For now we check each time we want to display info for the file.
-  auto curr_mod_time = FileSystem::Instance().GetModificationTime(m_file_spec);
+  auto curr_mod_time = FileSystem::Instance().GetModificationTime(
+      m_support_file_sp->GetSpecOnly());
   return curr_mod_time != llvm::sys::TimePoint<>() &&
          m_mod_time != curr_mod_time;
 }
@@ -644,7 +660,8 @@ size_t SourceManager::File::DisplaySourceLines(uint32_t line,
                                  debugger_sp->GetStopShowColumnAnsiSuffix());
 
   HighlighterManager mgr;
-  std::string path = GetFileSpec().GetPath(/*denormalize*/ false);
+  std::string path =
+      GetSupportFile()->GetSpecOnly().GetPath(/*denormalize*/ false);
 
   // FIXME: Find a way to get the definitive language this file was written in
   // and pass it to the highlighter.
   const auto &h = mgr.getHighlighterFor(lldb::eLanguageTypeUnknown, path);
@@ -698,7 +715,8 @@ void SourceManager::File::FindLinesMatchingRegex(
 
 bool lldb_private::operator==(const SourceManager::File &lhs,
                               const SourceManager::File &rhs) {
-  if (lhs.m_file_spec != rhs.m_file_spec)
+  if (!lhs.GetSupportFile()->Equal(*rhs.GetSupportFile(),
+                                   SupportFile::eEqualChecksumIfSet))
     return false;
   return lhs.m_mod_time == rhs.m_mod_time;
 }
@@ -778,9 +796,9 @@ void SourceManager::SourceFileCache::AddSourceFile(const FileSpec &file_spec,
   assert(file_sp && "invalid FileSP");
 
   AddSourceFileImpl(file_spec, file_sp);
-  const FileSpec &resolved_file_spec = file_sp->GetFileSpec();
+  const FileSpec &resolved_file_spec = file_sp->GetSupportFile()->GetSpecOnly();
   if (file_spec != resolved_file_spec)
-    AddSourceFileImpl(file_sp->GetFileSpec(), file_sp);
+    AddSourceFileImpl(file_sp->GetSupportFile()->GetSpecOnly(), file_sp);
 }
 
 void SourceManager::SourceFileCache::RemoveSourceFile(const FileSP &file_sp) {
@@ -820,14 +838,24 @@ SourceManager::FileSP SourceManager::SourceFileCache::FindSourceFile(
   return {};
 }
 
+static std::string toString(const Checksum &checksum) {
+  if (!checksum)
+    return "";
+  return std::string(llvm::formatv("{0}", checksum.digest()));
+}
+
 void SourceManager::SourceFileCache::Dump(Stream &stream) const {
-  stream << "Modification time   Lines    Path\n";
-  stream << "------------------- -------- --------------------------------\n";
+  // clang-format off
+  stream << "Modification time   MD5 Checksum (on-disk)           MD5 Checksum (line table)        Lines    Path\n";
+  stream << "------------------- -------------------------------- -------------------------------- -------- --------------------------------\n";
+  // clang-format on
   for (auto &entry : m_file_cache) {
     if (!entry.second)
       continue;
     FileSP file = entry.second;
-    stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,8:d} {2}\n", file->GetTimestamp(),
+    stream.Format("{0:%Y-%m-%d %H:%M:%S} {1,32} {2,32} {3,8:d} {4}\n",
+                  file->GetTimestamp(), toString(file->GetChecksum()),
+                  toString(file->GetSupportFile()->GetChecksum()),
                   file->GetNumLines(), entry.first.GetPath());
   }
 }
diff --git a/lldb/source/Expression/REPL.cpp b/lldb/source/Expression/REPL.cpp
index a6a4ffb5e0af9e..56c50e346b39b8 100644
--- a/lldb/source/Expression/REPL.cpp
+++ b/lldb/source/Expression/REPL.cpp
@@ -473,7 +473,8 @@ void REPL::IOHandlerInputComplete(IOHandler &io_handler, std::string &code) {
 
           // Now set the default file and line to the REPL source file
           m_target.GetSourceManager().SetDefaultFileAndLine(
-              FileSpec(m_repl_source_path), new_default_line);
+              std::make_shared<SupportFile>(FileSpec(m_repl_source_path)),
+              new_default_line);
         }
         static_cast<IOHandlerEditline &>(io_handler)
             .SetBaseLineNumber(m_code.GetSize() + 1);
@@ -570,13 +571,11 @@ Status REPL::RunLoop() {
 
   lldb::IOHandlerSP io_handler_sp(GetIOHandler());
 
-  FileSpec save_default_file;
-  uint32_t save_default_line = 0;
+  std::optional<SourceManager::SupportFileAndLine> default_file_line;
 
   if (!m_repl_source_path.empty()) {
     // Save the current default file and line
-    m_target.GetSourceManager().GetDefaultFileAndLine(save_default_file,
-                                                      save_default_line);
+    default_file_line = m_target.GetSourceManager().GetDefaultFileAndLine();
   }
 
   debugger.RunIOHandlerAsync(io_handler_sp);
@@ -615,8 +614,8 @@ Status REPL::RunLoop() {
   }
 
   // Restore the default file and line
-  if (save_default_file && save_default_line != 0)
-    m_target.GetSourceManager().SetDefaultFileAndLine(save_default_file,
-                                                      save_default_line);
+  if (default_file_line)
+    m_target.GetSourceManager().SetDefaultFileAndLine(
+        default_file_line->support_file_sp, default_file_line->line);
 
   return error;
 }
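// Editor's sketch: SourceManager::SupportFileAndLine is declared in
// SourceManager.h, which this diff does not show. Judging from the usage
// above (constructed from a SupportFileSP plus a line number, accessed via
// ->support_file_sp and ->line), it is a small aggregate along these lines.
// A hypothetical reconstruction, not the actual header:
struct SupportFileAndLine {
  lldb::SupportFileSP support_file_sp;
  uint32_t line;
  SupportFileAndLine(lldb::SupportFileSP support_file_sp, uint32_t line)
      : support_file_sp(support_file_sp), line(line) {}
};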
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 695801da9da69a..b0f49ebf2d2cbb 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -4241,6 +4241,9 @@ TypeSystemClang::GetTypeClass(lldb::opaque_compiler_type_t type) {
   // We don't handle pack indexing yet
   case clang::Type::PackIndexing:
     break;
+
+  case clang::Type::HLSLAttributedResource:
+    break;
   }
   // We don't know hot to display this type...
   return lldb::eTypeClassOther;
@@ -5148,6 +5151,9 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
   // We don't handle pack indexing yet
   case clang::Type::PackIndexing:
     break;
+
+  case clang::Type::HLSLAttributedResource:
+    break;
   }
   count = 0;
   return lldb::eEncodingInvalid;
@@ -5309,6 +5315,9 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) {
   // We don't handle pack indexing yet
   case clang::Type::PackIndexing:
     break;
+
+  case clang::Type::HLSLAttributedResource:
+    break;
   }
   // We don't know hot to display this type...
   return lldb::eFormatBytes;
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 5d90ed90b3d3fd..e35a4c318d358f 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1923,7 +1923,7 @@ bool StackFrame::GetStatus(Stream &strm, bool show_frame_info, bool show_source,
         size_t num_lines =
             target->GetSourceManager().DisplaySourceLinesWithLineNumbers(
-                m_sc.line_entry.GetFile(), start_line, m_sc.line_entry.column,
+                m_sc.line_entry.file_sp, start_line, m_sc.line_entry.column,
                 source_lines_before, source_lines_after, "->", &strm);
         if (num_lines != 0)
           have_source = true;
diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp
index 7808bd3674ab19..3849ec5ed178d9 100644
--- a/lldb/source/Target/StackFrameList.cpp
+++ b/lldb/source/Target/StackFrameList.cpp
@@ -886,7 +886,7 @@ void StackFrameList::SetDefaultFileAndLineToSelectedFrame() {
       SymbolContext sc = frame_sp->GetSymbolContext(eSymbolContextLineEntry);
       if (sc.line_entry.GetFile())
         m_thread.CalculateTarget()->GetSourceManager().SetDefaultFileAndLine(
-            sc.line_entry.GetFile(), sc.line_entry.line);
+            sc.line_entry.file_sp, sc.line_entry.line);
     }
   }
 }
diff --git a/lldb/unittests/Core/SourceManagerTest.cpp b/lldb/unittests/Core/SourceManagerTest.cpp
index 58d6f6cb3f8503..26ab0edffb398d 100644
--- a/lldb/unittests/Core/SourceManagerTest.cpp
+++ b/lldb/unittests/Core/SourceManagerTest.cpp
@@ -8,6 +8,7 @@
 
 #include "lldb/Core/SourceManager.h"
 #include "lldb/Host/FileSystem.h"
+#include "lldb/Utility/SupportFile.h"
 #include "gtest/gtest.h"
 
 #include "TestingSupport/MockTildeExpressionResolver.h"
@@ -29,8 +30,8 @@ TEST_F(SourceFileCache, FindSourceFileFound) {
 
   // Insert: foo
   FileSpec foo_file_spec("foo");
-  auto foo_file_sp =
-      std::make_shared<SourceManager::File>(foo_file_spec, lldb::DebuggerSP());
+  auto foo_file_sp = std::make_shared<SourceManager::File>(
+      std::make_shared<SupportFile>(foo_file_spec), lldb::DebuggerSP());
   cache.AddSourceFile(foo_file_spec, foo_file_sp);
 
   // Query: foo, expect found.
@@ -43,8 +44,8 @@ TEST_F(SourceFileCache, FindSourceFileNotFound) {
 
   // Insert: foo
   FileSpec foo_file_spec("foo");
-  auto foo_file_sp =
-      std::make_shared<SourceManager::File>(foo_file_spec, lldb::DebuggerSP());
+  auto foo_file_sp = std::make_shared<SourceManager::File>(
+      std::make_shared<SupportFile>(foo_file_spec), lldb::DebuggerSP());
   cache.AddSourceFile(foo_file_spec, foo_file_sp);
 
   // Query: bar, expect not found.
@@ -63,7 +64,8 @@ TEST_F(SourceFileCache, FindSourceFileByUnresolvedPath) {
 
   // Create the file with the resolved file spec.
   auto foo_file_sp = std::make_shared<SourceManager::File>(
-      resolved_foo_file_spec, lldb::DebuggerSP());
+      std::make_shared<SupportFile>(resolved_foo_file_spec),
+      lldb::DebuggerSP());
 
   // Cache the result with the unresolved file spec.
   cache.AddSourceFile(foo_file_spec, foo_file_sp);
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index cf0a6f96fb012e..c75b75edaf2ca0 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -2189,10 +2189,6 @@ example:
 ``nosanitize_coverage``
     This attribute indicates that SanitizerCoverage instrumentation is disabled
     for this function.
-``nosanitize_realtime``
-    This attribute indicates that the Realtime Sanitizer instrumentation is
-    disabled for this function.
-    This attribute is incompatible with the ``sanitize_realtime`` attribute.
 ``null_pointer_is_valid``
     If ``null_pointer_is_valid`` is set, then the ``null`` address in
     address-space 0 is considered to be a valid address for memory loads and
@@ -2319,7 +2315,6 @@ example:
     This attribute indicates that RealtimeSanitizer checks
     (realtime safety analysis - no allocations, syscalls or exceptions) are enabled
     for this function.
-    This attribute is incompatible with the ``nosanitize_realtime`` attribute.
 ``speculative_load_hardening``
     This attribute indicates that `Speculative Load Hardening <https://llvm.org/docs/SpeculativeLoadHardening.html>`_
diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst
index 9bd2b1d435fd0a..2b5b5139858e7f 100644
--- a/llvm/docs/Security.rst
+++ b/llvm/docs/Security.rst
@@ -46,9 +46,9 @@ username for an individual isn't available, the brackets will be empty.
 * Josh Stone (Red Hat; Rust) [@cuviper]
 * Kristof Beyls (ARM) [@kbeyls]
 * Matthew Riley (Google) [@mmdriley]
+* Matthew Voss (Sony) [@ormris]
 * Nikhil Gupta (Nvidia) []
 * Oliver Hunt (Apple) [@ojhunt]
-* Paul Robinson (Sony) [@pogo59]
 * Peter Smith (ARM) [@smithp35]
 * Pietro Albini (Ferrous Systems; Rust) [@pietroalbini]
 * Serge Guelton (Mozilla) [@serge-sans-paille]
diff --git a/llvm/docs/TestSuiteGuide.md b/llvm/docs/TestSuiteGuide.md
index 9552cd89aa1c1b..19db0ee7d01b82 100644
--- a/llvm/docs/TestSuiteGuide.md
+++ b/llvm/docs/TestSuiteGuide.md
@@ -134,6 +134,44 @@ Every program can work as a correctness test. Some programs are unsuitable for
 performance measurements. Setting the `TEST_SUITE_BENCHMARKING_ONLY` CMake
 option to `ON` will disable them.
 
+The MultiSource benchmarks consist of the following apps and benchmarks:
+
+| MultiSource          | Language  | Application Area              | Remark               |
+|----------------------|-----------|-------------------------------|----------------------|
+| 7zip | C/C++ | Compression/Decompression | |
+| ASCI_Purple | C | SMG2000 benchmark and solver | Memory intensive app |
+| ASC_Sequoia | C | Simulation and solver | |
+| BitBench | C | uudecode/uuencode utility | Bit Stream benchmark for functional compilers |
+| Bullet | C++ | Bullet 2.75 physics engine | |
+| DOE-ProxyApps-C++ | C++ | HPC/scientific apps | Small applications, representative of our larger DOE workloads |
+| DOE-ProxyApps-C | C | HPC/scientific apps | " |
+| Fhourstones | C | Game/solver | Integer benchmark that efficiently solves positions in the game of Connect-4 |
+| Fhourstones-3.1 | C | Game/solver | " |
+| FreeBench | C | Benchmark suite | Raytracer, four in a row, neural network, file compressor, Fast Fourier/Cosine/Sine Transform |
+| llubenchmark | C | Linked-list micro-benchmark | |
+| mafft | C | Bioinformatics | A multiple sequence alignment program |
+| MallocBench | C | Benchmark suite | cfrac, espresso, gawk, gs, make, p2c, perl |
+| McCat | C | Benchmark suite | Quicksort, bubblesort, eigenvalues |
+| mediabench | C | Benchmark suite | adpcm, g721, gsm, jpeg, mpeg2 |
+| MiBench | C | Embedded benchmark suite | Automotive, consumer, office, security, telecom apps |
+| nbench | C | | BYTE Magazine's BYTEmark benchmark program |
+| NPB-serial | C | Parallel computing | Serial version of the NPB IS code |
+| Olden | C | Data Structures | SGI version of the Olden benchmark |
+| OptimizerEval | C | Solver | Preston Briggs' optimizer evaluation framework |
+| PAQ8p | C++ | Data compression | |
+| Prolangs-C++ | C++ | Benchmark suite | city, employ, life, NP, ocean, primes, simul, vcirc |
+| Prolangs-C | C | Benchmark suite | agrep, archie-client, bison, gnugo, unix-smail |
+| Ptrdist | C | Pointer-Intensive Benchmark Suite | |
+| Rodinia | C | Scientific apps | backprop, pathfinder, srad |
+| SciMark2-C | C | Scientific apps | FFT, LU, Montecarlo, sparse matmul |
+| sim | C | Dynamic programming | A Time-Efficient, Linear-Space Local Similarity Algorithm |
+| tramp3d-v4 | C++ | Numerical analysis | Template-intensive numerical program based on FreePOOMA |
+| Trimaran | C | Encryption | 3des, md5, crc |
+| TSVC | C | Vectorization benchmark | Test Suite for Vectorizing Compilers (TSVC) |
+| VersaBench | C | Benchmark suite | 8b10b, beamformer, bmm, dbms, ecbdes |
+
+All MultiSource applications are suitable for performance measurements
+and will still run when the `TEST_SUITE_BENCHMARKING_ONLY` CMake option is
+set to `ON`.
 
 Configuration
 -------------
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index f5c23b1b4e014d..237d328721609b 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -157,7 +157,7 @@ class PtrUseVisitorBase {
   ///
   /// This will visit the users with the same offset of the current visit
   /// (including an unknown offset if that is the current state).
-  void enqueueUsers(Instruction &I);
+  void enqueueUsers(Value &I);
 
   /// Walk the operands of a GEP and adjust the offset as appropriate.
   ///
@@ -208,11 +208,14 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
   /// Recursively visit the uses of the given pointer.
   /// \returns An info struct about the pointer. See \c PtrInfo for details.
-  PtrInfo visitPtr(Instruction &I) {
+  /// We may also need to process Argument pointers, so the input is typed as
+  /// the common Value base class.
+  PtrInfo visitPtr(Value &I) {
     // This must be a pointer type. Get an integer type suitable to hold
     // offsets on this pointer.
     // FIXME: Support a vector of pointers.
     assert(I.getType()->isPointerTy());
+    assert(isa<Instruction>(I) || isa<Argument>(I));
     IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(I.getType()));
     IsOffsetKnown = true;
     Offset = APInt(IntIdxTy->getBitWidth(), 0);
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index a4cc814549c95b..21e28d546286ee 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -313,13 +313,19 @@ enum class ResourceKind : uint32_t {
 
 ArrayRef<EnumEntry<ResourceKind>> getResourceKinds();
 
-#define RESOURCE_FLAG(Val, Enum) Enum = Val,
-enum class ResourceFlag : uint32_t {
-#include "DXContainerConstants.def"
+#define RESOURCE_FLAG(Index, Enum) bool Enum = false;
+struct ResourceFlags {
+  ResourceFlags() {};
+  struct FlagsBits {
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+  };
+  union {
+    uint32_t Flags;
+    FlagsBits Bits;
+  };
+  bool operator==(const uint32_t RFlags) const { return Flags == RFlags; }
 };
 
-ArrayRef<EnumEntry<ResourceFlag>> getResourceFlags();
-
 namespace v0 {
 struct RuntimeInfo {
   PipelinePSVInfo StageInfo;
@@ -439,12 +445,12 @@ struct RuntimeInfo : public v1::RuntimeInfo {
 
 struct ResourceBindInfo : public v0::ResourceBindInfo {
   ResourceKind Kind;
-  uint32_t Flags;
+  ResourceFlags Flags;
 
   void swapBytes() {
     v0::ResourceBindInfo::swapBytes();
     sys::swapByteOrder(Kind);
-    sys::swapByteOrder(Flags);
+    sys::swapByteOrder(Flags.Flags);
   }
 };
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 4111cecb018bb3..1aacbb2f65b27f 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -190,8 +190,7 @@ RESOURCE_KIND(18, FeedbackTexture2DArray)
 #endif // RESOURCE_KIND
 
 #ifdef RESOURCE_FLAG
-RESOURCE_FLAG(0, None)
-RESOURCE_FLAG(1, UsedByAtomic64)
+RESOURCE_FLAG(0, UsedByAtomic64)
 
 #undef RESOURCE_FLAG
 #endif // RESOURCE_FLAG
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 8a2e6583af87c5..4beac37a583445 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -759,7 +759,6 @@ enum AttributeKindCodes {
   ATTR_KIND_INITIALIZES = 94,
   ATTR_KIND_HYBRID_PATCHABLE = 95,
   ATTR_KIND_SANITIZE_REALTIME = 96,
-  ATTR_KIND_NO_SANITIZE_REALTIME = 97,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 80936c0ee83355..891e34fec0c798 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -212,9 +212,6 @@ def NoSanitizeBounds : EnumAttr<"nosanitize_bounds", [FnAttr]>;
 /// No SanitizeCoverage instrumentation.
 def NoSanitizeCoverage : EnumAttr<"nosanitize_coverage", [FnAttr]>;
 
-/// No SanitizeRealtime instrumentation.
-def NoSanitizeRealtime : EnumAttr<"nosanitize_realtime", [FnAttr]>;
-
 /// Null pointer in address space zero is valid.
 def NullPointerIsValid : EnumAttr<"null_pointer_is_valid", [FnAttr]>;
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index e432359b7bbd07..66ad057ab0e30f 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -72,6 +72,7 @@ struct ShaderHash {
   std::vector<llvm::yaml::Hex8> Digest;
 };
 
+using ResourceFlags = dxbc::PSV::ResourceFlags;
 using ResourceBindInfo = dxbc::PSV::v2::ResourceBindInfo;
 
 struct SignatureElement {
@@ -178,7 +179,6 @@ LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ComponentType)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::InterpolationMode)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceType)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceKind)
-LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::PSV::ResourceFlag)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::D3DSystemValue)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigComponentType)
 LLVM_YAML_DECLARE_ENUM_TRAITS(llvm::dxbc::SigMinPrecision)
@@ -221,6 +221,10 @@ template <> struct MappingTraits<DXContainerYAML::Object> {
   static void mapping(IO &IO, DXContainerYAML::Object &Obj);
 };
 
+template <> struct MappingTraits<DXContainerYAML::ResourceFlags> {
+  static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags);
+};
+
 template <> struct MappingTraits<DXContainerYAML::ResourceBindInfo> {
   static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res);
 };
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index 0f7752eda6d66f..2ed7243fa612f4 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -113,6 +113,7 @@ namespace sandboxir {
 
 class BasicBlock;
 class ConstantInt;
+class ConstantFP;
 class Context;
 class Function;
 class Instruction;
@@ -597,6 +598,94 @@ class ConstantInt : public Constant {
 #endif
 };
 
+// TODO: This should inherit from ConstantData.
+class ConstantFP final : public Constant {
+  ConstantFP(llvm::ConstantFP *C, Context &Ctx)
+      : Constant(ClassID::ConstantFP, C, Ctx) {}
+  friend class Context; // For constructor.
+
+public:
+  /// This returns a ConstantFP, or a vector containing a splat of a ConstantFP,
+  /// for the specified value in the specified type. This should only be used
+  /// for simple constant values like 2.0/1.0 etc, that are known-valid both as
+  /// host double and as the target format.
+  static Constant *get(Type *Ty, double V);
+
+  /// If Ty is a vector type, return a Constant with a splat of the given
+  /// value. Otherwise return a ConstantFP for the given value.
+  static Constant *get(Type *Ty, const APFloat &V);
+
+  static Constant *get(Type *Ty, StringRef Str);
+
+  static ConstantFP *get(const APFloat &V, Context &Ctx);
+
+  static Constant *getNaN(Type *Ty, bool Negative = false,
+                          uint64_t Payload = 0);
+  static Constant *getQNaN(Type *Ty, bool Negative = false,
+                           APInt *Payload = nullptr);
+  static Constant *getSNaN(Type *Ty, bool Negative = false,
+                           APInt *Payload = nullptr);
+  static Constant *getZero(Type *Ty, bool Negative = false);
+
+  static Constant *getNegativeZero(Type *Ty);
+  static Constant *getInfinity(Type *Ty, bool Negative = false);
+
+  /// Return true if Ty is big enough to represent V.
+  static bool isValueValidForType(Type *Ty, const APFloat &V);
+
+  inline const APFloat &getValueAPF() const {
+    return cast<llvm::ConstantFP>(Val)->getValueAPF();
+  }
+  inline const APFloat &getValue() const {
+    return cast<llvm::ConstantFP>(Val)->getValue();
+  }
+
+  /// Return true if the value is positive or negative zero.
+  bool isZero() const { return cast<llvm::ConstantFP>(Val)->isZero(); }
+
+  /// Return true if the sign bit is set.
+  bool isNegative() const { return cast<llvm::ConstantFP>(Val)->isNegative(); }
+
+  /// Return true if the value is infinity
+  bool isInfinity() const { return cast<llvm::ConstantFP>(Val)->isInfinity(); }
+
+  /// Return true if the value is a NaN.
+  bool isNaN() const { return cast<llvm::ConstantFP>(Val)->isNaN(); }
+
+  /// We don't rely on operator== working on double values, as it returns true
+  /// for things that are clearly not equal, like -0.0 and 0.0.
+  /// As such, this method can be used to do an exact bit-for-bit comparison of
+  /// two floating point values. The version with a double operand is retained
+  /// because it's so convenient to write isExactlyValue(2.0), but please use
+  /// it only for simple constants.
+  bool isExactlyValue(const APFloat &V) const {
+    return cast<llvm::ConstantFP>(Val)->isExactlyValue(V);
+  }
+
+  bool isExactlyValue(double V) const {
+    return cast<llvm::ConstantFP>(Val)->isExactlyValue(V);
+  }
+
+  /// For isa/dyn_cast.
+  static bool classof(const sandboxir::Value *From) {
+    return From->getSubclassID() == ClassID::ConstantFP;
+  }
+
+  // TODO: Better name: getOperandNo(const Use&). Should be private.
+  unsigned getUseOperandNo(const Use &Use) const final {
+    llvm_unreachable("ConstantFP has no operands!");
+  }
+#ifndef NDEBUG
+  void verify() const override {
+    assert(isa<llvm::ConstantFP>(Val) && "Expected a ConstantFP!");
+  }
+  void dumpOS(raw_ostream &OS) const override {
+    dumpCommonPrefix(OS);
+    dumpCommonSuffix(OS);
+  }
+#endif
+};
+
 /// Iterator for `Instruction`s in a `BasicBlock.
 /// \Returns an sandboxir::Instruction & when derereferenced.
 class BBIterator {
@@ -3156,7 +3245,10 @@ class Context {
   Constant *getOrCreateConstant(llvm::Constant *LLVMC) {
     return cast<Constant>(getOrCreateValueInternal(LLVMC, 0));
   }
-  friend class ConstantInt; // For getOrCreateConstant().
+  // Friends for getOrCreateConstant().
+#define DEF_CONST(ID, CLASS) friend class CLASS;
+#include "llvm/SandboxIR/SandboxIRValues.def"
+
   /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will
   /// also create all contents of the block.
   BasicBlock *createBasicBlock(llvm::BasicBlock *BB);
diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
index d29fc3b5e95871..2fc24ed71c4cf6 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def
+++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
@@ -26,6 +26,7 @@ DEF_USER(User, User)
 DEF_VALUE(Block, BasicBlock)
 DEF_CONST(Constant, Constant)
 DEF_CONST(ConstantInt, ConstantInt)
+DEF_CONST(ConstantFP, ConstantFP)
 
 #ifndef DEF_INSTR
 #define DEF_INSTR(ID, OPCODE, CLASS)
diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h
index 4588cd2f738876..89e787f5f5d4b2 100644
--- a/llvm/include/llvm/SandboxIR/Type.h
+++ b/llvm/include/llvm/SandboxIR/Type.h
@@ -27,6 +27,7 @@ class PointerType;
 class VectorType;
 class FunctionType;
 #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS;
+#define DEF_CONST(ID, CLASS) class CLASS;
 #include "llvm/SandboxIR/SandboxIRValues.def"
 
 /// Just like llvm::Type these are immutable, unique, never get freed and can
@@ -42,7 +43,7 @@ class Type {
   friend class ConstantInt; // For LLVMTy.
   // Friend all instruction classes because `create()` functions use LLVMTy.
 #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS;
-  // TODO: Friend DEF_CONST()
+#define DEF_CONST(ID, CLASS) friend class CLASS;
 #include "llvm/SandboxIR/SandboxIRValues.def"
 
   Context &Ctx;
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index f5258601fd5d49..ba3619417114c7 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -1040,28 +1040,13 @@ Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp,
   case RecurKind::Xor:
   case RecurKind::Add:
   case RecurKind::Or:
-    // Adding, Xoring, Oring zero to a number does not change it.
-    return ConstantInt::get(Tp, 0);
   case RecurKind::Mul:
-    // Multiplying a number by 1 does not change it.
-    return ConstantInt::get(Tp, 1);
   case RecurKind::And:
-    // AND-ing a number with an all-1 value does not change it.
-    return ConstantInt::get(Tp, -1, true);
   case RecurKind::FMul:
-    // Multiplying a number by 1 does not change it.
-    return ConstantFP::get(Tp, 1.0L);
-  case RecurKind::FMulAdd:
   case RecurKind::FAdd:
-    // Adding zero to a number does not change it.
-    // FIXME: Ideally we should not need to check FMF for FAdd and should always
-    // use -0.0. However, this will currently result in mixed vectors of 0.0/-0.0.
-    // Instead, we should ensure that 1) the FMF from FAdd are propagated to the PHI
-    // nodes where possible, and 2) PHIs with the nsz flag + -0.0 use 0.0. This would
-    // mean we can then remove the check for noSignedZeros() below (see D98963).
-    if (FMF.noSignedZeros())
-      return ConstantFP::get(Tp, 0.0L);
-    return ConstantFP::get(Tp, -0.0L);
+    return ConstantExpr::getBinOpIdentity(getOpcode(K), Tp, false,
+                                          FMF.noSignedZeros());
+  case RecurKind::FMulAdd:
+    return ConstantExpr::getBinOpIdentity(Instruction::FAdd, Tp, false,
+                                          FMF.noSignedZeros());
   case RecurKind::UMin:
     return ConstantInt::get(Tp, -1, true);
   case RecurKind::UMax:
diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp
index 49304818d7efed..9c79546f491eff 100644
--- a/llvm/lib/Analysis/PtrUseVisitor.cpp
+++ b/llvm/lib/Analysis/PtrUseVisitor.cpp
@@ -17,7 +17,7 @@
 
 using namespace llvm;
 
-void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
+void detail::PtrUseVisitorBase::enqueueUsers(Value &I) {
   for (Use &U : I.uses()) {
     if (VisitedUses.insert(&U).second) {
       UseToVisit NewU = {
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 173faa32a3878d..533fe62fb8cdd6 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -5921,6 +5921,61 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
     break;
   }
+  case Instruction::BitCast: {
+    const Value *Src;
+    if (!match(Op, m_ElementWiseBitCast(m_Value(Src))) ||
+        !Src->getType()->isIntOrIntVectorTy())
+      break;
+
+    const Type *Ty = Op->getType()->getScalarType();
+    KnownBits Bits(Ty->getScalarSizeInBits());
+    computeKnownBits(Src, DemandedElts, Bits, Depth + 1, Q);
+
+    // Transfer information from the sign bit.
+    if (Bits.isNonNegative())
+      Known.signBitMustBeZero();
+    else if (Bits.isNegative())
+      Known.signBitMustBeOne();
+
+    if (Ty->isIEEE()) {
+      // IEEE floats are NaN when all bits of the exponent plus at least one of
+      // the fraction bits are 1. This means:
+      //   - If we assume unknown bits are 0 and the value is NaN, it will
+      //     always be NaN
+      //   - If we assume unknown bits are 1 and the value is not NaN, it can
+      //     never be NaN
+      if (APFloat(Ty->getFltSemantics(), Bits.One).isNaN())
+        Known.KnownFPClasses = fcNan;
+      else if (!APFloat(Ty->getFltSemantics(), ~Bits.Zero).isNaN())
+        Known.knownNot(fcNan);
+
+      // Build KnownBits representing Inf and check if it must be equal or
+      // unequal to this value.
+      auto InfKB = KnownBits::makeConstant(
+          APFloat::getInf(Ty->getFltSemantics()).bitcastToAPInt());
+      InfKB.Zero.clearSignBit();
+      if (const auto InfResult = KnownBits::eq(Bits, InfKB)) {
+        assert(!InfResult.value());
+        Known.knownNot(fcInf);
+      } else if (Bits == InfKB) {
+        Known.KnownFPClasses = fcInf;
+      }
+
+      // Build KnownBits representing Zero and check if it must be equal or
+      // unequal to this value.
+      auto ZeroKB = KnownBits::makeConstant(
+          APFloat::getZero(Ty->getFltSemantics()).bitcastToAPInt());
+      ZeroKB.Zero.clearSignBit();
+      if (const auto ZeroResult = KnownBits::eq(Bits, ZeroKB)) {
+        assert(!ZeroResult.value());
+        Known.knownNot(fcZero);
+      } else if (Bits == ZeroKB) {
+        Known.KnownFPClasses = fcZero;
+      }
+    }
+
+    break;
+  }
   default:
     break;
   }
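// Editor's sketch (not part of this patch): the NaN reasoning above is easy
// to verify on concrete float32 bit patterns
// (sign[31] | exponent[30:23] | fraction[22:0]). A minimal standalone C++20
// illustration, independent of the APFloat/KnownBits machinery:
#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  // If even one exponent bit is known 0, the all-ones completion of the
  // unknown bits still cannot be NaN (the exponent is never all ones).
  uint32_t allOnesCompletion = 0x7f000000u | 0x007fffffu;
  assert(!std::isnan(std::bit_cast<float>(allOnesCompletion)));

  // If the exponent is known all ones and some fraction bit is known 1, the
  // all-zeros completion is already NaN, so the value is always NaN.
  uint32_t allZerosCompletion = 0x7f800000u | 0x00400000u;
  assert(std::isnan(std::bit_cast<float>(allZerosCompletion)));
  return 0;
}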
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index cc742ab35f4498..32ce34114b2f50 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -66,9 +66,15 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::umul_fix:
   case Intrinsic::umul_fix_sat:
   case Intrinsic::sqrt: // Begin floating-point.
+  case Intrinsic::asin:
+  case Intrinsic::acos:
+  case Intrinsic::atan:
   case Intrinsic::sin:
   case Intrinsic::cos:
   case Intrinsic::tan:
+  case Intrinsic::sinh:
+  case Intrinsic::cosh:
+  case Intrinsic::tanh:
   case Intrinsic::exp:
   case Intrinsic::exp2:
   case Intrinsic::log:
diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp
index 790947cc729c0b..97ceb16ccf53f4 100644
--- a/llvm/lib/BinaryFormat/DXContainer.cpp
+++ b/llvm/lib/BinaryFormat/DXContainer.cpp
@@ -109,13 +109,3 @@ static const EnumEntry<PSV::ResourceKind> ResourceKindNames[] = {
 ArrayRef<EnumEntry<PSV::ResourceKind>> PSV::getResourceKinds() {
   return ArrayRef(ResourceKindNames);
 }
-
-#define RESOURCE_FLAG(Val, Enum) {#Enum, PSV::ResourceFlag::Enum},
-
-static const EnumEntry<PSV::ResourceFlag> ResourceFlagNames[] = {
-#include "llvm/BinaryFormat/DXContainerConstants.def"
-};
-
-ArrayRef<EnumEntry<PSV::ResourceFlag>> PSV::getResourceFlags() {
-  return ArrayRef(ResourceFlagNames);
-}
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 974a05023c72a5..654be985a3229c 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2093,8 +2093,6 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
     return Attribute::NoSanitizeBounds;
   case bitc::ATTR_KIND_NO_SANITIZE_COVERAGE:
     return Attribute::NoSanitizeCoverage;
-  case bitc::ATTR_KIND_NO_SANITIZE_REALTIME:
-    return Attribute::NoSanitizeRealtime;
   case bitc::ATTR_KIND_NULL_POINTER_IS_VALID:
     return Attribute::NullPointerIsValid;
   case bitc::ATTR_KIND_OPTIMIZE_FOR_DEBUGGING:
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 3c5097f4af7c56..26fd02b3e1a043 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -795,8 +795,6 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
     return bitc::ATTR_KIND_NO_SANITIZE_BOUNDS;
   case Attribute::NoSanitizeCoverage:
     return bitc::ATTR_KIND_NO_SANITIZE_COVERAGE;
-  case llvm::Attribute::NoSanitizeRealtime:
-    return bitc::ATTR_KIND_NO_SANITIZE_REALTIME;
   case Attribute::NullPointerIsValid:
     return bitc::ATTR_KIND_NULL_POINTER_IS_VALID;
   case Attribute::OptimizeForDebugging:
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 675d88d6d38cd9..5140f5951d6d3f 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include <cassert>
 
 using namespace llvm;
@@ -437,69 +438,33 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
   default:
     llvm_unreachable("Impossible reduction kind");
   case Intrinsic::vp_reduce_add:
-    Reduction = Builder.CreateAddReduce(RedOp);
-    Reduction = Builder.CreateAdd(Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_mul:
-    Reduction = Builder.CreateMulReduce(RedOp);
-    Reduction = Builder.CreateMul(Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_and:
-    Reduction = Builder.CreateAndReduce(RedOp);
-    Reduction = Builder.CreateAnd(Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_or:
-    Reduction = Builder.CreateOrReduce(RedOp);
-    Reduction = Builder.CreateOr(Reduction, Start);
-    break;
-  case Intrinsic::vp_reduce_xor:
-    Reduction = Builder.CreateXorReduce(RedOp);
-    Reduction = Builder.CreateXor(Reduction, Start);
-    break;
-  case Intrinsic::vp_reduce_smax:
-    Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true);
+  case Intrinsic::vp_reduce_xor: {
+    Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID();
+    unsigned Opc = getArithmeticReductionInstruction(RedID);
+    assert(Instruction::isBinaryOp(Opc));
+    Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp);
     Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::smax, Reduction, Start);
+        Builder.CreateBinOp((Instruction::BinaryOps)Opc, Reduction, Start);
     break;
+  }
+  case Intrinsic::vp_reduce_smax:
   case Intrinsic::vp_reduce_smin:
-    Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true);
-    Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::smin, Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_umax:
-    Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false);
-    Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::umax, Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_umin:
-    Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false);
-    Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::umin, Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_fmax:
-    Reduction = Builder.CreateFPMaxReduce(RedOp);
-    transferDecorations(*Reduction, VPI);
-    Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_fmin:
-    Reduction = Builder.CreateFPMinReduce(RedOp);
-    transferDecorations(*Reduction, VPI);
-    Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start);
-    break;
   case Intrinsic::vp_reduce_fmaximum:
-    Reduction = Builder.CreateFPMaximumReduce(RedOp);
-    transferDecorations(*Reduction, VPI);
-    Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::maximum, Reduction, Start);
-    break;
-  case Intrinsic::vp_reduce_fminimum:
-    Reduction = Builder.CreateFPMinimumReduce(RedOp);
+  case Intrinsic::vp_reduce_fminimum: {
+    Intrinsic::ID RedID = *VPI.getFunctionalIntrinsicID();
+    Intrinsic::ID ScalarID = getMinMaxReductionIntrinsicOp(RedID);
+    Reduction = Builder.CreateUnaryIntrinsic(RedID, RedOp);
     transferDecorations(*Reduction, VPI);
-    Reduction =
-        Builder.CreateBinaryIntrinsic(Intrinsic::minimum, Reduction, Start);
+    Reduction = Builder.CreateBinaryIntrinsic(ScalarID, Reduction, Start);
     break;
+  }
   case Intrinsic::vp_reduce_fadd:
     Reduction = Builder.CreateFAddReduce(Start, RedOp);
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 2557fa288606e7..87221c14433ab5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -135,6 +135,9 @@ class VectorLegalizer {
   SDValue ExpandVP_SELECT(SDNode *Node);
   SDValue ExpandVP_MERGE(SDNode *Node);
   SDValue ExpandVP_REM(SDNode *Node);
+  SDValue ExpandVP_FNEG(SDNode *Node);
+  SDValue ExpandVP_FABS(SDNode *Node);
+  SDValue ExpandVP_FCOPYSIGN(SDNode *Node);
   SDValue ExpandSELECT(SDNode *Node);
   std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
   SDValue ExpandStore(SDNode *N);
@@ -699,6 +702,11 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
     // These operations are used to do promotion so they can't be promoted
     // themselves.
     llvm_unreachable("Don't know how to promote this operation!");
+  case ISD::VP_FABS:
+  case ISD::VP_FCOPYSIGN:
+  case ISD::VP_FNEG:
+    // Promoting fabs, fneg, and fcopysign changes their semantics.
+    llvm_unreachable("These operations should not be promoted");
   }
 
   // There are currently two cases of vector promotion:
@@ -887,6 +895,24 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
     }
     break;
+  case ISD::VP_FNEG:
+    if (SDValue Expanded = ExpandVP_FNEG(Node)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
+  case ISD::VP_FABS:
+    if (SDValue Expanded = ExpandVP_FABS(Node)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
+  case ISD::VP_FCOPYSIGN:
+    if (SDValue Expanded = ExpandVP_FCOPYSIGN(Node)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
   case ISD::SELECT:
     Results.push_back(ExpandSELECT(Node));
     return;
@@ -1557,6 +1583,80 @@ SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
   return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL);
 }
 
+SDValue VectorLegalizer::ExpandVP_FNEG(SDNode *Node) {
+  EVT VT = Node->getValueType(0);
+  EVT IntVT = VT.changeVectorElementTypeToInteger();
+
+  if (!TLI.isOperationLegalOrCustom(ISD::VP_XOR, IntVT))
+    return SDValue();
+
+  SDValue Mask = Node->getOperand(1);
+  SDValue EVL = Node->getOperand(2);
+
+  SDLoc DL(Node);
+  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0));
+  SDValue SignMask = DAG.getConstant(
+      APInt::getSignMask(IntVT.getScalarSizeInBits()), DL, IntVT);
+  SDValue Xor = DAG.getNode(ISD::VP_XOR, DL, IntVT, Cast, SignMask, Mask, EVL);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Xor);
+}
+
+SDValue VectorLegalizer::ExpandVP_FABS(SDNode *Node) {
+  EVT VT = Node->getValueType(0);
+  EVT IntVT = VT.changeVectorElementTypeToInteger();
+
+  if (!TLI.isOperationLegalOrCustom(ISD::VP_AND, IntVT))
+    return SDValue();
+
+  SDValue Mask = Node->getOperand(1);
+  SDValue EVL = Node->getOperand(2);
+
+  SDLoc DL(Node);
+  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0));
+  SDValue ClearSignMask = DAG.getConstant(
+      APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT);
+  SDValue ClearSign =
+      DAG.getNode(ISD::VP_AND, DL, IntVT, Cast, ClearSignMask, Mask, EVL);
+  return DAG.getNode(ISD::BITCAST, DL, VT, ClearSign);
+}
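// Editor's sketch (not part of this patch): both expansions above lean on the
// same IEEE-754 property: negation and absolute value only touch the sign
// bit, so they reduce to integer bitwise ops once each lane is bitcast to an
// integer. The scalar analogue in standalone C++20 (function names are
// illustrative only):
#include <bit>
#include <cstdint>

float fnegViaXor(float x) { // per-lane effect of ExpandVP_FNEG's VP_XOR
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^ 0x80000000u);
}

float fabsViaAnd(float x) { // ExpandVP_FABS: AND with the signed-max mask
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & 0x7fffffffu);
}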
+
+SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
+  EVT VT = Node->getValueType(0);
+
+  if (VT != Node->getOperand(1).getValueType())
+    return SDValue();
+
+  EVT IntVT = VT.changeVectorElementTypeToInteger();
+  if (!TLI.isOperationLegalOrCustom(ISD::VP_AND, IntVT) ||
+      !TLI.isOperationLegalOrCustom(ISD::VP_XOR, IntVT))
+    return SDValue();
+
+  SDValue Mask = Node->getOperand(2);
+  SDValue EVL = Node->getOperand(3);
+
+  SDLoc DL(Node);
+  SDValue Mag = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(0));
+  SDValue Sign = DAG.getNode(ISD::BITCAST, DL, IntVT, Node->getOperand(1));
+
+  SDValue SignMask = DAG.getConstant(
+      APInt::getSignMask(IntVT.getScalarSizeInBits()), DL, IntVT);
+  SDValue SignBit =
+      DAG.getNode(ISD::VP_AND, DL, IntVT, Sign, SignMask, Mask, EVL);
+
+  SDValue ClearSignMask = DAG.getConstant(
+      APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT);
+  SDValue ClearedSign =
+      DAG.getNode(ISD::VP_AND, DL, IntVT, Mag, ClearSignMask, Mask, EVL);
+
+  SDNodeFlags Flags;
+  Flags.setDisjoint(true);
+
+  SDValue CopiedSign = DAG.getNode(ISD::VP_OR, DL, IntVT, ClearedSign, SignBit,
+                                   Mask, EVL, Flags);
+
+  return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign);
+}
+
 void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
                                        SmallVectorImpl<SDValue> &Results) {
   // Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9efcd3f25797b5..7f57b6db40ef49 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -13267,7 +13267,9 @@ SDValue SelectionDAG::getNeutralElement(unsigned Opcode, const SDLoc &DL,
   case ISD::SMIN:
     return getConstant(APInt::getSignedMaxValue(VT.getSizeInBits()), DL, VT);
   case ISD::FADD:
-    return getConstantFP(-0.0, DL, VT);
+    // If flags allow, prefer positive zero since it's generally cheaper
+    // to materialize on most targets.
+    return getConstantFP(Flags.hasNoSignedZeros() ? 0.0 : -0.0, DL, VT);
   case ISD::FMUL:
     return getConstantFP(1.0, DL, VT);
   case ISD::FMINNUM:
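// Editor's sketch (not part of this patch): the new FADD comment is about
// signed zero. -0.0 is the true additive identity, while +0.0 is only usable
// when -0.0 need not be preserved (the nsz case checked above). A small
// standalone demonstration:
#include <cassert>
#include <cmath>

int main() {
  double negZero = -0.0;
  // x + (-0.0) == x for every x, including x == -0.0 ...
  assert(std::signbit(negZero + -0.0)); // result is still -0.0
  // ... but x + (+0.0) rewrites -0.0 to +0.0, so +0.0 is only a valid
  // neutral element under the no-signed-zeros flag.
  assert(!std::signbit(negZero + 0.0)); // result is now +0.0
  return 0;
}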
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 9fcb850a147a8e..e9900adf2b3130 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2223,12 +2223,6 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
           "Attributes 'optdebug and optnone' are incompatible!", V);
   }
 
-  Check(!(Attrs.hasFnAttr(Attribute::SanitizeRealtime) &&
-          Attrs.hasFnAttr(Attribute::NoSanitizeRealtime)),
-        "Attributes "
-        "'sanitize_realtime and nosanitize_realtime' are incompatible!",
-        V);
-
   if (Attrs.hasFnAttr(Attribute::OptimizeForDebugging)) {
     Check(!Attrs.hasFnAttr(Attribute::OptimizeForSize),
           "Attributes 'optsize and optdebug' are incompatible!", V);
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 21a966d5abd132..5dee1221b27c01 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -206,6 +206,12 @@ void MappingTraits<DXContainerYAML::Object>::mapping(
   IO.mapRequired("Parts", Obj.Parts);
 }
 
+void MappingTraits<DXContainerYAML::ResourceFlags>::mapping(
+    IO &IO, DXContainerYAML::ResourceFlags &Flags) {
+#define RESOURCE_FLAG(FlagIndex, Enum) IO.mapRequired(#Enum, Flags.Bits.Enum);
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+}
+
 void MappingTraits<DXContainerYAML::ResourceBindInfo>::mapping(
     IO &IO, DXContainerYAML::ResourceBindInfo &Res) {
   IO.mapRequired("Type", Res.Type);
@@ -266,12 +272,6 @@ void ScalarEnumerationTraits<dxbc::PSV::ResourceKind>::enumeration(
     IO.enumCase(Value, E.Name.str().c_str(), E.Value);
 }
 
-void ScalarEnumerationTraits<dxbc::PSV::ResourceFlag>::enumeration(
-    IO &IO, dxbc::PSV::ResourceFlag &Value) {
-  for (const auto &E : dxbc::PSV::getResourceFlags())
-    IO.enumCase(Value, E.Name.str().c_str(), E.Value);
-}
-
 void ScalarEnumerationTraits<dxbc::D3DSystemValue>::enumeration(
     IO &IO, dxbc::D3DSystemValue &Value) {
   for (const auto &E : dxbc::getD3DSystemValues())
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index bf224b73f3bad2..6bdc580f751d18 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -2248,6 +2248,54 @@ ConstantInt *ConstantInt::get(Type *Ty, uint64_t V, bool IsSigned) {
   return cast<ConstantInt>(Ty->getContext().getOrCreateConstant(LLVMC));
 }
 
+Constant *ConstantFP::get(Type *Ty, double V) {
+  auto *LLVMC = llvm::ConstantFP::get(Ty->LLVMTy, V);
+  return Ty->getContext().getOrCreateConstant(LLVMC);
+}
+
+Constant *ConstantFP::get(Type *Ty, const APFloat &V) {
+  auto *LLVMC = llvm::ConstantFP::get(Ty->LLVMTy, V);
+  return Ty->getContext().getOrCreateConstant(LLVMC);
+}
+
+Constant *ConstantFP::get(Type *Ty, StringRef Str) {
+  auto *LLVMC = llvm::ConstantFP::get(Ty->LLVMTy, Str);
+  return Ty->getContext().getOrCreateConstant(LLVMC);
+}
+
+ConstantFP *ConstantFP::get(const APFloat &V, Context &Ctx) {
+  auto *LLVMC = llvm::ConstantFP::get(Ctx.LLVMCtx, V);
+  return cast<ConstantFP>(Ctx.getOrCreateConstant(LLVMC));
+}
+
+Constant *ConstantFP::getNaN(Type *Ty, bool Negative, uint64_t Payload) {
+  auto *LLVMC = llvm::ConstantFP::getNaN(Ty->LLVMTy, Negative, Payload);
+  return cast<Constant>(Ty->getContext().getOrCreateConstant(LLVMC));
+}
+Constant *ConstantFP::getQNaN(Type *Ty, bool Negative, APInt *Payload) {
+  auto *LLVMC = llvm::ConstantFP::getQNaN(Ty->LLVMTy, Negative, Payload);
+  return cast<Constant>(Ty->getContext().getOrCreateConstant(LLVMC));
+}
+Constant *ConstantFP::getSNaN(Type *Ty, bool Negative, APInt *Payload) {
+  auto *LLVMC = llvm::ConstantFP::getSNaN(Ty->LLVMTy, Negative, Payload);
+  return cast<Constant>(Ty->getContext().getOrCreateConstant(LLVMC));
+}
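// Editor's sketch (not part of this patch): a hypothetical caller of the
// factories defined above, mirroring the llvm::ConstantFP API. `FloatTy`
// stands in for a float sandboxir::Type obtained from a live
// sandboxir::Context; SandboxIR.h and <cassert> are assumed.
void exampleConstantFPUse(sandboxir::Type *FloatTy) {
  sandboxir::Constant *One = sandboxir::ConstantFP::get(FloatTy, 1.0);
  sandboxir::Constant *NegZero =
      sandboxir::ConstantFP::getNegativeZero(FloatTy);
  if (auto *CF = llvm::dyn_cast<sandboxir::ConstantFP>(One))
    assert(CF->isExactlyValue(1.0) && !CF->isNegative());
  (void)NegZero;
}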
+Constant *ConstantFP::getZero(Type *Ty, bool Negative) {
+  auto *LLVMC = llvm::ConstantFP::getZero(Ty->LLVMTy, Negative);
+  return cast<Constant>(Ty->getContext().getOrCreateConstant(LLVMC));
+}
+Constant *ConstantFP::getNegativeZero(Type *Ty) {
+  auto *LLVMC = llvm::ConstantFP::getNegativeZero(Ty->LLVMTy);
+  return cast<Constant>(Ty->getContext().getOrCreateConstant(LLVMC));
+}
+Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) {
+  auto *LLVMC = llvm::ConstantFP::getInfinity(Ty->LLVMTy, Negative);
+  return cast<Constant>(Ty->getContext().getOrCreateConstant(LLVMC));
+}
+bool ConstantFP::isValueValidForType(Type *Ty, const APFloat &V) {
+  return llvm::ConstantFP::isValueValidForType(Ty->LLVMTy, V);
+}
+
 FunctionType *Function::getFunctionType() const {
   return cast<FunctionType>(
       Ctx.getType(cast<llvm::Function>(Val)->getFunctionType()));
@@ -2339,6 +2387,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
       It->second = std::unique_ptr<ConstantInt>(new ConstantInt(CI, *this));
      return It->second.get();
     }
+    if (auto *CF = dyn_cast<llvm::ConstantFP>(C)) {
+      It->second = std::unique_ptr<ConstantFP>(new ConstantFP(CF, *this));
+      return It->second.get();
+    }
     if (auto *F = dyn_cast<llvm::Function>(LLVMV))
       It->second = std::unique_ptr<Function>(new Function(F, *this));
     else
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3296f63a9b8876..11aca69db0a148 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11463,7 +11463,9 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
     // movw+movk is fused). So we limit up to 2 instrdduction at most.
     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
     AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
-    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
+    assert(Insn.size() <= 4 &&
+           "Should be able to build any value with at most 4 moves");
+    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
     IsLegal = Insn.size() <= Limit;
   }
 
@@ -19852,7 +19854,6 @@ static SDValue performConcatVectorsCombine(SDNode *N,
   // This optimization reduces instruction count.
   if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
       N00->getOperand(1) == N10->getOperand(1)) {
-
     SDValue N000 = N00->getOperand(0);
     SDValue N100 = N10->getOperand(0);
     uint64_t N001ConstVal = N00->getConstantOperandVal(1),
@@ -19860,7 +19861,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
              NScalarSize = N->getValueType(0).getScalarSizeInBits();
 
     if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
-
+      N000 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N000);
+      N100 = DAG.getNode(AArch64ISD::NVCAST, dl, VT, N100);
       SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, dl, VT, N000, N100);
       SDValue NewShiftConstant =
           DAG.getConstant(N001ConstVal - NScalarSize, dl, MVT::i32);
@@ -29344,8 +29346,10 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
     assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
            "Expected vectors of equal size!");
     // TODO: Enable assert once bogus creations have been fixed.
-    // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
-    //        "Expected result vector with half the lanes of its input!");
+    if (VT.isScalableVector())
+      break;
+    assert(OpVT.getVectorElementCount() == VT.getVectorElementCount() * 2 &&
+           "Expected result vector with half the lanes of its input!");
     break;
   }
   case AArch64ISD::TRN1:
@@ -29362,7 +29366,9 @@ void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
     assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
            "Expected vectors!");
     // TODO: Enable assert once bogus creations have been fixed.
-    // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
+    if (VT.isScalableVector())
+      break;
+    assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
     break;
   }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index bd5684a287381a..9f96f6c5e83ec4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -98,7 +98,7 @@ static cl::opt<bool> EnableCollectLOH(
 static cl::opt<bool>
     EnableDeadRegisterElimination("aarch64-enable-dead-defs", cl::Hidden,
                                   cl::desc("Enable the pass that removes dead"
-                                           " definitons and replaces stores to"
+                                           " definitions and replaces stores to"
                                            " them with stores to the zero"
                                            " register"),
                                   cl::init(true));
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 37add682b150e7..34c0fad45fc499 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -6947,10 +6947,14 @@ static void ExpandCryptoAEK(const AArch64::ArchInfo &ArchInfo,
   }
 }
 
+static SMLoc incrementLoc(SMLoc L, int Offset) {
+  return SMLoc::getFromPointer(L.getPointer() + Offset);
+}
+
 /// parseDirectiveArch
 ///   ::= .arch token
 bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
-  SMLoc ArchLoc = getLoc();
+  SMLoc CurLoc = getLoc();
 
   StringRef Arch, ExtensionString;
   std::tie(Arch, ExtensionString) =
@@ -6958,7 +6962,7 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
 
   const AArch64::ArchInfo *ArchInfo = AArch64::parseArch(Arch);
   if (!ArchInfo)
-    return Error(ArchLoc, "unknown arch name");
+    return Error(CurLoc, "unknown arch name");
 
   if (parseToken(AsmToken::EndOfStatement))
     return true;
@@ -6978,27 +6982,30 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
     ExtensionString.split(RequestedExtensions, '+');
 
   ExpandCryptoAEK(*ArchInfo, RequestedExtensions);
+  CurLoc = incrementLoc(CurLoc, Arch.size());
 
-  FeatureBitset Features = STI.getFeatureBits();
-  setAvailableFeatures(ComputeAvailableFeatures(Features));
   for (auto Name : RequestedExtensions) {
+    // Advance source location past '+'.
+    CurLoc = incrementLoc(CurLoc, 1);
+
     bool EnableFeature = !Name.consume_front_insensitive("no");
 
-    for (const auto &Extension : ExtensionMap) {
-      if (Extension.Name != Name)
-        continue;
+    auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) {
+      return Extension.Name == Name;
+    });
 
-      if (Extension.Features.none())
-        report_fatal_error("unsupported architectural extension: " + Name);
+    if (It == std::end(ExtensionMap))
+      Error(CurLoc, "unsupported architectural extension: " + Name);
 
-      FeatureBitset ToggleFeatures =
-          EnableFeature
-              ? STI.SetFeatureBitsTransitively(~Features & Extension.Features)
-              : STI.ToggleFeature(Features & Extension.Features);
-      setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures));
-      break;
-    }
+    if (EnableFeature)
+      STI.SetFeatureBitsTransitively(It->Features);
+    else
+      STI.ClearFeatureBitsTransitively(It->Features);
+
+    CurLoc = incrementLoc(CurLoc, Name.size());
   }
+  FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits());
+  setAvailableFeatures(Features);
   return false;
 }
 
@@ -7018,28 +7025,21 @@ bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) {
     Name = Name.substr(2);
   }
 
-  MCSubtargetInfo &STI = copySTI();
-  FeatureBitset Features = STI.getFeatureBits();
-  for (const auto &Extension : ExtensionMap) {
-    if (Extension.Name != Name)
-      continue;
-
-    if (Extension.Features.none())
-      return Error(ExtLoc, "unsupported architectural extension: " + Name);
-
-    FeatureBitset ToggleFeatures =
-        EnableFeature
-            ? STI.SetFeatureBitsTransitively(~Features & Extension.Features)
-            : STI.ToggleFeature(Features & Extension.Features);
-    setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures));
-    return false;
-  }
+  auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) {
+    return Extension.Name == Name;
+  });
 
-  return Error(ExtLoc, "unknown architectural extension: " + Name);
-}
+  if (It == std::end(ExtensionMap))
+    return Error(ExtLoc, "unsupported architectural extension: " + Name);
 
-static SMLoc incrementLoc(SMLoc L, int Offset) {
-  return SMLoc::getFromPointer(L.getPointer() + Offset);
+  MCSubtargetInfo &STI = copySTI();
+  if (EnableFeature)
+    STI.SetFeatureBitsTransitively(It->Features);
+  else
+    STI.ClearFeatureBitsTransitively(It->Features);
+  FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits());
+  setAvailableFeatures(Features);
+  return false;
 }
 
 /// parseDirectiveCPU
@@ -7075,30 +7075,22 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
 
     bool EnableFeature = !Name.consume_front_insensitive("no");
 
-    bool FoundExtension = false;
-    for (const auto &Extension : ExtensionMap) {
-      if (Extension.Name != Name)
-        continue;
-
-      if (Extension.Features.none())
-        report_fatal_error("unsupported architectural extension: " + Name);
-
-      FeatureBitset Features = STI.getFeatureBits();
-      FeatureBitset ToggleFeatures =
-          EnableFeature
-              ? STI.SetFeatureBitsTransitively(~Features & Extension.Features)
-              : STI.ToggleFeature(Features & Extension.Features);
-      setAvailableFeatures(ComputeAvailableFeatures(ToggleFeatures));
-      FoundExtension = true;
+    auto It = llvm::find_if(ExtensionMap, [&Name](const auto &Extension) {
+      return Extension.Name == Name;
+    });
 
-      break;
-    }
+    if (It == std::end(ExtensionMap))
+      Error(CurLoc, "unsupported architectural extension: " + Name);
 
-    if (!FoundExtension)
-      Error(CurLoc, "unsupported architectural extension");
+    if (EnableFeature)
+      STI.SetFeatureBitsTransitively(It->Features);
+    else
+      STI.ClearFeatureBitsTransitively(It->Features);
 
     CurLoc = incrementLoc(CurLoc, Name.size());
   }
+  FeatureBitset Features = ComputeAvailableFeatures(STI.getFeatureBits());
+  setAvailableFeatures(Features);
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index a5807a70582b39..df084cf41c4783 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -7,36 +7,33 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file Implements a module splitting algorithm designed to support the
-/// FullLTO --lto-partitions option for parallel codegen.
+/// FullLTO --lto-partitions option for parallel codegen. This is completely
+/// different from the common SplitModule pass, as this system is designed with
+/// AMDGPU in mind.
 ///
-/// The role of this module splitting pass is the same as
-/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
-/// across a set of N partitions to allow for parallel codegen.
+/// The basic idea of this module splitting implementation is the same as
+/// SplitModule: load-balance the module's functions across a set of N
+/// partitions to allow parallel codegen. However, it does it very
+/// differently from the target-agnostic variant:
+///  - The module has "split roots", which are kernels in the vast
+///    majority of cases.
+///  - Each root has a set of dependencies, and when a root and its
+///    dependencies are considered "big", we try to put it in a partition where
+///    most dependencies are already imported, to avoid duplicating large
+///    amounts of code.
+///  - There's special care for indirect calls in order to ensure
+///    AMDGPUResourceUsageAnalysis can work correctly.
 ///
-/// The similarities mostly end here, as this pass achieves load-balancing in a
-/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
-/// advantage of the structure of AMDGPU modules (which are mostly
-/// self-contained) to allow for more efficient splitting without affecting
-/// codegen negatively, or causing innaccurate resource usage analysis.
-///
-/// High-level pass overview:
-///   - SplitGraph & associated classes
-///     - Graph representation of the module and of the dependencies that
-///       matter for splitting.
-///   - RecursiveSearchSplitting
-///     - Core splitting algorithm.
-///   - SplitProposal
-///     - Represents a suggested solution for splitting the input module. These
-///       solutions can be scored to determine the best one when multiple
-///       solutions are available.
-///   - Driver/pass "run" function glues everything together.
+/// This file also includes a more elaborate logging system to enable
+/// users to easily generate logs that (if desired) do not include any value
+/// names, in order to not leak information about the source file.
+/// Such logs are very helpful to understand and fix potential issues with
+/// module splitting.
 
 #include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetMachine.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -47,56 +44,44 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/DOTGraphTraits.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Timer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SHA256.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
-#ifndef NDEBUG
-#include "llvm/Support/LockFileManager.h"
-#endif
+using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-split-module"
 
-namespace llvm {
 namespace {
 
-static cl::opt<unsigned> MaxDepth(
-    "amdgpu-module-splitting-max-depth",
-    cl::desc(
-        "maximum search depth. 0 forces a greedy approach. "
-        "warning: the algorithm is up to O(2^N), where N is the max depth."),
-    cl::init(8));
-
 static cl::opt<float> LargeFnFactor(
-    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
+    "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
+    cl::Hidden,
     cl::desc(
-        "when max depth is reached and we can no longer branch out, this "
-        "value determines if a function is worth merging into an already "
-        "existing partition to reduce code duplication. This is a factor "
-        "of the ideal partition size, e.g. 2.0 means we consider the "
-        "function for merging if its cost (including its callees) is 2x the "
-        "size of an ideal partition."));
+        "consider a function as large and needing special treatment when the "
+        "cost of importing it into a partition "
+        "exceeds the average cost of a partition by this factor; e.g. 2.0 "
+        "means if the function and its dependencies are 2 times bigger than "
+        "an average partition; 0 disables large function handling entirely"));
 
 static cl::opt<float> LargeFnOverlapForMerge(
-    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
-    cl::desc("when a function is considered for merging into a partition that "
-             "already contains some of its callees, do the merge if at least "
-             "n% of the code it can reach is already present inside the "
-             "partition; e.g. 0.7 means only merge >70%"));
+    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
+    cl::Hidden,
+    cl::desc(
+        "defines how much overlap between two large functions' dependencies "
+        "is needed to put them in the same partition"));
 
 static cl::opt<bool> NoExternalizeGlobals(
     "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -104,92 +89,142 @@ static cl::opt<bool> NoExternalizeGlobals(
            "may cause globals to be duplicated which increases binary size"));
 
 static cl::opt<std::string>
-    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
-                       cl::Hidden,
-                       cl::desc("output file to write out the dotgraph "
-                                "representation of the input module"));
+    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
+              cl::desc("output directory for AMDGPU module splitting logs"));
 
-static cl::opt<std::string> PartitionSummariesOutput(
-    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
-    cl::desc("output file to write out a summary of "
-             "the partitions created for each module"));
-
-#ifndef NDEBUG
 static cl::opt<bool>
-    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
-                cl::desc("use a lock file so only one process in the system "
-                         "can run this pass at once. useful to avoid mangled "
-                         "debug output in multithreaded environments."));
+    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
+               cl::desc("hash value names before printing them in the AMDGPU "
+                        "module splitting logs"));
 
-static cl::opt<bool>
-    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
-                        cl::Hidden,
-                        cl::desc("print all proposals received and whether "
-                                 "they were rejected or accepted"));
-#endif
+using CostType = InstructionCost::CostType;
+using PartitionID = unsigned;
+using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
 
-struct SplitModuleTimer : NamedRegionTimer {
-  SplitModuleTimer(StringRef Name, StringRef Desc)
-      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
-                         TimePassesIsEnabled) {}
-};
+static bool isEntryPoint(const Function *F) {
+  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
+}
 
-//===----------------------------------------------------------------------===//
-// Utils
-//===----------------------------------------------------------------------===//
+static std::string getName(const Value &V) {
+  static bool HideNames;
 
-using CostType = InstructionCost::CostType;
-using FunctionsCostMap = DenseMap<const Function *, CostType>;
-using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
-static constexpr unsigned InvalidPID = -1;
+  static llvm::once_flag HideNameInitFlag;
+  llvm::call_once(HideNameInitFlag, [&]() {
+    if (LogPrivate.getNumOccurrences())
+      HideNames = LogPrivate;
+    else {
+      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
+      HideNames = (EV.value_or("0") != "0");
+    }
+  });
 
-/// \param Num numerator
-/// \param Dem denominator
-/// \returns a printable object to print (Num/Dem) using "%0.2f".
-static auto formatRatioOf(CostType Num, CostType Dem) {
-  return format("%0.2f", (static_cast<double>(Num) / Dem) * 100);
+  if (!HideNames)
+    return V.getName().str();
+  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
+               /*LowerCase=*/true);
 }
 
-/// Checks whether a given function is non-copyable.
+/// Main logging helper.
 ///
-/// Non-copyable functions cannot be cloned into multiple partitions, and only
-/// one copy of the function can be present across all partitions.
+/// Logging can be configured by the following environment variables.
+/// AMD_SPLIT_MODULE_LOG_DIR=<dir>
+///   If set, uses <dir> as the directory to write logfiles to
+///   each time module splitting is used.
+/// AMD_SPLIT_MODULE_LOG_PRIVATE
+///   If set to anything other than zero, all names are hidden.
 ///
-/// External functions fall into this category. If we were to clone them, we
-/// would end up with multiple symbol definitions and a very unhappy linker.
-static bool isNonCopyable(const Function &F) {
-  assert(AMDGPU::isEntryFunctionCC(F.getCallingConv())
-             ? F.hasExternalLinkage()
-             : true && "Kernel w/o external linkage?");
-  return F.hasExternalLinkage() || !F.isDefinitionExact();
-}
+/// Both environment variables have corresponding CL options which
+/// take priority over them.
+///
+/// Any output printed to the log files is also printed to dbgs() when -debug is
+/// used and LLVM_DEBUG is defined.
+///
+/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
+/// cannot be removed from the code (by building without debug). This probably
+/// has a small performance cost because if some computation/formatting is
+/// needed for logging purposes, it may be done every time only to be ignored
+/// by the logger.
+///
+/// As this pass only runs once and is not doing anything computationally
+/// expensive, this is likely a reasonable trade-off.
+///
+/// If some computation should really be avoided when unused, users of the class
+/// can check whether any logging will occur by using the bool operator.
+///
+/// \code
+/// if (SML) {
+///   // Executes only if logging to a file or if -debug is available and
+///   // used.
+/// }
+/// \endcode
+class SplitModuleLogger {
+public:
+  SplitModuleLogger(const Module &M) {
+    std::string LogDir = LogDirOpt;
+    if (LogDir.empty())
+      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
+
+    // No log dir specified means we don't need to log to a file.
+    // We may still log to dbgs(), though.
+    if (LogDir.empty())
+      return;
+
+    // If a log directory is specified, create a new file with a unique name in
+    // that directory.
+    int Fd;
+    SmallString<0> PathTemplate;
+    SmallString<0> RealPath;
+    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
+    if (auto Err =
+            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
+      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
+                             "': " + Err.message(),
+                         /*CrashDiag=*/false);
+    }
 
-/// If \p GV has local linkage, make it external + hidden.
-static void externalize(GlobalValue &GV) {
-  if (GV.hasLocalLinkage()) {
-    GV.setLinkage(GlobalValue::ExternalLinkage);
-    GV.setVisibility(GlobalValue::HiddenVisibility);
+    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
   }
 
-  // Unnamed entities must be named consistently between modules. setName will
-  // give a distinct name to each such entity.
-  if (!GV.hasName())
-    GV.setName("__llvmsplit_unnamed");
+  bool hasLogFile() const { return FileOS != nullptr; }
+
+  raw_ostream &logfile() {
+    assert(FileOS && "no logfile!");
+    return *FileOS;
+  }
+
+  /// \returns true if this SML will log anything either to a file or dbgs().
+  /// Can be used to avoid expensive computations that are ignored when logging
+  /// is disabled.
+  operator bool() const {
+    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
+  }
+
+private:
+  std::unique_ptr<raw_fd_ostream> FileOS;
+};
+
+template <typename Ty>
+static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
+  static_assert(
+      !std::is_same_v<Ty, Value>,
+      "do not print values to logs directly, use handleName instead!");
+  LLVM_DEBUG(dbgs() << Val);
+  if (SML.hasLogFile())
+    SML.logfile() << Val;
+  return SML;
 }
 
-/// Cost analysis function. Calculates the cost of each function in \p M
-///
+/// Calculate the cost of each function in \p M
+/// \param SML Log Helper
 /// \param GetTTI Abstract getter for TargetTransformInfo.
 /// \param M Module to analyze.
 /// \param CostMap[out] Resulting Function -> Cost map.
 /// \return The module's total cost.
-static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
-                                       FunctionsCostMap &CostMap) {
-  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");
-
-  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
+static CostType
+calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
                        DenseMap<const Function *, CostType> &CostMap) {
   CostType ModuleCost = 0;
-  [[maybe_unused]] CostType KernelCost = 0;
+  CostType KernelCost = 0;
 
   for (auto &Fn : M) {
     if (Fn.isDeclaration())
@@ -216,30 +251,23 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
     assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
     ModuleCost += FnCost;
 
-    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
+    if (isEntryPoint(&Fn))
       KernelCost += FnCost;
   }
 
-  if (CostMap.empty())
-    return 0;
-
-  assert(ModuleCost);
-  LLVM_DEBUG({
-    const CostType FnCost = ModuleCost - KernelCost;
-    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
-           << "" << KernelCost << " ("
-           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
-           << "% of the module), functions cost " << FnCost << " ("
-           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
-           << "% of the module)\n";
-  });
+  CostType FnCost = (ModuleCost - KernelCost);
+  CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
+  SML << "=> Total Module Cost: " << ModuleCost << '\n'
+      << "  => KernelCost: " << KernelCost << " ("
+      << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
+      << "  => FnsCost: " << FnCost << " ("
+      << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
 
   return ModuleCost;
 }
 
-/// \return true if \p F can be indirectly called
 static bool canBeIndirectlyCalled(const Function &F) {
-  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+  if (F.isDeclaration() || isEntryPoint(&F))
     return false;
   return !F.hasLocalLinkage() ||
          F.hasAddressTaken(/*PutOffender=*/nullptr,
@@ -250,1081 +278,351 @@ static bool canBeIndirectlyCalled(const Function &F) {
                            /*IgnoreCastedDirectCall=*/true);
 }
 
-//===----------------------------------------------------------------------===//
-// Graph-based Module Representation
-//===----------------------------------------------------------------------===//
-
-/// AMDGPUSplitModule's view of the source Module, as a graph of all components
-/// that can be split into different modules.
-///
-/// The most trivial instance of this graph is just the CallGraph of the module,
-/// but it is not guaranteed that the graph is strictly equal to the CG. It
-/// currently always is but it's designed in a way that would eventually allow
-/// us to create abstract nodes, or nodes for different entities such as global
-/// variables or any other meaningful constraint we must consider.
+/// When a function or any of its callees performs an indirect call, this
+/// takes over \ref addAllDependencies and adds all potentially callable
+/// functions to \p Fns so they can be counted as dependencies of the function.
 ///
-/// The graph is only mutable by this class, and is generally not modified
-/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
-/// mutate it.
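
The `getName()` helper added above gates name hiding behind a one-time initialization in which the `-amdgpu-module-splitting-log-private` flag, when given, overrides the `AMD_SPLIT_MODULE_LOG_PRIVATE` environment variable. A minimal standalone sketch of that pattern, using `std::call_once` and `std::getenv` in place of the LLVM equivalents; `readPrivacySetting` is a hypothetical name, not part of the patch:

```cpp
#include <cstdlib>
#include <mutex>
#include <optional>
#include <string>

// Hypothetical helper mirroring getName()'s initialization: an explicit
// command-line value wins; otherwise fall back to the environment, where
// any value other than "0" enables name hiding.
static bool readPrivacySetting(std::optional<bool> CommandLineValue) {
  static bool HideNames = false;
  static std::once_flag InitFlag;
  std::call_once(InitFlag, [&] {
    if (CommandLineValue)
      HideNames = *CommandLineValue;
    else if (const char *EV = std::getenv("AMD_SPLIT_MODULE_LOG_PRIVATE"))
      HideNames = std::string(EV) != "0";
  });
  return HideNames;
}
```

The once-flag matters because `getName` is called from the logging path on every value printed; the environment should only be consulted on the first call.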
-class SplitGraph { -public: - class Node; - - enum class EdgeKind : uint8_t { - /// The nodes are related through a direct call. This is a "strong" edge as - /// it means the Src will directly reference the Dst. - DirectCall, - /// The nodes are related through an indirect call. - /// This is a "weaker" edge and is only considered when traversing the graph - /// starting from a kernel. We need this edge for resource usage analysis. - /// - /// The reason why we have this edge in the first place is due to how - /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call, - /// the resource usage of the kernel containing the indirect call is the - /// max resource usage of all functions that can be indirectly called. - IndirectCall, - }; - - /// An edge between two nodes. Edges are directional, and tagged with a - /// "kind". - struct Edge { - Edge(Node *Src, Node *Dst, EdgeKind Kind) - : Src(Src), Dst(Dst), Kind(Kind) {} - - Node *Src; ///< Source - Node *Dst; ///< Destination - EdgeKind Kind; - }; - - using EdgesVec = SmallVector; - using edges_iterator = EdgesVec::const_iterator; - using nodes_iterator = const Node *const *; - - SplitGraph(const Module &M, const FunctionsCostMap &CostMap, - CostType ModuleCost) - : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {} - - void buildGraph(CallGraph &CG); - -#ifndef NDEBUG - bool verifyGraph() const; -#endif - - bool empty() const { return Nodes.empty(); } - const iterator_range nodes() const { - return {Nodes.begin(), Nodes.end()}; +/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the +/// presence of an indirect call, the function's resource usage is the same as +/// the most expensive function in the module. +/// \param M The module. +/// \param Fns[out] Resulting list of functions. +static void addAllIndirectCallDependencies(const Module &M, + DenseSet &Fns) { + for (const auto &Fn : M) { + if (canBeIndirectlyCalled(Fn)) + Fns.insert(&Fn); } - const Node &getNode(unsigned ID) const { return *Nodes[ID]; } - - unsigned getNumNodes() const { return Nodes.size(); } - BitVector createNodesBitVector() const { return BitVector(Nodes.size()); } - - const Module &getModule() const { return M; } - - CostType getModuleCost() const { return ModuleCost; } - CostType getCost(const Function &F) const { return CostMap.at(&F); } - - /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node - /// IDs). - CostType calculateCost(const BitVector &BV) const; - -private: - /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and - /// updates \p Cache. - Node &getNode(DenseMap &Cache, - const GlobalValue &GV); - - // Create a new edge between two nodes and add it to both nodes. - const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK); - - const Module &M; - const FunctionsCostMap &CostMap; - CostType ModuleCost; - - // Final list of nodes with stable ordering. - SmallVector Nodes; - - SpecificBumpPtrAllocator NodesPool; - - // Edges are trivially destructible objects, so as a small optimization we - // use a BumpPtrAllocator which avoids destructor calls but also makes - // allocation faster. - static_assert( - std::is_trivially_destructible_v, - "Edge must be trivially destructible to use the BumpPtrAllocator"); - BumpPtrAllocator EdgesPool; -}; +} -/// Nodes in the SplitGraph contain both incoming, and outgoing edges. -/// Incoming edges have this node as their Dst, and Outgoing ones have this node -/// as their Src. 
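
The `addAllIndirectCallDependencies` helper added above is intentionally blunt: it does not try to resolve an indirect call, it collects every function that could possibly be its target. A rough standalone illustration of the same filter, with a hypothetical `FuncInfo` record standing in for the queries made on `llvm::Function`:

```cpp
#include <set>
#include <vector>

// Hypothetical stand-in for the llvm::Function properties consulted here.
struct FuncInfo {
  bool IsDeclaration;
  bool IsEntryPoint;    // kernel entry points cannot be indirect targets
  bool HasLocalLinkage;
  bool AddressTaken;
};

// Mirrors canBeIndirectlyCalled() feeding addAllIndirectCallDependencies():
// a function is a potential indirect target if it has a body, is not an
// entry point, and is either externally visible or address-taken.
static void addAllIndirectTargets(const std::vector<const FuncInfo *> &Module,
                                  std::set<const FuncInfo *> &Deps) {
  for (const FuncInfo *F : Module) {
    if (F->IsDeclaration || F->IsEntryPoint)
      continue;
    if (!F->HasLocalLinkage || F->AddressTaken)
      Deps.insert(F);
  }
}
```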
+/// Adds the functions that \p Fn may call to \p Fns, then recurses into each +/// callee until all reachable functions have been gathered. /// -/// Edge objects are shared by both nodes in Src/Dst. They provide immediate -/// feedback on how two nodes are related, and in which direction they are -/// related, which is valuable information to make splitting decisions. -/// -/// Nodes are fundamentally abstract, and any consumers of the graph should -/// treat them as such. While a node will be a function most of the time, we -/// could also create nodes for any other reason. In the future, we could have -/// single nodes for multiple functions, or nodes for GVs, etc. -class SplitGraph::Node { - friend class SplitGraph; - -public: - Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost, - bool IsNonCopyable) - : ID(ID), GV(GV), IndividualCost(IndividualCost), - IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) { - if (auto *Fn = dyn_cast(&GV)) - IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv()); - } - - /// An 0-indexed ID for the node. The maximum ID (exclusive) is the number of - /// nodes in the graph. This ID can be used as an index in a BitVector. - unsigned getID() const { return ID; } - - const Function &getFunction() const { return cast(GV); } - - /// \returns the cost to import this component into a given module, not - /// accounting for any dependencies that may need to be imported as well. - CostType getIndividualCost() const { return IndividualCost; } - - bool isNonCopyable() const { return IsNonCopyable; } - bool isEntryFunctionCC() const { return IsEntryFnCC; } - - /// \returns whether this is an entry point in the graph. Entry points are - /// defined as follows: if you take all entry points in the graph, and iterate - /// their dependencies, you are guaranteed to visit all nodes in the graph at - /// least once. - bool isGraphEntryPoint() const { return IsGraphEntry; } - - StringRef getName() const { return GV.getName(); } - - bool hasAnyIncomingEdges() const { return IncomingEdges.size(); } - bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const { - return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; }); - } - - bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); } - bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const { - return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; }); - } - - iterator_range incoming_edges() const { - return IncomingEdges; - } - - iterator_range outgoing_edges() const { - return OutgoingEdges; - } - - bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); } - - /// Visit all children of this node in a recursive fashion. Also visits Self. - /// If \ref shouldFollowIndirectCalls returns false, then this only follows - /// DirectCall edges. - /// - /// \param Visitor Visitor Function. - void visitAllDependencies(std::function Visitor) const; - - /// Adds the depedencies of this node in \p BV by setting the bit - /// corresponding to each node. - /// - /// Implemented using \ref visitAllDependencies, hence it follows the same - /// rules regarding dependencies traversal. - /// - /// \param[out] BV The bitvector where the bits should be set. - void getDependencies(BitVector &BV) const { - visitAllDependencies([&](const Node &N) { BV.set(N.getID()); }); - } - - /// Uses \ref visitAllDependencies to aggregate the individual cost of this - /// node and all of its dependencies. - /// - /// This is cached. 
- CostType getFullCost() const; - -private: - void markAsGraphEntry() { IsGraphEntry = true; } - - unsigned ID; - const GlobalValue &GV; - CostType IndividualCost; - bool IsNonCopyable : 1; - bool IsEntryFnCC : 1; - bool IsGraphEntry : 1; - - // TODO: Cache dependencies as well? - mutable CostType FullCost = 0; - - // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped - // together) - EdgesVec IncomingEdges; - EdgesVec OutgoingEdges; -}; - -void SplitGraph::Node::visitAllDependencies( - std::function Visitor) const { - const bool FollowIndirect = shouldFollowIndirectCalls(); - // FIXME: If this can access SplitGraph in the future, use a BitVector - // instead. - DenseSet Seen; - SmallVector WorkList({this}); +/// \param SML Log Helper +/// \param CG Call graph for \p Fn's module. +/// \param Fn Current function to look at. +/// \param Fns[out] Resulting list of functions. +/// \param OnlyDirect Whether to only consider direct callees. +/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some +/// point, either in \p Fn or in one of the function it calls. When that +/// happens, we fall back to adding all callable functions inside \p Fn's module +/// to \p Fns. +static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, + const Function &Fn, + DenseSet &Fns, bool OnlyDirect, + bool &HadIndirectCall) { + assert(!Fn.isDeclaration()); + + const Module &M = *Fn.getParent(); + SmallVector WorkList({&Fn}); while (!WorkList.empty()) { - const Node *CurN = WorkList.pop_back_val(); - if (auto [It, Inserted] = Seen.insert(CurN); !Inserted) - continue; - - Visitor(*CurN); - - for (const Edge *E : CurN->outgoing_edges()) { - if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall) - continue; - WorkList.push_back(E->Dst); - } - } -} - -CostType SplitGraph::Node::getFullCost() const { - if (FullCost) - return FullCost; - - assert(FullCost == 0); - visitAllDependencies( - [&](const Node &N) { FullCost += N.getIndividualCost(); }); - return FullCost; -} + const auto &CurFn = *WorkList.pop_back_val(); + assert(!CurFn.isDeclaration()); -void SplitGraph::buildGraph(CallGraph &CG) { - SplitModuleTimer SMT("buildGraph", "graph construction"); - LLVM_DEBUG( - dbgs() - << "[build graph] constructing graph representation of the input\n"); - - // We build the graph by just iterating all functions in the module and - // working on their direct callees. At the end, all nodes should be linked - // together as expected. - DenseMap Cache; - SmallVector FnsWithIndirectCalls, IndirectlyCallableFns; - for (const Function &Fn : M) { - if (Fn.isDeclaration()) - continue; + // Scan for an indirect call. If such a call is found, we have to + // conservatively assume this can call all non-entrypoint functions in the + // module. - // Look at direct callees and create the necessary edges in the graph. - bool HasIndirectCall = false; - Node &N = getNode(Cache, Fn); - for (auto &CGEntry : *CG[&Fn]) { + for (auto &CGEntry : *CG[&CurFn]) { auto *CGNode = CGEntry.second; auto *Callee = CGNode->getFunction(); if (!Callee) { - // TODO: Don't consider inline assembly as indirect calls. - if (CGNode == CG.getCallsExternalNode()) - HasIndirectCall = true; + if (OnlyDirect) + continue; + + // Functions have an edge towards CallsExternalNode if they're external + // declarations, or if they do an indirect call. As we only process + // definitions here, we know this means the function has an indirect + // call. 
We then have to conservatively assume this can call all + // non-entrypoint functions in the module. + if (CGNode != CG.getCallsExternalNode()) + continue; // this is another function-less node we don't care about. + + SML << "Indirect call detected in " << getName(CurFn) + << " - treating all non-entrypoint functions as " + "potential dependencies\n"; + + // TODO: Print an ORE as well ? + addAllIndirectCallDependencies(M, Fns); + HadIndirectCall = true; continue; } - if (!Callee->isDeclaration()) - createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall); - } - - // Keep track of this function if it contains an indirect call and/or if it - // can be indirectly called. - if (HasIndirectCall) { - LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n"); - FnsWithIndirectCalls.push_back(&Fn); - } - - if (canBeIndirectlyCalled(Fn)) - IndirectlyCallableFns.push_back(&Fn); - } + if (Callee->isDeclaration()) + continue; - // Post-process functions with indirect calls. - for (const Function *Fn : FnsWithIndirectCalls) { - for (const Function *Candidate : IndirectlyCallableFns) { - Node &Src = getNode(Cache, *Fn); - Node &Dst = getNode(Cache, *Candidate); - createEdge(Src, Dst, EdgeKind::IndirectCall); + auto [It, Inserted] = Fns.insert(Callee); + if (Inserted) + WorkList.push_back(Callee); } } - - // Now, find all entry points. - SmallVector CandidateEntryPoints; - BitVector NodesReachableByKernels = createNodesBitVector(); - for (Node *N : Nodes) { - // Functions with an Entry CC are always graph entry points too. - if (N->isEntryFunctionCC()) { - N->markAsGraphEntry(); - N->getDependencies(NodesReachableByKernels); - } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall)) - CandidateEntryPoints.push_back(N); - } - - for (Node *N : CandidateEntryPoints) { - // This can be another entry point if it's not reachable by a kernel - // TODO: We could sort all of the possible new entries in a stable order - // (e.g. by cost), then consume them one by one until - // NodesReachableByKernels is all 1s. It'd allow us to avoid - // considering some nodes as non-entries in some specific cases. - if (!NodesReachableByKernels.test(N->getID())) - N->markAsGraphEntry(); - } - -#ifndef NDEBUG - assert(verifyGraph()); -#endif } -#ifndef NDEBUG -bool SplitGraph::verifyGraph() const { - unsigned ExpectedID = 0; - // Exceptionally using a set here in case IDs are messed up. 
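
Stripped of the `CallGraph` plumbing and logging, `addAllDependencies` above is a standard worklist traversal: pop a function, scan its callees, push the ones not yet seen. The shape of it is roughly the following sketch; the string-keyed adjacency map is an assumption made for self-containment, not the structure the pass uses:

```cpp
#include <map>
#include <set>
#include <string>
#include <vector>

using CallGraphMap = std::map<std::string, std::vector<std::string>>;

// Gather every function transitively reachable from Root, visiting each
// one exactly once, as in the worklist loop of addAllDependencies.
static std::set<std::string> collectDependencies(const CallGraphMap &CG,
                                                 const std::string &Root) {
  std::set<std::string> Seen;
  std::vector<std::string> WorkList{Root};
  while (!WorkList.empty()) {
    std::string Cur = WorkList.back();
    WorkList.pop_back();
    auto It = CG.find(Cur);
    if (It == CG.end())
      continue; // treat unknown functions as leaves
    for (const std::string &Callee : It->second) {
      // insert() reports whether the callee was already visited, so each
      // function is pushed onto the worklist at most once.
      if (Seen.insert(Callee).second)
        WorkList.push_back(Callee);
    }
  }
  return Seen;
}
```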
- DenseSet SeenNodes; - DenseSet SeenFunctionNodes; - for (const Node *N : Nodes) { - if (N->getID() != (ExpectedID++)) { - errs() << "Node IDs are incorrect!\n"; - return false; - } - - if (!SeenNodes.insert(N).second) { - errs() << "Node seen more than once!\n"; - return false; - } - - if (&getNode(N->getID()) != N) { - errs() << "getNode doesn't return the right node\n"; - return false; - } - - for (const Edge *E : N->IncomingEdges) { - if (!E->Src || !E->Dst || (E->Dst != N) || - (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) { - errs() << "ill-formed incoming edges\n"; - return false; - } - } - - for (const Edge *E : N->OutgoingEdges) { - if (!E->Src || !E->Dst || (E->Src != N) || - (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) { - errs() << "ill-formed outgoing edges\n"; - return false; - } - } - - const Function &Fn = N->getFunction(); - if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) { - if (N->hasAnyIncomingEdges()) { - errs() << "Kernels cannot have incoming edges\n"; - return false; - } - } - - if (Fn.isDeclaration()) { - errs() << "declarations shouldn't have nodes!\n"; - return false; - } - - auto [It, Inserted] = SeenFunctionNodes.insert(&Fn); - if (!Inserted) { - errs() << "one function has multiple nodes!\n"; - return false; +/// Contains information about a function and its dependencies. +/// This is a splitting root. The splitting algorithm works by +/// assigning these to partitions. +struct FunctionWithDependencies { + FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG, + const DenseMap &FnCosts, + const Function *Fn) + : Fn(Fn) { + // When Fn is not a kernel, we don't need to collect indirect callees. + // Resource usage analysis is only performed on kernels, and we collect + // indirect callees for resource usage analysis. + addAllDependencies(SML, CG, *Fn, Dependencies, + /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall); + TotalCost = FnCosts.at(Fn); + for (const auto *Dep : Dependencies) { + TotalCost += FnCosts.at(Dep); + + // We cannot duplicate functions with external linkage, or functions that + // may be overriden at runtime. + HasNonDuplicatableDependecy |= + (Dep->hasExternalLinkage() || !Dep->isDefinitionExact()); } } - if (ExpectedID != Nodes.size()) { - errs() << "Node IDs out of sync!\n"; - return false; - } - - if (createNodesBitVector().size() != getNumNodes()) { - errs() << "nodes bit vector doesn't have the right size!\n"; - return false; - } - - // Check we respect the promise of Node::isKernel - BitVector BV = createNodesBitVector(); - for (const Node *N : nodes()) { - if (N->isGraphEntryPoint()) - N->getDependencies(BV); - } - - // Ensure each function in the module has an associated node. 
- for (const auto &Fn : M) { - if (!Fn.isDeclaration()) { - if (!SeenFunctionNodes.contains(&Fn)) { - errs() << "Fn has no associated node in the graph!\n"; - return false; - } - } - } - - if (!BV.all()) { - errs() << "not all nodes are reachable through the graph's entry points!\n"; - return false; - } - - return true; -} -#endif - -CostType SplitGraph::calculateCost(const BitVector &BV) const { - CostType Cost = 0; - for (unsigned NodeID : BV.set_bits()) - Cost += getNode(NodeID).getIndividualCost(); - return Cost; -} - -SplitGraph::Node & -SplitGraph::getNode(DenseMap &Cache, - const GlobalValue &GV) { - auto &N = Cache[&GV]; - if (N) - return *N; - - CostType Cost = 0; - bool NonCopyable = false; - if (const Function *Fn = dyn_cast(&GV)) { - NonCopyable = isNonCopyable(*Fn); - Cost = CostMap.at(Fn); - } - N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable); - Nodes.push_back(N); - assert(&getNode(N->getID()) == N); - return *N; -} - -const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst, - EdgeKind EK) { - const Edge *E = new (EdgesPool.Allocate(1)) Edge(&Src, &Dst, EK); - Src.OutgoingEdges.push_back(E); - Dst.IncomingEdges.push_back(E); - return *E; -} - -//===----------------------------------------------------------------------===// -// Split Proposals -//===----------------------------------------------------------------------===// - -/// Represents a module splitting proposal. -/// -/// Proposals are made of N BitVectors, one for each partition, where each bit -/// set indicates that the node is present and should be copied inside that -/// partition. -/// -/// Proposals have several metrics attached so they can be compared/sorted, -/// which the driver to try multiple strategies resultings in multiple proposals -/// and choose the best one out of them. -class SplitProposal { -public: - SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) { - Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()}); - } + const Function *Fn = nullptr; + DenseSet Dependencies; + /// Whether \p Fn or any of its \ref Dependencies contains an indirect call. + bool HasIndirectCall = false; + /// Whether any of \p Fn's dependencies cannot be duplicated. + bool HasNonDuplicatableDependecy = false; - void setName(StringRef NewName) { Name = NewName; } - StringRef getName() const { return Name; } - - const BitVector &operator[](unsigned PID) const { - return Partitions[PID].second; - } - - void add(unsigned PID, const BitVector &BV) { - Partitions[PID].second |= BV; - updateScore(PID); - } - - void print(raw_ostream &OS) const; - LLVM_DUMP_METHOD void dump() const { print(dbgs()); } - - // Find the cheapest partition (lowest cost). In case of ties, always returns - // the highest partition number. - unsigned findCheapestPartition() const; - - /// Calculate the CodeSize and Bottleneck scores. - void calculateScores(); - -#ifndef NDEBUG - void verifyCompleteness() const; -#endif - - /// Only available after \ref calculateScores is called. - /// - /// A positive number indicating the % of code duplication that this proposal - /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by - /// duplicating some functions across partitions. - /// - /// Value is always rounded up to 3 decimal places. - /// - /// A perfect score would be 0.0, and anything approaching 1.0 is very bad. - double getCodeSizeScore() const { return CodeSizeScore; } - - /// Only available after \ref calculateScores is called. 
- /// - /// A number between [0, 1] which indicates how big of a bottleneck is - /// expected from the largest partition. - /// - /// A score of 1.0 means the biggest partition is as big as the source module, - /// so build time will be equal to or greater than the build time of the - /// initial input. - /// - /// Value is always rounded up to 3 decimal places. - /// - /// This is one of the metrics used to estimate this proposal's build time. - double getBottleneckScore() const { return BottleneckScore; } - -private: - void updateScore(unsigned PID) { - assert(SG); - for (auto &[PCost, Nodes] : Partitions) { - TotalCost -= PCost; - PCost = SG->calculateCost(Nodes); - TotalCost += PCost; - } - } - - /// \see getCodeSizeScore - double CodeSizeScore = 0.0; - /// \see getBottleneckScore - double BottleneckScore = 0.0; - /// Aggregated cost of all partitions CostType TotalCost = 0; - const SplitGraph *SG = nullptr; - std::string Name; - - std::vector> Partitions; -}; - -void SplitProposal::print(raw_ostream &OS) const { - assert(SG); - - OS << "[proposal] " << Name << ", total cost:" << TotalCost - << ", code size score:" << format("%0.3f", CodeSizeScore) - << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n'; - for (const auto &[PID, Part] : enumerate(Partitions)) { - const auto &[Cost, NodeIDs] = Part; - OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost - << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n"; - } -} - -unsigned SplitProposal::findCheapestPartition() const { - assert(!Partitions.empty()); - CostType CurCost = std::numeric_limits::max(); - unsigned CurPID = InvalidPID; - for (const auto &[Idx, Part] : enumerate(Partitions)) { - if (Part.first <= CurCost) { - CurPID = Idx; - CurCost = Part.first; - } - } - assert(CurPID != InvalidPID); - return CurPID; -} - -void SplitProposal::calculateScores() { - if (Partitions.empty()) - return; - - assert(SG); - CostType LargestPCost = 0; - for (auto &[PCost, Nodes] : Partitions) { - if (PCost > LargestPCost) - LargestPCost = PCost; + /// \returns true if this function and its dependencies can be considered + /// large according to \p Threshold. + bool isLarge(CostType Threshold) const { + return TotalCost > Threshold && !Dependencies.empty(); } - - CostType ModuleCost = SG->getModuleCost(); - CodeSizeScore = double(TotalCost) / ModuleCost; - assert(CodeSizeScore >= 0.0); - - BottleneckScore = double(LargestPCost) / ModuleCost; - - CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0; - BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0; -} - -#ifndef NDEBUG -void SplitProposal::verifyCompleteness() const { - if (Partitions.empty()) - return; - - BitVector Result = Partitions[0].second; - for (const auto &P : drop_begin(Partitions)) - Result |= P.second; - assert(Result.all() && "some nodes are missing from this proposal!"); -} -#endif - -//===-- RecursiveSearchStrategy -------------------------------------------===// - -/// Partitioning algorithm. -/// -/// This is a recursive search algorithm that can explore multiple possiblities. -/// -/// When a cluster of nodes can go into more than one partition, and we haven't -/// reached maximum search depth, we recurse and explore both options and their -/// consequences. Both branches will yield a proposal, and the driver will grade -/// both and choose the best one. -/// -/// If max depth is reached, we will use some heuristics to make a choice. 
Most -/// of the time we will just use the least-pressured (cheapest) partition, but -/// if a cluster is particularly big and there is a good amount of overlap with -/// an existing partition, we will choose that partition instead. -class RecursiveSearchSplitting { -public: - using SubmitProposalFn = function_ref; - - RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts, - SubmitProposalFn SubmitProposal); - - void run(); - -private: - struct WorkListEntry { - WorkListEntry(const BitVector &BV) : Cluster(BV) {} - - unsigned NumNonEntryNodes = 0; - CostType TotalCost = 0; - CostType CostExcludingGraphEntryPoints = 0; - BitVector Cluster; - }; - - /// Collects all graph entry points's clusters and sort them so the most - /// expensive clusters are viewed first. This will merge clusters together if - /// they share a non-copyable dependency. - void setupWorkList(); - - /// Recursive function that assigns the worklist item at \p Idx into a - /// partition of \p SP. - /// - /// \p Depth is the current search depth. When this value is equal to - /// \ref MaxDepth, we can no longer recurse. - /// - /// This function only recurses if there is more than one possible assignment, - /// otherwise it is iterative to avoid creating a call stack that is as big as - /// \ref WorkList. - void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP); - - /// \return A pair: first element is the PID of the partition that has the - /// most similarities with \p Entry, or \ref InvalidPID if no partition was - /// found with at least one element in common. The second element is the - /// aggregated cost of all dependencies in common between \p Entry and that - /// partition. - std::pair - findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP); - - const SplitGraph &SG; - unsigned NumParts; - SubmitProposalFn SubmitProposal; - - // A Cluster is considered large when its cost, excluding entry points, - // exceeds this value. - CostType LargeClusterThreshold = 0; - unsigned NumProposalsSubmitted = 0; - SmallVector WorkList; }; -RecursiveSearchSplitting::RecursiveSearchSplitting( - const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal) - : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) { - // arbitrary max value as a safeguard. Anything above 10 will already be - // slow, this is just a max value to prevent extreme resource exhaustion or - // unbounded run time. - if (MaxDepth > 16) - report_fatal_error("[amdgpu-split-module] search depth of " + - Twine(MaxDepth) + " is too high!"); - LargeClusterThreshold = - (LargeFnFactor != 0.0) - ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor)) - : std::numeric_limits::max(); - LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at " - << LargeClusterThreshold << "\n"); -} - -void RecursiveSearchSplitting::run() { - { - SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist"); - setupWorkList(); +/// Calculates how much overlap there is between \p A and \p B. +/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A +/// and B have no shared elements. Kernels do not count in overlap calculation. 
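
The overlap measure documented above (and implemented just below) is essentially a Jaccard index over the two dependency sets, shared elements divided by the union, with kernels filtered out. A self-contained sketch of the same computation; plain integers stand in for `Function` pointers and `isKernel` is a hypothetical predicate:

```cpp
#include <set>

// Shared non-kernel dependencies divided by the union of both sets;
// returns 0.0 when A contributes nothing, matching the early-out below.
static float overlap(const std::set<int> &A, const std::set<int> &B,
                     bool (*isKernel)(int)) {
  std::set<int> Union;
  unsigned Common = 0;
  for (int F : A)
    if (!isKernel(F))
      Union.insert(F);
  if (Union.empty())
    return 0.0f;
  for (int F : B) {
    if (isKernel(F))
      continue;
    // A failed insertion means F was already contributed by A, i.e. it is
    // a dependency the two sets share.
    if (!Union.insert(F).second)
      ++Common;
  }
  return static_cast<float>(Common) / Union.size();
}
```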
+static float calculateOverlap(const DenseSet &A, + const DenseSet &B) { + DenseSet Total; + for (const auto *F : A) { + if (!isEntryPoint(F)) + Total.insert(F); } - { - SplitModuleTimer SMT("recursive_search_pick", "partitioning"); - SplitProposal SP(SG, NumParts); - pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP); - } -} + if (Total.empty()) + return 0.0f; -void RecursiveSearchSplitting::setupWorkList() { - // e.g. if A and B are two worklist item, and they both call a non copyable - // dependency C, this does: - // A=C - // B=C - // => NodeEC will create a single group (A, B, C) and we create a new - // WorkList entry for that group. - - EquivalenceClasses NodeEC; - for (const SplitGraph::Node *N : SG.nodes()) { - if (!N->isGraphEntryPoint()) + unsigned NumCommon = 0; + for (const auto *F : B) { + if (isEntryPoint(F)) continue; - NodeEC.insert(N->getID()); - N->visitAllDependencies([&](const SplitGraph::Node &Dep) { - if (&Dep != N && Dep.isNonCopyable()) - NodeEC.unionSets(N->getID(), Dep.getID()); - }); + auto [It, Inserted] = Total.insert(F); + if (!Inserted) + ++NumCommon; } - for (auto I = NodeEC.begin(), E = NodeEC.end(); I != E; ++I) { - if (!I->isLeader()) - continue; + return static_cast(NumCommon) / Total.size(); +} - BitVector Cluster = SG.createNodesBitVector(); - for (auto MI = NodeEC.member_begin(I); MI != NodeEC.member_end(); ++MI) { - const SplitGraph::Node &N = SG.getNode(*MI); - if (N.isGraphEntryPoint()) - N.getDependencies(Cluster); - } - WorkList.emplace_back(std::move(Cluster)); - } +/// Performs all of the partitioning work on \p M. +/// \param SML Log Helper +/// \param M Module to partition. +/// \param NumParts Number of partitions to create. +/// \param ModuleCost Total cost of all functions in \p M. +/// \param FnCosts Map of Function -> Cost +/// \param WorkList Functions and their dependencies to process in order. +/// \returns The created partitions (a vector of size \p NumParts ) +static std::vector> +doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, + CostType ModuleCost, + const DenseMap &FnCosts, + const SmallVector &WorkList) { + + SML << "\n--Partitioning Starts--\n"; + + // Calculate a "large function threshold". When more than one function's total + // import cost exceeds this value, we will try to assign it to an existing + // partition to reduce the amount of duplication needed. + // + // e.g. let two functions X and Y have a import cost of ~10% of the module, we + // assign X to a partition as usual, but when we get to Y, we check if it's + // worth also putting it in Y's partition. + const CostType LargeFnThreshold = + LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor)) + : std::numeric_limits::max(); + + std::vector> Partitions; + Partitions.resize(NumParts); + + // Assign functions to partitions, and try to keep the partitions more or + // less balanced. We do that through a priority queue sorted in reverse, so we + // can always look at the partition with the least content. + // + // There are some cases where we will be deliberately unbalanced though. + // - Large functions: we try to merge with existing partitions to reduce code + // duplication. + // - Functions with indirect or external calls always go in the first + // partition (P0). + auto ComparePartitions = [](const std::pair &a, + const std::pair &b) { + // When two partitions have the same cost, assign to the one with the + // biggest ID first. This allows us to put things in P0 last, because P0 may + // have other stuff added later. 
+ if (a.second == b.second) + return a.first < b.first; + return a.second > b.second; + }; - // Calculate costs and other useful information. - for (WorkListEntry &Entry : WorkList) { - for (unsigned NodeID : Entry.Cluster.set_bits()) { - const SplitGraph::Node &N = SG.getNode(NodeID); - const CostType Cost = N.getIndividualCost(); + // We can't use priority_queue here because we need to be able to access any + // element. This makes this a bit inefficient as we need to sort it again + // everytime we change it, but it's a very small array anyway (likely under 64 + // partitions) so it's a cheap operation. + std::vector> BalancingQueue; + for (unsigned I = 0; I < NumParts; ++I) + BalancingQueue.emplace_back(I, 0); + + // Helper function to handle assigning a function to a partition. This takes + // care of updating the balancing queue. + const auto AssignToPartition = [&](PartitionID PID, + const FunctionWithDependencies &FWD) { + auto &FnsInPart = Partitions[PID]; + FnsInPart.insert(FWD.Fn); + FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); + + SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> "; + if (!FWD.Dependencies.empty()) { + SML << FWD.Dependencies.size() << " dependencies added\n"; + }; + + // Update the balancing queue. we scan backwards because in the common case + // the partition is at the end. + for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) { + if (QueuePID == PID) { + CostType NewCost = 0; + for (auto *Fn : Partitions[PID]) + NewCost += FnCosts.at(Fn); + + SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost; + if (Cost) { + SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100) + << "% increase)"; + } + SML << '\n'; - Entry.TotalCost += Cost; - if (!N.isGraphEntryPoint()) { - Entry.CostExcludingGraphEntryPoints += Cost; - ++Entry.NumNonEntryNodes; + Cost = NewCost; } } - } - sort(WorkList, [](const WorkListEntry &LHS, const WorkListEntry &RHS) { - return LHS.TotalCost > RHS.TotalCost; - }); - - LLVM_DEBUG({ - dbgs() << "[recursive search] worklist:\n"; - for (const auto &[Idx, Entry] : enumerate(WorkList)) { - dbgs() << " - [" << Idx << "]: "; - for (unsigned NodeID : Entry.Cluster.set_bits()) - dbgs() << NodeID << " "; - dbgs() << "(total_cost:" << Entry.TotalCost - << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints - << ")\n"; - } - }); -} + sort(BalancingQueue, ComparePartitions); + }; -void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx, - SplitProposal SP) { - while (Idx < WorkList.size()) { - // Step 1: Determine candidate PIDs. - // - const WorkListEntry &Entry = WorkList[Idx]; - const BitVector &Cluster = Entry.Cluster; - - // Default option is to do load-balancing, AKA assign to least pressured - // partition. - const unsigned CheapestPID = SP.findCheapestPartition(); - assert(CheapestPID != InvalidPID); - - // Explore assigning to the kernel that contains the most dependencies in - // common. - const auto [MostSimilarPID, SimilarDepsCost] = - findMostSimilarPartition(Entry, SP); - - // We can chose to explore only one path if we only have one valid path, or - // if we reached maximum search depth and can no longer branch out. - unsigned SinglePIDToTry = InvalidPID; - if (MostSimilarPID == InvalidPID) // no similar PID found - SinglePIDToTry = CheapestPID; - else if (MostSimilarPID == CheapestPID) // both landed on the same PID - SinglePIDToTry = CheapestPID; - else if (Depth >= MaxDepth) { - // We have to choose one path. 
Use a heuristic to guess which one will be - // more appropriate. - if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) { - // Check if the amount of code in common makes it worth it. - assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints); - const double Ratio = - SimilarDepsCost / Entry.CostExcludingGraphEntryPoints; - assert(Ratio >= 0.0 && Ratio <= 1.0); - if (LargeFnOverlapForMerge > Ratio) { - // For debug, just print "L", so we'll see "L3=P3" for instance, which - // will mean we reached max depth and chose P3 based on this - // heuristic. - LLVM_DEBUG(dbgs() << 'L'); - SinglePIDToTry = MostSimilarPID; - } - } else - SinglePIDToTry = CheapestPID; + for (auto &CurFn : WorkList) { + // When a function has indirect calls, it must stay in the first partition + // alongside every reachable non-entry function. This is a nightmare case + // for splitting as it severely limits what we can do. + if (CurFn.HasIndirectCall) { + SML << "Function with indirect call(s): " << getName(*CurFn.Fn) + << " defaulting to P0\n"; + AssignToPartition(0, CurFn); + continue; } - // Step 2: Explore candidates. - - // When we only explore one possible path, and thus branch depth doesn't - // increase, do not recurse, iterate instead. - if (SinglePIDToTry != InvalidPID) { - LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' '); - // Only one path to explore, don't clone SP, don't increase depth. - SP.add(SinglePIDToTry, Cluster); - ++Idx; + // When a function has non duplicatable dependencies, we have to keep it in + // the first partition as well. This is a conservative approach, a + // finer-grained approach could keep track of which dependencies are + // non-duplicatable exactly and just make sure they're grouped together. + if (CurFn.HasNonDuplicatableDependecy) { + SML << "Function with externally visible dependency " + << getName(*CurFn.Fn) << " defaulting to P0\n"; + AssignToPartition(0, CurFn); continue; } - assert(MostSimilarPID != InvalidPID); - - // We explore multiple paths: recurse at increased depth, then stop this - // function. - - LLVM_DEBUG(dbgs() << '\n'); - - // lb = load balancing = put in cheapest partition - { - SplitProposal BranchSP = SP; - LLVM_DEBUG(dbgs().indent(Depth) - << " [lb] " << Idx << "=P" << CheapestPID << "? "); - BranchSP.add(CheapestPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); - } + // Be smart with large functions to avoid duplicating their dependencies. + if (CurFn.isLarge(LargeFnThreshold)) { + assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f); + SML << "Large Function: " << getName(*CurFn.Fn) + << " - looking for partition with at least " + << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n"; + + bool Assigned = false; + for (const auto &[PID, Fns] : enumerate(Partitions)) { + float Overlap = calculateOverlap(CurFn.Dependencies, Fns); + SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P" + << PID << '\n'; + if (Overlap > LargeFnOverlapForMerge) { + SML << " selecting P" << PID << '\n'; + AssignToPartition(PID, CurFn); + Assigned = true; + } + } - // ms = most similar = put in partition with the most in common - { - SplitProposal BranchSP = SP; - LLVM_DEBUG(dbgs().indent(Depth) - << " [ms] " << Idx << "=P" << MostSimilarPID << "? "); - BranchSP.add(MostSimilarPID, Cluster); - pickPartition(Depth + 1, Idx + 1, BranchSP); + if (Assigned) + continue; } - return; + // Normal "load-balancing", assign to partition with least pressure. 
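
The fallback path right below pulls the cheapest partition off the back of `BalancingQueue`, which `ComparePartitions` keeps sorted by descending cost. In isolation, that load-balancing step looks roughly like this sketch, which assumes a small partition count where a full re-sort per assignment stays cheap; `assignToCheapest` is an illustrative name:

```cpp
#include <algorithm>
#include <utility>
#include <vector>

using PartitionID = unsigned;
using CostType = long long; // stands in for InstructionCost::CostType

// Charge RootCost to the least-loaded partition, then restore the
// descending-cost order so the cheapest partition sits at the back again.
static PartitionID
assignToCheapest(std::vector<std::pair<PartitionID, CostType>> &Queue,
                 CostType RootCost) {
  PartitionID PID = Queue.back().first;
  Queue.back().second += RootCost;
  std::sort(Queue.begin(), Queue.end(), [](const auto &A, const auto &B) {
    // On equal cost, lower IDs sort first, so the highest ID is picked
    // from the back and P0 is used last.
    if (A.second == B.second)
      return A.first < B.first;
    return A.second > B.second;
  });
  return PID;
}
```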
+ auto [PID, CurCost] = BalancingQueue.back(); + AssignToPartition(PID, CurFn); } - // Step 3: If we assigned all WorkList items, submit the proposal. - - assert(Idx == WorkList.size()); - assert(NumProposalsSubmitted <= (2u << MaxDepth) && - "Search got out of bounds?"); - SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" + - std::to_string(NumProposalsSubmitted++)); - LLVM_DEBUG(dbgs() << '\n'); - SubmitProposal(SP); -} - -std::pair -RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry, - const SplitProposal &SP) { - if (!Entry.NumNonEntryNodes) - return {InvalidPID, 0}; - - // We take the partition that is the most similar using Cost as a metric. - // So we take the set of nodes in common, compute their aggregated cost, and - // pick the partition with the highest cost in common. - unsigned ChosenPID = InvalidPID; - CostType ChosenCost = 0; - for (unsigned PID = 0; PID < NumParts; ++PID) { - BitVector BV = SP[PID]; - BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?! - - if (BV.none()) - continue; - - const CostType Cost = SG.calculateCost(BV); - - if (ChosenPID == InvalidPID || ChosenCost < Cost || - (ChosenCost == Cost && PID > ChosenPID)) { - ChosenPID = PID; - ChosenCost = Cost; + if (SML) { + CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1; + for (const auto &[Idx, Part] : enumerate(Partitions)) { + CostType Cost = 0; + for (auto *Fn : Part) + Cost += FnCosts.at(Fn); + SML << "P" << Idx << " has a total cost of " << Cost << " (" + << format("%0.2f", (float(Cost) / ModuleCostOr1) * 100) + << "% of source module)\n"; } - } - - return {ChosenPID, ChosenCost}; -} -//===----------------------------------------------------------------------===// -// DOTGraph Printing Support -//===----------------------------------------------------------------------===// - -const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) { - return E->Dst; -} - -using SplitGraphEdgeDstIterator = - mapped_iterator; - -} // namespace - -template <> struct GraphTraits { - using NodeRef = const SplitGraph::Node *; - using nodes_iterator = SplitGraph::nodes_iterator; - using ChildIteratorType = SplitGraphEdgeDstIterator; - - using EdgeRef = const SplitGraph::Edge *; - using ChildEdgeIteratorType = SplitGraph::edges_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static ChildIteratorType child_begin(NodeRef Ref) { - return {Ref->outgoing_edges().begin(), mapEdgeToDst}; - } - static ChildIteratorType child_end(NodeRef Ref) { - return {Ref->outgoing_edges().end(), mapEdgeToDst}; - } - - static nodes_iterator nodes_begin(const SplitGraph &G) { - return G.nodes().begin(); - } - static nodes_iterator nodes_end(const SplitGraph &G) { - return G.nodes().end(); - } -}; - -template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { - DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} - - static std::string getGraphName(const SplitGraph &SG) { - return SG.getModule().getName().str(); - } - - std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) { - return N->getName().str(); - } - - static std::string getNodeDescription(const SplitGraph::Node *N, - const SplitGraph &SG) { - std::string Result; - if (N->isEntryFunctionCC()) - Result += "entry-fn-cc "; - if (N->isNonCopyable()) - Result += "non-copyable "; - Result += "cost:" + std::to_string(N->getIndividualCost()); - return Result; - } - - static std::string getNodeAttributes(const SplitGraph::Node *N, - const SplitGraph &SG) { - return 
N->hasAnyIncomingEdges() ? "" : "color=\"red\""; + SML << "--Partitioning Done--\n\n"; } - static std::string getEdgeAttributes(const SplitGraph::Node *N, - SplitGraphEdgeDstIterator EI, - const SplitGraph &SG) { + // Check no functions were missed. +#ifndef NDEBUG + DenseSet AllFunctions; + for (const auto &Part : Partitions) + AllFunctions.insert(Part.begin(), Part.end()); - switch ((*EI.getCurrent())->Kind) { - case SplitGraph::EdgeKind::DirectCall: - return ""; - case SplitGraph::EdgeKind::IndirectCall: - return "style=\"dashed\""; + for (auto &Fn : M) { + if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) { + assert(AllFunctions.contains(&Fn) && "Missed a function?!"); } - llvm_unreachable("Unknown SplitGraph::EdgeKind enum"); } -}; - -//===----------------------------------------------------------------------===// -// Driver -//===----------------------------------------------------------------------===// - -namespace { +#endif -// If we didn't externalize GVs, then local GVs need to be conservatively -// imported into every module (including their initializers), and then cleaned -// up afterwards. -static bool needsConservativeImport(const GlobalValue *GV) { - if (const auto *Var = dyn_cast(GV)) - return Var->hasLocalLinkage(); - return isa(GV); + return Partitions; } -/// Prints a summary of the partition \p N, represented by module \p M, to \p -/// OS. -static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M, - unsigned PartCost, unsigned ModuleCost) { - OS << "*** Partition P" << N << " ***\n"; - - for (const auto &Fn : M) { - if (!Fn.isDeclaration()) - OS << " - [function] " << Fn.getName() << "\n"; - } - - for (const auto &GV : M.globals()) { - if (GV.hasInitializer()) - OS << " - [global] " << GV.getName() << "\n"; +static void externalize(GlobalValue &GV) { + if (GV.hasLocalLinkage()) { + GV.setLinkage(GlobalValue::ExternalLinkage); + GV.setVisibility(GlobalValue::HiddenVisibility); } - OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost) - << "% of the source\n"; -} - -static void evaluateProposal(SplitProposal &Best, SplitProposal New) { - SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm"); - - New.calculateScores(); - - LLVM_DEBUG({ - New.verifyCompleteness(); - if (DebugProposalSearch) - New.print(dbgs()); - }); - - const double CurBScore = Best.getBottleneckScore(); - const double CurCSScore = Best.getCodeSizeScore(); - const double NewBScore = New.getBottleneckScore(); - const double NewCSScore = New.getCodeSizeScore(); - - // TODO: Improve this - // We can probably lower the precision of the comparison at first - // e.g. if we have - // - (Current): BScore: 0.489 CSCore 1.105 - // - (New): BScore: 0.475 CSCore 1.305 - // Currently we'd choose the new one because the bottleneck score is - // lower, but the new one duplicates more code. It may be worth it to - // discard the new proposal as the impact on build time is negligible. - - // Compare them - bool IsBest = false; - if (NewBScore < CurBScore) - IsBest = true; - else if (NewBScore == CurBScore) - IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker. - - if (IsBest) - Best = std::move(New); - - LLVM_DEBUG(if (DebugProposalSearch) { - if (IsBest) - dbgs() << "[search] new best proposal!\n"; - else - dbgs() << "[search] discarding - not profitable\n"; - }); -} - -/// Trivial helper to create an identical copy of \p M. 
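
Further down, each partition is materialized by handing `CloneModule` a predicate that keeps only the functions chosen for that partition, plus, for the first partition, everything else. A hedged sketch of that mechanism in isolation; `materializePartition` and `KeptFns` are illustrative names, not part of the patch:

```cpp
#include <memory>

#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/Cloning.h"

// Clone M, keeping the function definitions listed in KeptFns; all other
// global definitions are kept only in the first partition.
static std::unique_ptr<llvm::Module>
materializePartition(const llvm::Module &M,
                     const llvm::DenseSet<const llvm::Function *> &KeptFns,
                     bool IsFirstPartition) {
  llvm::ValueToValueMapTy VMap;
  return llvm::CloneModule(M, VMap, [&](const llvm::GlobalValue *GV) {
    if (const auto *Fn = llvm::dyn_cast<llvm::Function>(GV))
      return KeptFns.contains(Fn);
    return IsFirstPartition;
  });
}
```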
-static std::unique_ptr<Module> cloneAll(const Module &M) {
-  ValueToValueMapTy VMap;
-  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
-}
 
-/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested.
-static void writeDOTGraph(const SplitGraph &SG) {
-  if (ModuleDotCfgOutput.empty())
-    return;
-
-  std::error_code EC;
-  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
-  if (EC) {
-    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
-           << "' - DOTGraph will not be printed\n";
-  }
-  WriteGraph(OS, SG, /*ShortName=*/false,
-             /*Title=*/SG.getModule().getName());
-}
 
 static void splitAMDGPUModule(
-    GetTTIFn GetTTI, Module &M, unsigned NumParts,
+    GetTTIFn GetTTI, Module &M, unsigned N,
     function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+
+  SplitModuleLogger SML(M);
+
   CallGraph CG(M);
 
   // Externalize functions whose address are taken.
@@ -1341,8 +639,8 @@ static void splitAMDGPUModule(
   for (auto &Fn : M) {
     if (Fn.hasAddressTaken()) {
       if (Fn.hasLocalLinkage()) {
-        LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
-                          << " because its address is taken\n");
+        SML << "[externalize] " << Fn.getName()
+            << " because its address is taken\n";
       }
       externalize(Fn);
     }
@@ -1353,179 +651,138 @@ static void splitAMDGPUModule(
   if (!NoExternalizeGlobals) {
     for (auto &GV : M.globals()) {
       if (GV.hasLocalLinkage())
-        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
+        SML << "[externalize] GV " << GV.getName() << '\n';
       externalize(GV);
     }
   }
 
   // Start by calculating the cost of every function in the module, as well as
   // the module's overall cost.
-  FunctionsCostMap FnCosts;
-  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
-
-  // Build the SplitGraph, which represents the module's functions and models
-  // their dependencies accurately.
-  SplitGraph SG(M, FnCosts, ModuleCost);
-  SG.buildGraph(CG);
-
-  if (SG.empty()) {
-    LLVM_DEBUG(
-        dbgs()
-        << "[!] no nodes in graph, input is empty - no splitting possible\n");
-    ModuleCallback(cloneAll(M));
-    return;
+  DenseMap<const Function *, CostType> FnCosts;
+  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
+
+  // First, gather every kernel into the worklist.
+  SmallVector<FunctionWithDependencies> WorkList;
+  for (auto &Fn : M) {
+    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
+      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
   }
 
-  LLVM_DEBUG({
-    dbgs() << "[graph] nodes:\n";
-    for (const SplitGraph::Node *N : SG.nodes()) {
-      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
-             << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
+  // Then, find missing functions that need to be considered as additional
+  // roots. These can't be called in theory, but in practice we still have to
+  // handle them to avoid linker errors.
+ { + DenseSet SeenFunctions; + for (const auto &FWD : WorkList) { + SeenFunctions.insert(FWD.Fn); + SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end()); } - }); - writeDOTGraph(SG); - - LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n"); - - std::optional Proposal; - const auto EvaluateProposal = [&](SplitProposal SP) { - if (!Proposal) - Proposal = std::move(SP); - else - evaluateProposal(*Proposal, std::move(SP)); - }; - - // TODO: It would be very easy to create new strategies by just adding a base - // class to RecursiveSearchSplitting and abstracting it away. - RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run(); - LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: " - << Proposal->getName() << "\n";); - - if (!Proposal) { - LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n"); - ModuleCallback(cloneAll(M)); - return; + for (auto &Fn : M) { + // If this function is not part of any kernel's dependencies and isn't + // directly called, consider it as a root. + if (!Fn.isDeclaration() && !isEntryPoint(&Fn) && + !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) { + WorkList.emplace_back(SML, CG, FnCosts, &Fn); + } + } } - LLVM_DEBUG(Proposal->print(dbgs());); + // Sort the worklist so the most expensive roots are seen first. + sort(WorkList, [&](auto &A, auto &B) { + // Sort by total cost, and if the total cost is identical, sort + // alphabetically. + if (A.TotalCost == B.TotalCost) + return A.Fn->getName() < B.Fn->getName(); + return A.TotalCost > B.TotalCost; + }); - std::optional SummariesOS; - if (!PartitionSummariesOutput.empty()) { - std::error_code EC; - SummariesOS.emplace(PartitionSummariesOutput, EC); - if (EC) - errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput - << "' - Partition summaries will not be printed\n"; + if (SML) { + SML << "Worklist\n"; + for (const auto &FWD : WorkList) { + SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost + << " indirect:" << FWD.HasIndirectCall + << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy + << ")\n"; + // Sort function names before printing to ensure determinism. + SmallVector SortedDepNames; + SortedDepNames.reserve(FWD.Dependencies.size()); + for (const auto *Dep : FWD.Dependencies) + SortedDepNames.push_back(getName(*Dep)); + sort(SortedDepNames); + + for (const auto &Name : SortedDepNames) + SML << " [dependency] " << Name << '\n'; + } } - for (unsigned PID = 0; PID < NumParts; ++PID) { - SplitModuleTimer SMT2("modules_creation", - "creating modules for each partition"); - LLVM_DEBUG(dbgs() << "[split] creating new modules\n"); + // This performs all of the partitioning work. + auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList); + assert(Partitions.size() == N); + + // If we didn't externalize GVs, then local GVs need to be conservatively + // imported into every module (including their initializers), and then cleaned + // up afterwards. + const auto NeedsConservativeImport = [&](const GlobalValue *GV) { + // We conservatively import private/internal GVs into every module and clean + // them up afterwards. 
+ const auto *Var = dyn_cast(GV); + return Var && Var->hasLocalLinkage(); + }; - DenseSet FnsInPart; - for (unsigned NodeID : (*Proposal)[PID].set_bits()) - FnsInPart.insert(&SG.getNode(NodeID).getFunction()); + SML << "Creating " << N << " modules...\n"; + unsigned TotalFnImpls = 0; + for (unsigned I = 0; I < N; ++I) { + const auto &FnsInPart = Partitions[I]; ValueToValueMapTy VMap; - CostType PartCost = 0; std::unique_ptr MPart( CloneModule(M, VMap, [&](const GlobalValue *GV) { // Functions go in their assigned partition. - if (const auto *Fn = dyn_cast(GV)) { - if (FnsInPart.contains(Fn)) { - PartCost += SG.getCost(*Fn); - return true; - } - return false; - } + if (const auto *Fn = dyn_cast(GV)) + return FnsInPart.contains(Fn); + + if (NeedsConservativeImport(GV)) + return true; // Everything else goes in the first partition. - return needsConservativeImport(GV) || PID == 0; + return I == 0; })); - // FIXME: Aliases aren't seen often, and their handling isn't perfect so - // bugs are possible. - // Clean-up conservatively imported GVs without any users. - for (auto &GV : make_early_inc_range(MPart->global_values())) { - if (needsConservativeImport(&GV) && GV.use_empty()) + for (auto &GV : make_early_inc_range(MPart->globals())) { + if (NeedsConservativeImport(&GV) && GV.use_empty()) GV.eraseFromParent(); } - if (SummariesOS) - printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost); - - LLVM_DEBUG( - printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost)); - + unsigned NumAllFns = 0, NumKernels = 0; + for (auto &Cur : *MPart) { + if (!Cur.isDeclaration()) { + ++NumAllFns; + if (isEntryPoint(&Cur)) + ++NumKernels; + } + } + TotalFnImpls += NumAllFns; + SML << " - Module " << I << " with " << NumAllFns << " functions (" + << NumKernels << " kernels)\n"; ModuleCallback(std::move(MPart)); } + + SML << TotalFnImpls << " function definitions across all modules (" + << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100) + << "% of original module)\n"; } } // namespace PreservedAnalyses AMDGPUSplitModulePass::run(Module &M, ModuleAnalysisManager &MAM) { - SplitModuleTimer SMT( - "total", "total pass runtime (incl. potentially waiting for lockfile)"); - FunctionAnalysisManager &FAM = MAM.getResult(M).getManager(); const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & { return FAM.getResult(F); }; - - bool Done = false; -#ifndef NDEBUG - if (UseLockFile) { - SmallString<128> LockFilePath; - sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath); - sys::path::append(LockFilePath, "amdgpu-split-module-debug"); - LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath - << "'\n"); - - while (true) { - llvm::LockFileManager Locked(LockFilePath.str()); - switch (Locked) { - case LockFileManager::LFS_Error: - LLVM_DEBUG( - dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug " - "output may be mangled by other processes\n"); - Locked.unsafeRemoveLockFile(); - break; - case LockFileManager::LFS_Owned: - break; - case LockFileManager::LFS_Shared: { - switch (Locked.waitForUnlock()) { - case LockFileManager::Res_Success: - break; - case LockFileManager::Res_OwnerDied: - continue; // try again to get the lock. 
- case LockFileManager::Res_Timeout: - LLVM_DEBUG( - dbgs() - << "[amdgpu-split-module] unable to acquire lockfile, debug " - "output may be mangled by other processes\n"); - Locked.unsafeRemoveLockFile(); - break; // give up - } - break; - } - } - - splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); - Done = true; - break; - } - } -#endif - - if (!Done) - splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); - - // We can change linkage/visibilities in the input, consider that nothing is - // preserved just to be safe. This pass runs last anyway. - return PreservedAnalyses::none(); + splitAMDGPUModule(TTIGetter, M, N, ModuleCallback); + // We don't change the original module. + return PreservedAnalyses::all(); } -} // namespace llvm diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 8b8452a2b78c80..10fef901f77181 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -79,7 +79,7 @@ enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly }; static cl::opt ImplicitItMode( "arm-implicit-it", cl::init(ImplicitItModeTy::ARMOnly), - cl::desc("Allow conditional instructions outdside of an IT block"), + cl::desc("Allow conditional instructions outside of an IT block"), cl::values(clEnumValN(ImplicitItModeTy::Always, "always", "Accept in both ISAs, emit implicit ITs in Thumb"), clEnumValN(ImplicitItModeTy::Never, "never", @@ -441,7 +441,7 @@ class ARMAsmParser : public MCTargetAsmParser { bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands, unsigned MnemonicOpsEndInd, unsigned ListIndex); - int tryParseRegister(bool AllowOutofBoundReg = false); + MCRegister tryParseRegister(bool AllowOutofBoundReg = false); bool tryParseRegisterWithWriteBack(OperandVector &); int tryParseShiftRegister(OperandVector &); std::optional tryParseShiftToken(); @@ -4205,7 +4205,7 @@ bool ARMAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, EndLoc = Tok.getEndLoc(); Reg = tryParseRegister(); - return Reg == (unsigned)-1; + return !Reg; } ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, @@ -4216,59 +4216,59 @@ ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, } /// Try to parse a register name. The token must be an Identifier when called, -/// and if it is a register name the token is eaten and the register number is -/// returned. Otherwise return -1. -int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) { +/// and if it is a register name the token is eaten and the register is +/// returned. Otherwise return an invalid MCRegister. +MCRegister ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); - if (Tok.isNot(AsmToken::Identifier)) return -1; + if (Tok.isNot(AsmToken::Identifier)) + return MCRegister(); std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = MatchRegisterName(lowerCase); - if (!RegNum) { - RegNum = StringSwitch(lowerCase) - .Case("r13", ARM::SP) - .Case("r14", ARM::LR) - .Case("r15", ARM::PC) - .Case("ip", ARM::R12) - // Additional register name aliases for 'gas' compatibility. 
- .Case("a1", ARM::R0) - .Case("a2", ARM::R1) - .Case("a3", ARM::R2) - .Case("a4", ARM::R3) - .Case("v1", ARM::R4) - .Case("v2", ARM::R5) - .Case("v3", ARM::R6) - .Case("v4", ARM::R7) - .Case("v5", ARM::R8) - .Case("v6", ARM::R9) - .Case("v7", ARM::R10) - .Case("v8", ARM::R11) - .Case("sb", ARM::R9) - .Case("sl", ARM::R10) - .Case("fp", ARM::R11) - .Default(0); - } - if (!RegNum) { + MCRegister Reg = MatchRegisterName(lowerCase); + if (!Reg) { + Reg = StringSwitch(lowerCase) + .Case("r13", ARM::SP) + .Case("r14", ARM::LR) + .Case("r15", ARM::PC) + .Case("ip", ARM::R12) + // Additional register name aliases for 'gas' compatibility. + .Case("a1", ARM::R0) + .Case("a2", ARM::R1) + .Case("a3", ARM::R2) + .Case("a4", ARM::R3) + .Case("v1", ARM::R4) + .Case("v2", ARM::R5) + .Case("v3", ARM::R6) + .Case("v4", ARM::R7) + .Case("v5", ARM::R8) + .Case("v6", ARM::R9) + .Case("v7", ARM::R10) + .Case("v8", ARM::R11) + .Case("sb", ARM::R9) + .Case("sl", ARM::R10) + .Case("fp", ARM::R11) + .Default(MCRegister()); + } + if (!Reg) { // Check for aliases registered via .req. Canonicalize to lower case. // That's more consistent since register names are case insensitive, and // it's how the original entry was passed in from MC/MCParser/AsmParser. auto Entry = RegisterReqs.find(lowerCase); // If no match, return failure. if (Entry == RegisterReqs.end()) - return -1; + return MCRegister(); Parser.Lex(); // Eat identifier token. return Entry->getValue(); } // Some FPUs only have 16 D registers, so D16-D31 are invalid - if (!AllowOutOfBoundReg && !hasD32() && RegNum >= ARM::D16 && - RegNum <= ARM::D31) - return -1; + if (!AllowOutOfBoundReg && !hasD32() && Reg >= ARM::D16 && Reg <= ARM::D31) + return MCRegister(); Parser.Lex(); // Eat identifier token. - return RegNum; + return Reg; } std::optional ARMAsmParser::tryParseShiftToken() { @@ -4356,7 +4356,7 @@ int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) { SMLoc L = Parser.getTok().getLoc(); EndLoc = Parser.getTok().getEndLoc(); ShiftReg = tryParseRegister(); - if (ShiftReg == -1) { + if (!ShiftReg) { Error(L, "expected immediate or register in shift operand"); return -1; } @@ -4387,12 +4387,11 @@ bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) { MCAsmParser &Parser = getParser(); SMLoc RegStartLoc = Parser.getTok().getLoc(); SMLoc RegEndLoc = Parser.getTok().getEndLoc(); - int RegNo = tryParseRegister(); - if (RegNo == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return true; - Operands.push_back( - ARMOperand::CreateReg(RegNo, RegStartLoc, RegEndLoc, *this)); + Operands.push_back(ARMOperand::CreateReg(Reg, RegStartLoc, RegEndLoc, *this)); const AsmToken &ExclaimTok = Parser.getTok(); if (ExclaimTok.is(AsmToken::Exclaim)) { @@ -4619,8 +4618,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // Check the first register in the list to see what register class // this is a list of. 
- int Reg = tryParseRegister(); - if (Reg == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return Error(RegLoc, "register expected"); if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) return Error(RegLoc, "pseudo-register not allowed"); @@ -4634,7 +4633,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, Reg = getDRegFromQReg(Reg); EReg = MRI->getEncodingValue(Reg); Registers.emplace_back(EReg, Reg); - ++Reg; + Reg = Reg + 1; } const MCRegisterClass *RC; if (Reg == ARM::RA_AUTH_CODE || @@ -4663,8 +4662,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, return Error(RegLoc, "pseudo-register not allowed"); Parser.Lex(); // Eat the minus. SMLoc AfterMinusLoc = Parser.getTok().getLoc(); - int EndReg = tryParseRegister(AllowOutOfBoundReg); - if (EndReg == -1) + MCRegister EndReg = tryParseRegister(AllowOutOfBoundReg); + if (!EndReg) return Error(AfterMinusLoc, "register expected"); if (EndReg == ARM::RA_AUTH_CODE) return Error(AfterMinusLoc, "pseudo-register not allowed"); @@ -4696,10 +4695,10 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, } Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); - int OldReg = Reg; + MCRegister OldReg = Reg; const AsmToken RegTok = Parser.getTok(); Reg = tryParseRegister(AllowOutOfBoundReg); - if (Reg == -1) + if (!Reg) return Error(RegLoc, "register expected"); if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) return Error(RegLoc, "pseudo-register not allowed"); @@ -4755,7 +4754,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, ") in register list"); } if (isQReg) { - EReg = MRI->getEncodingValue(++Reg); + Reg = Reg + 1; + EReg = MRI->getEncodingValue(Reg); Registers.emplace_back(EReg, Reg); } } @@ -4835,8 +4835,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { // use the custom matcher to convert to list if necessary if (!hasMVE() && Parser.getTok().is(AsmToken::Identifier)) { SMLoc E = Parser.getTok().getEndLoc(); - int Reg = tryParseRegister(); - if (Reg == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return ParseStatus::NoMatch; if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { ParseStatus Res = parseVectorLane(LaneKind, LaneIndex, E); @@ -4889,12 +4889,12 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { Parser.Lex(); // Eat '{' token. SMLoc RegLoc = Parser.getTok().getLoc(); - int Reg = tryParseRegister(); - if (Reg == -1) + MCRegister Reg = tryParseRegister(); + if (!Reg) return Error(RegLoc, "register expected"); unsigned Count = 1; int Spacing = 0; - unsigned FirstReg = Reg; + MCRegister FirstReg = Reg; if (hasMVE() && !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Reg)) return Error(Parser.getTok().getLoc(), @@ -4905,7 +4905,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { FirstReg = Reg = getDRegFromQReg(Reg); Spacing = 1; // double-spacing requires explicit D registers, otherwise // it's ambiguous with four-register single spaced. - ++Reg; + Reg = Reg + 1; ++Count; } @@ -4923,8 +4923,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { "sequential registers in double spaced list"); Parser.Lex(); // Eat the minus. 
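Throughout this file the parser's failure checks change from `Reg == -1` to `!Reg`. That is sound because a default-constructed MCRegister holds the sentinel id 0 ("no register") and converts implicitly to unsigned; a hedged sketch of the invariant being relied on:

    #include "llvm/MC/MCRegister.h"
    #include <cassert>

    using namespace llvm;

    int main() {
      MCRegister Reg;        // default: id 0, i.e. "no register"
      assert(!Reg && !Reg.isValid());
      Reg = MCRegister(1);   // any real register has a non-zero id
      assert(Reg && Reg.id() == 1);
    }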
SMLoc AfterMinusLoc = Parser.getTok().getLoc(); - int EndReg = tryParseRegister(); - if (EndReg == -1) + MCRegister EndReg = tryParseRegister(); + if (!EndReg) return Error(AfterMinusLoc, "register expected"); // Allow Q regs and just interpret them as the two D sub-registers. if (!hasMVE() && ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) @@ -4957,9 +4957,9 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { } Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); - int OldReg = Reg; + MCRegister OldReg = Reg; Reg = tryParseRegister(); - if (Reg == -1) + if (!Reg) return Error(RegLoc, "register expected"); if (hasMVE()) { @@ -4983,7 +4983,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) { Reg = getDRegFromQReg(Reg); if (Reg != OldReg + 1) return Error(RegLoc, "non-contiguous register range"); - ++Reg; + Reg = Reg + 1; Count += 2; // Parse the lane specifier if present. VectorLaneTy NextLaneKind; @@ -5674,8 +5674,8 @@ ParseStatus ARMAsmParser::parsePostIdxReg(OperandVector &Operands) { } SMLoc E = Parser.getTok().getEndLoc(); - int Reg = tryParseRegister(); - if (Reg == -1) { + MCRegister Reg = tryParseRegister(); + if (!Reg) { if (!haveEaten) return ParseStatus::NoMatch; return Error(Parser.getTok().getLoc(), "register expected"); @@ -5752,8 +5752,8 @@ ParseStatus ARMAsmParser::parseAM3Offset(OperandVector &Operands) { } Tok = Parser.getTok(); - int Reg = tryParseRegister(); - if (Reg == -1) { + MCRegister Reg = tryParseRegister(); + if (!Reg) { if (!haveEaten) return ParseStatus::NoMatch; return Error(Tok.getLoc(), "register expected"); @@ -5935,8 +5935,8 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { Parser.Lex(); // Eat left bracket token. const AsmToken &BaseRegTok = Parser.getTok(); - int BaseRegNum = tryParseRegister(); - if (BaseRegNum == -1) + MCRegister BaseReg = tryParseRegister(); + if (!BaseReg) return Error(BaseRegTok.getLoc(), "register expected"); // The next token must either be a comma, a colon or a closing bracket. @@ -5950,7 +5950,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { Parser.Lex(); // Eat right bracket token. Operands.push_back(ARMOperand::CreateMem( - BaseRegNum, nullptr, 0, ARM_AM::no_shift, 0, 0, false, S, E, *this)); + BaseReg, nullptr, 0, ARM_AM::no_shift, 0, 0, false, S, E, *this)); // If there's a pre-indexing writeback marker, '!', just add it as a token // operand. It's rather odd, but syntactically valid. @@ -6006,7 +6006,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { // Don't worry about range checking the value here. That's handled by // the is*() predicates. - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0, + Operands.push_back(ARMOperand::CreateMem(BaseReg, nullptr, 0, ARM_AM::no_shift, 0, Align, false, S, E, *this, AlignmentLoc)); @@ -6050,7 +6050,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { AdjustedOffset = CE; } else AdjustedOffset = Offset; - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, AdjustedOffset, 0, + Operands.push_back(ARMOperand::CreateMem(BaseReg, AdjustedOffset, 0, ARM_AM::no_shift, 0, 0, false, S, E, *this)); @@ -6082,8 +6082,8 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { } E = Parser.getTok().getLoc(); - int OffsetRegNum = tryParseRegister(); - if (OffsetRegNum == -1) + MCRegister OffsetReg = tryParseRegister(); + if (!OffsetReg) return Error(E, "register expected"); // If there's a shift operator, handle it. 
@@ -6101,7 +6101,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) { E = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat right bracket token. - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum, + Operands.push_back(ARMOperand::CreateMem(BaseReg, nullptr, OffsetReg, ShiftType, ShiftImm, 0, isNegative, S, E, *this)); @@ -12077,16 +12077,16 @@ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) { // Parse fpreg SMLoc FPRegLoc = Parser.getTok().getLoc(); - int FPReg = tryParseRegister(); + MCRegister FPReg = tryParseRegister(); - if (check(FPReg == -1, FPRegLoc, "frame pointer register expected") || + if (check(!FPReg, FPRegLoc, "frame pointer register expected") || Parser.parseComma()) return true; // Parse spreg SMLoc SPRegLoc = Parser.getTok().getLoc(); - int SPReg = tryParseRegister(); - if (check(SPReg == -1, SPRegLoc, "stack pointer register expected") || + MCRegister SPReg = tryParseRegister(); + if (check(!SPReg, SPRegLoc, "stack pointer register expected") || check(SPReg != ARM::SP && SPReg != UC.getFPReg(), SPRegLoc, "register should be either $sp or the latest fp register")) return true; @@ -12404,8 +12404,8 @@ bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) { return Error(L, "unexpected .movsp directive"); SMLoc SPRegLoc = Parser.getTok().getLoc(); - int SPReg = tryParseRegister(); - if (SPReg == -1) + MCRegister SPReg = tryParseRegister(); + if (!SPReg) return Error(SPRegLoc, "register expected"); if (SPReg == ARM::SP || SPReg == ARM::PC) return Error(SPRegLoc, "sp and pc are not permitted in .movsp directive"); @@ -12542,8 +12542,8 @@ bool ARMAsmParser::parseDirectiveSEHSaveRegs(SMLoc L, bool Wide) { /// parseDirectiveSEHSaveSP /// ::= .seh_save_sp bool ARMAsmParser::parseDirectiveSEHSaveSP(SMLoc L) { - int Reg = tryParseRegister(); - if (Reg == -1 || !MRI->getRegClass(ARM::GPRRegClassID).contains(Reg)) + MCRegister Reg = tryParseRegister(); + if (!Reg || !MRI->getRegClass(ARM::GPRRegClassID).contains(Reg)) return Error(L, "expected GPR"); unsigned Index = MRI->getEncodingValue(Reg); if (Index > 14 || Index == 13) diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index 383dfcc31117c1..c016b2dd91dc67 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -72,7 +72,7 @@ class AVRAsmParser : public MCTargetAsmParser { int parseRegisterName(); int parseRegister(bool RestoreOnFailure = false); bool tryParseRegisterOperand(OperandVector &Operands); - bool tryParseExpression(OperandVector &Operands); + bool tryParseExpression(OperandVector &Operands, int64_t offset); bool tryParseRelocExpression(OperandVector &Operands); void eatComma(); @@ -418,7 +418,7 @@ bool AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) { return false; } -bool AVRAsmParser::tryParseExpression(OperandVector &Operands) { +bool AVRAsmParser::tryParseExpression(OperandVector &Operands, int64_t offset) { SMLoc S = Parser.getTok().getLoc(); if (!tryParseRelocExpression(Operands)) @@ -437,6 +437,11 @@ bool AVRAsmParser::tryParseExpression(OperandVector &Operands) { if (getParser().parseExpression(Expression)) return true; + if (offset) { + Expression = MCBinaryExpr::createAdd( + Expression, MCConstantExpr::create(offset, getContext()), getContext()); + } + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(AVROperand::CreateImm(Expression, S, E)); return false; @@ -529,8 +534,9 @@ bool 
AVRAsmParser::parseOperand(OperandVector &Operands, bool maybeReg) { [[fallthrough]]; case AsmToken::LParen: case AsmToken::Integer: + return tryParseExpression(Operands, 0); case AsmToken::Dot: - return tryParseExpression(Operands); + return tryParseExpression(Operands, 2); case AsmToken::Plus: case AsmToken::Minus: { // If the sign preceeds a number, parse the number, @@ -540,7 +546,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands, bool maybeReg) { case AsmToken::BigNum: case AsmToken::Identifier: case AsmToken::Real: - if (!tryParseExpression(Operands)) + if (!tryParseExpression(Operands, 0)) return false; break; default: @@ -643,6 +649,7 @@ bool AVRAsmParser::ParseInstruction(ParseInstructionInfo &Info, // These specific operands should be treated as addresses/symbols/labels, // other than registers. bool maybeReg = true; + if (OperandNum == 1) { std::array Insts = {"lds", "adiw", "sbiw", "ldi"}; for (auto Inst : Insts) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp index 0d29912bee2646..388d58a82214d1 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp @@ -94,6 +94,9 @@ static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, // Rightshifts the value by one. AVR::fixups::adjustBranchTarget(Value); + + // Jumps are relative to the current instruction. + Value -= 1; } /// 22-bit absolute fixup. @@ -513,15 +516,10 @@ bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm, switch ((unsigned)Fixup.getKind()) { default: return Fixup.getKind() >= FirstLiteralRelocationKind; - // Fixups which should always be recorded as relocations. case AVR::fixup_7_pcrel: case AVR::fixup_13_pcrel: - // Do not force relocation for PC relative branch like 'rjmp .', - // 'rcall . - off' and 'breq . + off'. 
- if (const auto *SymA = Target.getSymA()) - if (SymA->getSymbol().getName().size() == 0) - return false; - [[fallthrough]]; + // Always resolve relocations for PC-relative branches + return false; case AVR::fixup_call: return true; } diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h index f7bc6f958470b9..f07ae4c9baf1c6 100644 --- a/llvm/lib/Target/BPF/BPF.h +++ b/llvm/lib/Target/BPF/BPF.h @@ -28,6 +28,7 @@ FunctionPass *createBPFISelDag(BPFTargetMachine &TM); FunctionPass *createBPFMISimplifyPatchablePass(); FunctionPass *createBPFMIPeepholePass(); FunctionPass *createBPFMIPreEmitPeepholePass(); +FunctionPass *createBPFMIPreEmitCheckingPass(); InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, const BPFSubtarget &, @@ -36,6 +37,7 @@ InstructionSelector *createBPFInstructionSelector(const BPFTargetMachine &, void initializeBPFCheckAndAdjustIRPass(PassRegistry&); void initializeBPFDAGToDAGISelLegacyPass(PassRegistry &); void initializeBPFMIPeepholePass(PassRegistry &); +void initializeBPFMIPreEmitCheckingPass(PassRegistry &); void initializeBPFMIPreEmitPeepholePass(PassRegistry &); void initializeBPFMISimplifyPatchablePass(PassRegistry &); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td index 4baeeb017699d6..6c750af5c2fd92 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.td +++ b/llvm/lib/Target/BPF/BPFInstrInfo.td @@ -786,45 +786,13 @@ let Predicates = [BPFNoALU32] in { def : Pat<(i64 (extloadi32 ADDRri:$src)), (i64 (LDW ADDRri:$src))>; } -// Atomic XADD for BPFNoALU32 -class XADD - : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let Inst{7-4} = BPF_ADD.Value; - let BPFClass = BPF_STX; -} - // Atomic add, and, or, xor -class ATOMIC_NOFETCH - : TYPE_LD_ST + : TYPE_LD_ST { - bits<4> dst; - bits<20> addr; - - let Inst{51-48} = addr{19-16}; // base reg - let Inst{55-52} = dst; - let Inst{47-32} = addr{15-0}; // offset - let Inst{7-4} = Opc.Value; - let BPFClass = BPF_STX; -} - -class ATOMIC32_NOFETCH - : TYPE_LD_ST { bits<4> dst; bits<20> addr; @@ -838,16 +806,23 @@ class ATOMIC32_NOFETCH let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { - def XADDW32 : ATOMIC32_NOFETCH; - def XANDW32 : ATOMIC32_NOFETCH; - def XORW32 : ATOMIC32_NOFETCH; - def XXORW32 : ATOMIC32_NOFETCH; + def XADDW32 : ATOMIC_NOFETCH; + def XANDW32 : ATOMIC_NOFETCH; + def XORW32 : ATOMIC_NOFETCH; + def XXORW32 : ATOMIC_NOFETCH; } + def XADDW : ATOMIC_NOFETCH; + def XADDD : ATOMIC_NOFETCH; + def XANDD : ATOMIC_NOFETCH; + def XORD : ATOMIC_NOFETCH; + def XXORD : ATOMIC_NOFETCH; +} - def XADDD : ATOMIC_NOFETCH; - def XANDD : ATOMIC_NOFETCH; - def XORD : ATOMIC_NOFETCH; - def XXORD : ATOMIC_NOFETCH; +let Predicates = [BPFNoALU32] in { + def : Pat<(atomic_load_add_i32 ADDRri:$addr, GPR:$val), + (XADDW ADDRri:$addr, GPR:$val)>; + def : Pat<(atomic_load_add_i64 ADDRri:$addr, GPR:$val), + (XADDD ADDRri:$addr, GPR:$val)>; } // Atomic Fetch-and- operations @@ -887,13 +862,6 @@ class XFALU32; - def XFADDW : XFALU64; - } -} - let Constraints = "$dst = $val" in { let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in { def XFADDW32 : XFALU32; @@ -902,7 +870,9 @@ let Constraints = "$dst = $val" in { def XFXORW32 : XFALU32; } - def XFADDD : XFALU64; + let Predicates = [BPFHasALU32] in { + def XFADDD : XFALU64; + } def XFANDD : XFALU64; def XFORD : XFALU64; def XFXORD : XFALU64; diff --git 
a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp
new file mode 100644
index 00000000000000..24224f6c1e9e66
--- /dev/null
+++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -0,0 +1,181 @@
+//===-------------- BPFMIChecking.cpp - MI Checking Legality -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs checking to signal errors for certain illegal usages at
+// the MachineInstruction layer. Specifically, the result of an XADD{32,64}
+// insn should not be used. The pass runs at the PreEmit stage, right before
+// the machine code is emitted, at which point the register liveness
+// information is still available.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-mi-checking"
+
+namespace {
+
+struct BPFMIPreEmitChecking : public MachineFunctionPass {
+
+  static char ID;
+  MachineFunction *MF;
+  const TargetRegisterInfo *TRI;
+
+  BPFMIPreEmitChecking() : MachineFunctionPass(ID) {
+    initializeBPFMIPreEmitCheckingPass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  void processAtomicInsts();
+
+public:
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (!skipFunction(MF.getFunction())) {
+      initialize(MF);
+      processAtomicInsts();
+    }
+    return false;
+  }
+};
+
+// Initialize class variables.
+void BPFMIPreEmitChecking::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  TRI = MF->getSubtarget().getRegisterInfo();
+  LLVM_DEBUG(dbgs() << "*** BPF PreEmit checking pass ***\n\n");
+}
+
+// Make sure all Defs of XADD are dead, meaning any result of XADD insn is not
+// used.
+//
+// NOTE: The BPF backend hasn't enabled sub-register liveness tracking, so
+// when the source and destination operands of XADD are GPR32, there is no
+// sub-register dead info. If we rely on the generic
+// MachineInstr::allDefsAreDead, then we will raise a false alarm on a GPR32
+// Def.
+//
+// To support GPR32 Defs, ideally we could just enable sub-register liveness
+// tracking on the BPF backend, then allDefsAreDead could work on GPR32 Defs.
+// This requires implementing TargetSubtargetInfo::enableSubRegLiveness on
+// BPF.
+//
+// However, the sub-register liveness tracking module inside LLVM is actually
+// designed for the situation where one register could be split into more
+// than one sub-register, in which case each sub-register can have its own
+// liveness and killing one of them doesn't kill the others. So tracking
+// liveness for each sub-register makes sense there.
+//
+// For BPF, each 64-bit register has only one 32-bit sub-register. This is
+// exactly the case that LLVM thinks brings no benefit for sub-register
+// tracking, because the live range of the sub-register must always equal
+// that of its parent register, so liveness tracking is disabled even if the
+// back-end has implemented enableSubRegLiveness. The detailed information
+// is at r232695:
+//
+//   Author: Matthias Braun
+//   Date:   Thu Mar 19 00:21:58 2015 +0000
+//   Do not track subregister liveness when it brings no benefits
+//
+// Hence, for BPF, we enhance MachineInstr::allDefsAreDead. Given that the
+// solo sub-register always has the same liveness as its parent register,
+// LLVM already attaches an implicit 64-bit register Def whenever there is
+// a sub-register Def. The liveness of the implicit 64-bit Def is available.
+// For example, for "lock *(u32 *)(r0 + 4) += w9", the MachineOperand info
+// could be:
+//
+//   $w9 = XADDW32 killed $r0, 4, $w9(tied-def 0),
+//                 implicit killed $r9, implicit-def dead $r9
+//
+// Even though w9 is not marked as Dead, the parent register r9 is marked as
+// Dead correctly, and it is safe to use such information for our purpose.
+static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
+  const MCRegisterClass *GPR64RegClass =
+      &BPFMCRegisterClasses[BPF::GPRRegClassID];
+  std::vector<unsigned> GPR32LiveDefs;
+  std::vector<unsigned> GPR64DeadDefs;
+
+  for (const MachineOperand &MO : MI.operands()) {
+    bool RegIsGPR64;
+
+    if (!MO.isReg() || MO.isUse())
+      continue;
+
+    RegIsGPR64 = GPR64RegClass->contains(MO.getReg());
+    if (!MO.isDead()) {
+      // It is a GPR64 live Def, we are sure it is live.
+      if (RegIsGPR64)
+        return true;
+      // It is a GPR32 live Def, we are unsure whether it is really dead due
+      // to no sub-register liveness tracking. Push it to the vector for a
+      // deferred check.
+      GPR32LiveDefs.push_back(MO.getReg());
+      continue;
+    }
+
+    // Record any GPR64 dead Def as some unmarked GPR32 could be an alias of
+    // its low 32 bits.
+    if (RegIsGPR64)
+      GPR64DeadDefs.push_back(MO.getReg());
+  }
+
+  // No GPR32 live Def, safe to return false.
+  if (GPR32LiveDefs.empty())
+    return false;
+
+  // No GPR64 dead Def, so none of those GPR32 live Defs can have an alias,
+  // therefore they must be truly live; safe to return true.
+  if (GPR64DeadDefs.empty())
+    return true;
+
+  // Otherwise, return true if any aliased SuperReg of a GPR32 is not dead;
+  // see the final scan below.
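The decision rule that the final scan implements can be stated without any LLVM types: a 32-bit live Def is excused only when its unique 64-bit super-register is an explicitly dead Def. A hedged, self-contained sketch of just that set logic (anyTrulyLiveDef and superReg are illustrative stand-ins, not names from this patch):

    #include <algorithm>
    #include <vector>

    // Returns true if some 32-bit def must be considered live: either there
    // is no dead-def info to excuse it, or its 64-bit super-register is not
    // among the dead defs.
    bool anyTrulyLiveDef(const std::vector<int> &Gpr32LiveDefs,
                         const std::vector<int> &Gpr64DeadDefs,
                         int (*superReg)(int)) {
      if (Gpr32LiveDefs.empty())
        return false; // no 32-bit defs to worry about
      if (Gpr64DeadDefs.empty())
        return true; // no dead super-reg info to excuse them
      return std::any_of(Gpr32LiveDefs.begin(), Gpr32LiveDefs.end(),
                         [&](int R) {
                           return std::find(Gpr64DeadDefs.begin(),
                                            Gpr64DeadDefs.end(), superReg(R)) ==
                                  Gpr64DeadDefs.end();
                         });
    }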
+ for (auto I : GPR32LiveDefs) + for (MCPhysReg SR : TRI->superregs(I)) + if (!llvm::is_contained(GPR64DeadDefs, SR)) + return true; + + return false; +} + +void BPFMIPreEmitChecking::processAtomicInsts() { + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != BPF::XADDW && MI.getOpcode() != BPF::XADDD) + continue; + + LLVM_DEBUG(MI.dump()); + if (hasLiveDefs(MI, TRI)) { + DebugLoc Empty; + const DebugLoc &DL = MI.getDebugLoc(); + const Function &F = MF->getFunction(); + F.getContext().diagnose(DiagnosticInfoUnsupported{ + F, "Invalid usage of the XADD return value", DL}); + } + } + } +} + +} // namespace + +INITIALIZE_PASS(BPFMIPreEmitChecking, "bpf-mi-pemit-checking", + "BPF PreEmit Checking", false, false) + +char BPFMIPreEmitChecking::ID = 0; +FunctionPass *llvm::createBPFMIPreEmitCheckingPass() { + return new BPFMIPreEmitChecking(); +} diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 64b115b8fc8afa..7d91fa8bb824cf 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -178,6 +178,7 @@ void BPFPassConfig::addMachineSSAOptimization() { } void BPFPassConfig::addPreEmitPass() { + addPass(createBPFMIPreEmitCheckingPass()); if (getOptLevel() != CodeGenOptLevel::None) if (!DisableMIPeephole) addPass(createBPFMIPreEmitPeepholePass()); diff --git a/llvm/lib/Target/BPF/CMakeLists.txt b/llvm/lib/Target/BPF/CMakeLists.txt index 253660d4d62e37..eade4cacb7100e 100644 --- a/llvm/lib/Target/BPF/CMakeLists.txt +++ b/llvm/lib/Target/BPF/CMakeLists.txt @@ -39,6 +39,7 @@ add_llvm_target(BPFCodeGen BPFSubtarget.cpp BPFTargetMachine.cpp BPFMIPeephole.cpp + BPFMIChecking.cpp BPFMISimplifyPatchable.cpp BTFDebug.cpp diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index c2ae4a0734b6a7..b8f1cdfd2cb354 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1291,14 +1291,32 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc, Imm = SignExtend64<32>(Imm); for (LoongArchMatInt::Inst &Inst : LoongArchMatInt::generateInstSeq(Imm)) { - unsigned Opc = Inst.Opc; - if (Opc == LoongArch::LU12I_W) - Out.emitInstruction(MCInstBuilder(Opc).addReg(DestReg).addImm(Inst.Imm), - getSTI()); - else + switch (Inst.Opc) { + case LoongArch::LU12I_W: Out.emitInstruction( - MCInstBuilder(Opc).addReg(DestReg).addReg(SrcReg).addImm(Inst.Imm), + MCInstBuilder(Inst.Opc).addReg(DestReg).addImm(Inst.Imm), getSTI()); + break; + case LoongArch::ADDI_W: + case LoongArch::ORI: + case LoongArch::LU32I_D: + case LoongArch::LU52I_D: + Out.emitInstruction( + MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm( + Inst.Imm), getSTI()); + break; + case LoongArch::BSTRINS_D: + Out.emitInstruction(MCInstBuilder(Inst.Opc) + .addReg(DestReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addImm(Inst.Imm >> 32) + .addImm(Inst.Imm & 0xFF), + getSTI()); + break; + default: + llvm_unreachable("unexpected opcode generated by LoongArchMatInt"); + } SrcReg = DestReg; } } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index b6ade6b978d2ce..70ed1e6fbdbdac 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -62,10 +62,26 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { // The instructions in 
the sequence are handled here. for (LoongArchMatInt::Inst &Inst : LoongArchMatInt::generateInstSeq(Imm)) { SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, GRLenVT); - if (Inst.Opc == LoongArch::LU12I_W) - Result = CurDAG->getMachineNode(LoongArch::LU12I_W, DL, GRLenVT, SDImm); - else + switch (Inst.Opc) { + case LoongArch::LU12I_W: + Result = CurDAG->getMachineNode(Inst.Opc, DL, GRLenVT, SDImm); + break; + case LoongArch::ADDI_W: + case LoongArch::ORI: + case LoongArch::LU32I_D: + case LoongArch::LU52I_D: Result = CurDAG->getMachineNode(Inst.Opc, DL, GRLenVT, SrcReg, SDImm); + break; + case LoongArch::BSTRINS_D: + Result = CurDAG->getMachineNode( + Inst.Opc, DL, GRLenVT, + {SrcReg, SrcReg, + CurDAG->getTargetConstant(Inst.Imm >> 32, DL, GRLenVT), + CurDAG->getTargetConstant(Inst.Imm & 0xFF, DL, GRLenVT)}); + break; + default: + llvm_unreachable("unexpected opcode generated by LoongArchMatInt"); + } SrcReg = SDValue(Result, 0); } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 9059da460f1358..d1af65192ee612 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -210,6 +210,14 @@ void LoongArchInstrInfo::movImm(MachineBasicBlock &MBB, .addImm(Inst.Imm) .setMIFlag(Flag); break; + case LoongArch::BSTRINS_D: + BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg) + .addReg(SrcReg, RegState::Kill) + .addReg(SrcReg, RegState::Kill) + .addImm(Inst.Imm >> 32) + .addImm(Inst.Imm & 0xFF) + .setMIFlag(Flag); + break; default: assert(false && "Unknown insn emitted by LoongArchMatInt"); } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp index 1509c436c81098..a7823470382756 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.cpp @@ -26,11 +26,13 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { const int64_t Lo12 = Val & 0xFFF; InstSeq Insts; + // LU52I_D used for: Bits[63:52] | Bits[51:0]. if (Highest12 != 0 && SignExtend64<52>(Val) == 0) { Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12))); return Insts; } + // lo32 if (Hi20 == 0) Insts.push_back(Inst(LoongArch::ORI, Lo12)); else if (SignExtend32<1>(Lo12 >> 11) == SignExtend32<20>(Hi20)) @@ -41,11 +43,83 @@ LoongArchMatInt::InstSeq LoongArchMatInt::generateInstSeq(int64_t Val) { Insts.push_back(Inst(LoongArch::ORI, Lo12)); } + // hi32 + // Higher20 if (SignExtend32<1>(Hi20 >> 19) != SignExtend32<20>(Higher20)) Insts.push_back(Inst(LoongArch::LU32I_D, SignExtend64<20>(Higher20))); + // Highest12 if (SignExtend32<1>(Higher20 >> 19) != SignExtend32<12>(Highest12)) Insts.push_back(Inst(LoongArch::LU52I_D, SignExtend64<12>(Highest12))); + size_t N = Insts.size(); + if (N < 3) + return Insts; + + // When the number of instruction sequences is greater than 2, we have the + // opportunity to optimize using the BSTRINS_D instruction. The scenario is as + // follows: + // + // N of Insts = 3 + // 1. ORI + LU32I_D + LU52I_D => ORI + BSTRINS_D, TmpVal = ORI + // 2. ADDI_W + LU32I_D + LU52I_D => ADDI_W + BSTRINS_D, TmpVal = ADDI_W + // 3. LU12I_W + ORI + LU32I_D => ORI + BSTRINS_D, TmpVal = ORI + // 4. LU12I_W + LU32I_D + LU52I_D => LU12I_W + BSTRINS_D, TmpVal = LU12I_W + // + // N of Insts = 4 + // 5. 
LU12I_W + ORI + LU32I_D + LU52I_D => LU12I_W + ORI + BSTRINS_D + // => ORI + LU52I_D + BSTRINS_D + // TmpVal = (LU12I_W | ORI) or (ORI | LU52I_D) + // The BSTRINS_D instruction will use the `TmpVal` to construct the `Val`. + uint64_t TmpVal1 = 0; + uint64_t TmpVal2 = 0; + switch (Insts[0].Opc) { + default: + llvm_unreachable("unexpected opcode"); + break; + case LoongArch::LU12I_W: + if (Insts[1].Opc == LoongArch::ORI) { + TmpVal1 = Insts[1].Imm; + if (N == 3) + break; + TmpVal2 = Insts[3].Imm << 52 | TmpVal1; + } + TmpVal1 |= Insts[0].Imm << 12; + break; + case LoongArch::ORI: + case LoongArch::ADDI_W: + TmpVal1 = Insts[0].Imm; + break; + } + + uint64_t Msb = 32; + uint64_t HighMask = ~((1ULL << (Msb + 1)) - 1); + for (; Msb < 64; ++Msb, HighMask = (HighMask << 1) + 1) { + for (uint64_t Lsb = Msb; Lsb > 0; --Lsb) { + uint64_t LowMask = (1ULL << Lsb) - 1; + uint64_t Mask = HighMask | LowMask; + uint64_t LsbToZero = TmpVal1 & ((1ULL << (Msb - Lsb + 1)) - 1); + uint64_t MsbToLsb = LsbToZero << Lsb; + if ((MsbToLsb | (TmpVal1 & Mask)) == (uint64_t)Val) { + if (Insts[1].Opc == LoongArch::ORI && N == 3) + Insts[0] = Insts[1]; + Insts.pop_back_n(2); + Insts.push_back(Inst(LoongArch::BSTRINS_D, Msb << 32 | Lsb)); + return Insts; + } + if (TmpVal2 != 0) { + LsbToZero = TmpVal2 & ((1ULL << (Msb - Lsb + 1)) - 1); + MsbToLsb = LsbToZero << Lsb; + if ((MsbToLsb | (TmpVal2 & Mask)) == (uint64_t)Val) { + Insts[0] = Insts[1]; + Insts[1] = Insts[3]; + Insts.pop_back_n(2); + Insts.push_back(Inst(LoongArch::BSTRINS_D, Msb << 32 | Lsb)); + return Insts; + } + } + } + } + return Insts; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h index be1b425894de1a..3a3c12c353fb8e 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMatInt.h @@ -16,6 +16,7 @@ namespace llvm { namespace LoongArchMatInt { struct Inst { unsigned Opc; + // Imm: Opc's imm operand, if Opc == BSTRINS_D, Imm = MSB << 32 | LSB. int64_t Imm; Inst(unsigned Opc, int64_t Imm) : Opc(Opc), Imm(Imm) {} }; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 09928dcc1f489a..e990325ac38279 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -891,16 +891,30 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA}; // TODO: support more vp ops. 
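Referring back to the LoongArchMatInt change above: the `Inst.Imm` packing for BSTRINS_D (msb in the high word, lsb in the low byte) is what the three emitters unpack with `Inst.Imm >> 32` and `Inst.Imm & 0xFF`. A small hedged sketch of that round trip:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Pack as generateInstSeq does: Msb << 32 | Lsb.
      uint64_t Msb = 63, Lsb = 32;
      int64_t Imm = static_cast<int64_t>(Msb << 32 | Lsb);
      // Unpack as the emitters do (BSTRINS_D operands: msb, then lsb).
      assert((Imm >> 32) == 63);
      assert((Imm & 0xFF) == 32);
      return 0;
    }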
- static const unsigned ZvfhminPromoteVPOps[] = { - ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, - ISD::VP_FDIV, ISD::VP_FNEG, ISD::VP_FABS, - ISD::VP_FMA, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, - ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_SQRT, - ISD::VP_FMINNUM, ISD::VP_FMAXNUM, ISD::VP_FCEIL, - ISD::VP_FFLOOR, ISD::VP_FROUND, ISD::VP_FROUNDEVEN, - ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT, - ISD::VP_FNEARBYINT, ISD::VP_SETCC, ISD::VP_FMINIMUM, - ISD::VP_FMAXIMUM, ISD::VP_REDUCE_FMINIMUM, ISD::VP_REDUCE_FMAXIMUM}; + static const unsigned ZvfhminPromoteVPOps[] = {ISD::VP_FADD, + ISD::VP_FSUB, + ISD::VP_FMUL, + ISD::VP_FDIV, + ISD::VP_FMA, + ISD::VP_REDUCE_FADD, + ISD::VP_REDUCE_SEQ_FADD, + ISD::VP_REDUCE_FMIN, + ISD::VP_REDUCE_FMAX, + ISD::VP_SQRT, + ISD::VP_FMINNUM, + ISD::VP_FMAXNUM, + ISD::VP_FCEIL, + ISD::VP_FFLOOR, + ISD::VP_FROUND, + ISD::VP_FROUNDEVEN, + ISD::VP_FROUNDTOZERO, + ISD::VP_FRINT, + ISD::VP_FNEARBYINT, + ISD::VP_SETCC, + ISD::VP_FMINIMUM, + ISD::VP_FMAXIMUM, + ISD::VP_REDUCE_FMINIMUM, + ISD::VP_REDUCE_FMAXIMUM}; // Sets common operation actions on RVV floating-point vector types. const auto SetCommonVFPActions = [&](MVT VT) { @@ -16440,6 +16454,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SDLoc DL(N); SDValue Op0 = N->getOperand(0); MVT VT = N->getSimpleValueType(0); + + // Constant fold. + if (auto *CFP = dyn_cast(Op0)) { + APInt Val = CFP->getValueAPF().bitcastToAPInt().sext(VT.getSizeInBits()); + return DAG.getConstant(Val, DL, VT); + } + // If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the // conversion is unnecessary and can be replaced with the FMV_W_X_RV64 // operand. Similar for FMV_X_ANYEXTH and FMV_H_X. diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 0f8f9442877e33..6df3b951f5a06f 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -70,6 +70,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; + bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; /// Maps uses of V0 to the corresponding def of V0. DenseMap V0Defs; @@ -165,6 +166,9 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { if (VL.isIdenticalTo(SrcVL) || !isVLKnownLE(VL, SrcVL)) return false; + if (!ensureDominates(VL, *Src)) + return false; + if (VL.isImm()) SrcVL.ChangeToImmediate(VL.getImm()); else if (VL.isReg()) @@ -456,6 +460,26 @@ static bool dominates(MachineBasicBlock::const_iterator A, return &*I == A; } +/// If the register in \p MO doesn't dominate \p Src, try to move \p Src so it +/// does. Returns false if doesn't dominate and we can't move. \p MO must be in +/// the same basic block as \Src. +bool RISCVVectorPeephole::ensureDominates(const MachineOperand &MO, + MachineInstr &Src) const { + assert(MO.getParent()->getParent() == Src.getParent()); + if (!MO.isReg() || MO.getReg() == RISCV::NoRegister) + return true; + + MachineInstr *Def = MRI->getVRegDef(MO.getReg()); + if (Def->getParent() == Src.getParent() && !dominates(Def, Src)) { + if (!isSafeToMove(Src, *Def->getNextNode())) + return false; + // FIXME: Update V0Defs + Src.moveBefore(Def->getNextNode()); + } + + return true; +} + /// If a PseudoVMV_V_V is the only user of its input, fold its passthru and VL /// into it. 
/// @@ -501,15 +525,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { return false; // If the new passthru doesn't dominate Src, try to move Src so it does. - if (Passthru.getReg() != RISCV::NoRegister) { - MachineInstr *PassthruDef = MRI->getVRegDef(Passthru.getReg()); - if (PassthruDef->getParent() == Src->getParent() && - !dominates(PassthruDef, Src)) { - if (!isSafeToMove(*Src, *PassthruDef->getNextNode())) - return false; - Src->moveBefore(PassthruDef->getNextNode()); - } - } + if (!ensureDominates(Passthru, *Src)) + return false; if (SrcPassthru.getReg() != Passthru.getReg()) { SrcPassthru.setReg(Passthru.getReg()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4578ff7f715146..5cc084f3ab1387 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -275,8 +275,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(Op, T, Expand); // But saturating fp_to_int converstions are - for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) + for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) { setOperationAction(Op, MVT::v4i32, Custom); + if (Subtarget->hasFP16()) { + setOperationAction(Op, MVT::v8i16, Custom); + } + } // Support vector extending for (auto T : MVT::integer_fixedlen_vector_valuetypes()) { @@ -2475,6 +2479,9 @@ SDValue WebAssemblyTargetLowering::LowerFP_TO_INT_SAT(SDValue Op, if (ResT == MVT::v4i32 && SatVT == MVT::i32) return Op; + if (ResT == MVT::v8i16 && SatVT == MVT::i16) + return Op; + return SDValue(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 887278e9c12ef3..9d17d90f530541 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -165,8 +165,9 @@ def F16x8 : Vec { let prefix = "f16x8"; } -// TODO: Include F16x8 here when half precision is better supported. -defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2]; +// TODO: Remove StdVecs when the F16x8 works every where StdVecs is used. 
+defvar StdVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2]; +defvar AllVecs = !listconcat(StdVecs, [F16x8]); defvar IntVecs = [I8x16, I16x8, I32x4, I64x2]; //===----------------------------------------------------------------------===// @@ -188,7 +189,7 @@ defm LOAD_V128_A64 : } // Def load patterns from WebAssemblyInstrMemory.td for vector types -foreach vec = AllVecs in { +foreach vec = StdVecs in { defm : LoadPat; } @@ -217,7 +218,7 @@ defm "" : SIMDLoadSplat<16, 8>; defm "" : SIMDLoadSplat<32, 9>; defm "" : SIMDLoadSplat<64, 10>; -foreach vec = AllVecs in { +foreach vec = StdVecs in { defvar inst = "LOAD"#vec.lane_bits#"_SPLAT"; defm : LoadPat, @@ -389,7 +390,7 @@ defm STORE_V128_A64 : } // Def store patterns from WebAssemblyInstrMemory.td for vector types -foreach vec = AllVecs in { +foreach vec = StdVecs in { defm : StorePat; } @@ -513,7 +514,7 @@ defm "" : ConstVec; // Match splat(x) -> const.v128(x, ..., x) -foreach vec = AllVecs in { +foreach vec = StdVecs in { defvar numEls = !div(vec.vt.Size, vec.lane_bits); defvar isFloat = !or(!eq(vec.lane_vt, f32), !eq(vec.lane_vt, f64)); defvar immKind = !if(isFloat, fpimm, imm); @@ -557,7 +558,7 @@ defm SHUFFLE : // Shuffles after custom lowering def wasm_shuffle_t : SDTypeProfile<1, 18, []>; def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>; -foreach vec = AllVecs in { +foreach vec = StdVecs in { // The @llvm.wasm.shuffle intrinsic has immediate arguments that become TargetConstants. def : Pat<(vec.vt (wasm_shuffle (vec.vt V128:$x), (vec.vt V128:$y), (i32 timm:$m0), (i32 timm:$m1), @@ -627,7 +628,7 @@ defm SPLAT_F16x8 : "f16x8.splat\t$dst, $x", "f16x8.splat", 0x120>; // scalar_to_vector leaves high lanes undefined, so can be a splat -foreach vec = AllVecs in +foreach vec = StdVecs in def : Pat<(vec.vt (scalar_to_vector (vec.lane_vt vec.lane_rc:$x))), (!cast("SPLAT_"#vec) $x)>; @@ -762,7 +763,7 @@ multiclass SIMDConditionInt baseInst> { multiclass SIMDConditionFP baseInst> { defm "" : SIMDCondition; defm "" : SIMDCondition; - defm "" : HalfPrecisionCondition; + defm "" : HalfPrecisionCondition; } // Equality: eq @@ -880,7 +881,7 @@ defm BITSELECT : SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins), [], "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 82>; -foreach vec = AllVecs in +foreach vec = StdVecs in def : Pat<(vec.vt (int_wasm_bitselect (vec.vt V128:$v1), (vec.vt V128:$v2), (vec.vt V128:$c))), (BITSELECT $v1, $v2, $c)>; @@ -906,7 +907,7 @@ def : Pat<(vec.vt (xor (and (xor (vec.vt V128:$v1), (vec.vt V128:$v2)), (BITSELECT $v2, $v1, $c)>; // Also implement vselect in terms of bitselect -foreach vec = AllVecs in +foreach vec = StdVecs in def : Pat<(vec.vt (vselect (vec.int_vt V128:$c), (vec.vt V128:$v1), (vec.vt V128:$v2))), (BITSELECT $v1, $v2, $c)>; @@ -916,7 +917,7 @@ defm SELECT_V128 : I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs, I32:$cond), (outs), (ins), [], "v128.select\t$dst, $lhs, $rhs, $cond", "v128.select", 0x1b>; -foreach vec = AllVecs in { +foreach vec = StdVecs in { def : Pat<(select I32:$cond, (vec.vt V128:$lhs), (vec.vt V128:$rhs)), (SELECT_V128 $lhs, $rhs, $cond)>; @@ -1217,7 +1218,7 @@ multiclass SIMDUnaryFP baseInst> { // Unlike F32x4 and F64x2 there's not a gap in the opcodes between "neg" and // "sqrt" so subtract one from the offset. 
defm "" : HalfPrecisionUnary; + !add(baseInst,!if(!eq(name, "sqrt"), 79, 80))>; } // Absolute value: abs @@ -1238,10 +1239,10 @@ defm CEIL : SIMDUnary; defm FLOOR : SIMDUnary; defm TRUNC: SIMDUnary; defm NEAREST: SIMDUnary; -defm CEIL : HalfPrecisionUnary; -defm FLOOR : HalfPrecisionUnary; -defm TRUNC : HalfPrecisionUnary; -defm NEAREST : HalfPrecisionUnary; +defm CEIL : HalfPrecisionUnary; +defm FLOOR : HalfPrecisionUnary; +defm TRUNC : HalfPrecisionUnary; +defm NEAREST : HalfPrecisionUnary; // WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. def : Pat<(v4f32 (frint (v4f32 V128:$src))), (NEAREST_F32x4 V128:$src)>; @@ -1260,7 +1261,7 @@ def : Pat<(v8f16 (froundeven (v8f16 V128:$src))), (NEAREST_F16x8 V128:$src)>; multiclass SIMDBinaryFP baseInst> { defm "" : SIMDBinary; defm "" : SIMDBinary; - defm "" : HalfPrecisionBinary; + defm "" : HalfPrecisionBinary; } // Addition: add @@ -1361,8 +1362,8 @@ multiclass HalfPrecisionConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Support the saturating variety as well. def trunc_s_sat32 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i32)>; @@ -1370,6 +1371,11 @@ def trunc_u_sat32 : PatFrag<(ops node:$x), (fp_to_uint_sat $x, i32)>; def : Pat<(v4i32 (trunc_s_sat32 (v4f32 V128:$src))), (fp_to_sint_I32x4 $src)>; def : Pat<(v4i32 (trunc_u_sat32 (v4f32 V128:$src))), (fp_to_uint_I32x4 $src)>; +def trunc_s_sat16 : PatFrag<(ops node:$x), (fp_to_sint_sat $x, i16)>; +def trunc_u_sat16 : PatFrag<(ops node:$x), (fp_to_uint_sat $x, i16)>; +def : Pat<(v8i16 (trunc_s_sat16 (v8f16 V128:$src))), (fp_to_sint_I16x8 $src)>; +def : Pat<(v8i16 (trunc_u_sat16 (v8f16 V128:$src))), (fp_to_uint_I16x8 $src)>; + def trunc_sat_zero_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; def trunc_sat_zero_s : SDNode<"WebAssemblyISD::TRUNC_SAT_ZERO_S", trunc_sat_zero_t>; @@ -1388,8 +1394,8 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; -defm "" : HalfPrecisionConvert; -defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; +defm "" : HalfPrecisionConvert; // Extending operations // TODO: refactor this to be uniform for i64x2 if the numbering is not changed. 
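The new trunc_sat16 patterns above let `@llvm.fptosi.sat`/`@llvm.fptoui.sat` on v8f16 select the f16x8 conversion instructions. Per lane, saturating conversion means NaN becomes 0 and out-of-range values clamp to the integer limits; a hedged scalar model of the signed case (using float to stand in for the half lane type):

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Scalar model of fp_to_sint_sat to i16: NaN -> 0, otherwise clamp to
    // [INT16_MIN, INT16_MAX] and truncate toward zero.
    static int16_t truncSatS16(float X) {
      if (std::isnan(X))
        return 0;
      if (X <= static_cast<float>(std::numeric_limits<int16_t>::min()))
        return std::numeric_limits<int16_t>::min();
      if (X >= static_cast<float>(std::numeric_limits<int16_t>::max()))
        return std::numeric_limits<int16_t>::max();
      return static_cast<int16_t>(X);
    }

    int main() {
      assert(truncSatS16(NAN) == 0);
      assert(truncSatS16(1e9f) == 32767);
      assert(truncSatS16(-1e9f) == -32768);
      assert(truncSatS16(-3.7f) == -3);
    }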
@@ -1532,7 +1538,7 @@ multiclass SIMDMADD simdopA, bits<32> simdopS, list defm "" : SIMDMADD; defm "" : SIMDMADD; -defm "" : SIMDMADD; +defm "" : SIMDMADD; //===----------------------------------------------------------------------===// // Laneselect diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 9cc5ed5d89ad70..a62fb7f723cdbc 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -3009,15 +3009,15 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { case Intrinsic::x86_avx512_vpermi2var_d_128: case Intrinsic::x86_avx512_vpermi2var_d_256: case Intrinsic::x86_avx512_vpermi2var_d_512: - case Intrinsic::x86_avx512_vpermi2var_hi_128: - case Intrinsic::x86_avx512_vpermi2var_hi_256: - case Intrinsic::x86_avx512_vpermi2var_hi_512: - case Intrinsic::x86_avx512_vpermi2var_pd_128: - case Intrinsic::x86_avx512_vpermi2var_pd_256: - case Intrinsic::x86_avx512_vpermi2var_pd_512: - case Intrinsic::x86_avx512_vpermi2var_ps_128: - case Intrinsic::x86_avx512_vpermi2var_ps_256: - case Intrinsic::x86_avx512_vpermi2var_ps_512: + case Intrinsic::x86_avx512_vpermi2var_hi_128: + case Intrinsic::x86_avx512_vpermi2var_hi_256: + case Intrinsic::x86_avx512_vpermi2var_hi_512: + case Intrinsic::x86_avx512_vpermi2var_pd_128: + case Intrinsic::x86_avx512_vpermi2var_pd_256: + case Intrinsic::x86_avx512_vpermi2var_pd_512: + case Intrinsic::x86_avx512_vpermi2var_ps_128: + case Intrinsic::x86_avx512_vpermi2var_ps_256: + case Intrinsic::x86_avx512_vpermi2var_ps_512: case Intrinsic::x86_avx512_vpermi2var_q_128: case Intrinsic::x86_avx512_vpermi2var_q_256: case Intrinsic::x86_avx512_vpermi2var_q_512: diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 603a1565e48c45..79746201133bdd 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1762,54 +1762,52 @@ static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, } } -static bool -allBBPathsGoThroughCold(BasicBlock *BB, - SmallDenseMap &Visited) { - // If BB contains a cold callsite this path through the CG is cold. - // Ignore whether the instructions actually are guranteed to transfer - // execution. Divergent behavior is considered unlikely. - if (any_of(*BB, [](Instruction &I) { - if (auto *CB = dyn_cast(&I)) - return CB->hasFnAttr(Attribute::Cold); - return false; - })) { - Visited[BB] = true; - return true; - } - - auto Succs = successors(BB); - // We found a path that doesn't go through any cold callsite. - if (Succs.empty()) - return false; +static bool allPathsGoThroughCold(Function &F) { + SmallDenseMap ColdPaths; + ColdPaths[&F.front()] = false; + SmallVector Jobs; + Jobs.push_back(&F.front()); + + while (!Jobs.empty()) { + BasicBlock *BB = Jobs.pop_back_val(); + + // If block contains a cold callsite this path through the CG is cold. + // Ignore whether the instructions actually are guaranteed to transfer + // execution. Divergent behavior is considered unlikely. + if (any_of(*BB, [](Instruction &I) { + if (auto *CB = dyn_cast(&I)) + return CB->hasFnAttr(Attribute::Cold); + return false; + })) { + ColdPaths[BB] = true; + continue; + } - // We didn't find a cold callsite in this BB, so check that all successors - // contain a cold callsite (or that their successors do). 
- // Potential TODO: We could use static branch hints to assume certain - // successor paths are inherently cold, irrespective of if they contain a cold - // callsite. - for (auto *Succ : Succs) { - // Start with false, this is necessary to ensure we don't turn loops into - // cold. - auto R = Visited.try_emplace(Succ, false); - if (!R.second) { - if (R.first->second) - continue; + auto Succs = successors(BB); + // We found a path that doesn't go through any cold callsite. + if (Succs.empty()) return false; + + // We didn't find a cold callsite in this BB, so check that all successors + // contain a cold callsite (or that their successors do). + // Potential TODO: We could use static branch hints to assume certain + // successor paths are inherently cold, irrespective of if they contain a + // cold callsite. + for (BasicBlock *Succ : Succs) { + // Start with false, this is necessary to ensure we don't turn loops into + // cold. + auto [Iter, Inserted] = ColdPaths.try_emplace(Succ, false); + if (!Inserted) { + if (Iter->second) + continue; + return false; + } + Jobs.push_back(Succ); } - if (!allBBPathsGoThroughCold(Succ, Visited)) - return false; - Visited[Succ] = true; } - return true; } -static bool allPathsGoThroughCold(Function &F) { - SmallDenseMap Visited; - Visited[&F.front()] = false; - return allBBPathsGoThroughCold(&F.front(), Visited); -} - // Set the cold function attribute if possible. static void addColdAttrs(const SCCNodeSet &SCCNodes, SmallSet &Changed) { diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 36f5cd7fd9e6cb..69e5835bee8a5e 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -598,24 +598,6 @@ void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); TargetTriple = Triple(M.getTargetTriple()); - for (auto &F : M.functions()) { - // Remove memory attributes that are invalid with HWASan. - // HWASan checks read from shadow, which invalidates memory(argmem: *) - // Short granule checks on function arguments read from the argument memory - // (last byte of the granule), which invalidates writeonly. - // - // This is not only true for sanitized functions, because AttrInfer can - // infer those attributes on libc functions, which is not true if those - // are instrumented (Android) or intercepted. - - // nobuiltin makes sure later passes don't restore assumptions about - // the function. - F.addFnAttr(llvm::Attribute::NoBuiltin); - F.removeFnAttr(llvm::Attribute::Memory); - for (auto &A : F.args()) - A.removeAttr(llvm::Attribute::WriteOnly); - } - // x86_64 currently has two modes: // - Intel LAM (default) // - pointer aliasing (heap only) @@ -1640,6 +1622,14 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); + // Remove memory attributes that are about to become invalid. + // HWASan checks read from shadow, which invalidates memory(argmem: *) + // Short granule checks on function arguments read from the argument memory + // (last byte of the granule), which invalidates writeonly. 
+ F.removeFnAttr(llvm::Attribute::Memory); + for (auto &A : F.args()) + A.removeAttr(llvm::Attribute::WriteOnly); + BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 526ae4e8834396..86c7dceffc5245 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2537,14 +2537,19 @@ static bool hoistAdd(ICmpInst::Predicate Pred, Value *VariantLHS, Value *InvariantRHS, ICmpInst &ICmp, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, DominatorTree *DT) { - assert(ICmpInst::isSigned(Pred) && "Not supported yet!"); assert(!L.isLoopInvariant(VariantLHS) && "Precondition."); assert(L.isLoopInvariant(InvariantRHS) && "Precondition."); + bool IsSigned = ICmpInst::isSigned(Pred); + // Try to represent VariantLHS as sum of invariant and variant operands. using namespace PatternMatch; Value *VariantOp, *InvariantOp; - if (!match(VariantLHS, m_NSWAdd(m_Value(VariantOp), m_Value(InvariantOp)))) + if (IsSigned && + !match(VariantLHS, m_NSWAdd(m_Value(VariantOp), m_Value(InvariantOp)))) + return false; + if (!IsSigned && + !match(VariantLHS, m_NUWAdd(m_Value(VariantOp), m_Value(InvariantOp)))) return false; // LHS itself is a loop-variant, try to represent it in the form: @@ -2559,17 +2564,20 @@ static bool hoistAdd(ICmpInst::Predicate Pred, Value *VariantLHS, // normal linear arithmetics). Overflows make things much more complicated, so // we want to avoid this. auto &DL = L.getHeader()->getDataLayout(); - bool ProvedNoOverflowAfterReassociate = - computeOverflowForSignedSub(InvariantRHS, InvariantOp, - SimplifyQuery(DL, DT, AC, &ICmp)) == - llvm::OverflowResult::NeverOverflows; - if (!ProvedNoOverflowAfterReassociate) + SimplifyQuery SQ(DL, DT, AC, &ICmp); + if (IsSigned && computeOverflowForSignedSub(InvariantRHS, InvariantOp, SQ) != + llvm::OverflowResult::NeverOverflows) + return false; + if (!IsSigned && + computeOverflowForUnsignedSub(InvariantRHS, InvariantOp, SQ) != + llvm::OverflowResult::NeverOverflows) return false; auto *Preheader = L.getLoopPreheader(); assert(Preheader && "Loop is not in simplify form?"); IRBuilder<> Builder(Preheader->getTerminator()); - Value *NewCmpOp = Builder.CreateSub(InvariantRHS, InvariantOp, "invariant.op", - /*HasNUW*/ false, /*HasNSW*/ true); + Value *NewCmpOp = + Builder.CreateSub(InvariantRHS, InvariantOp, "invariant.op", + /*HasNUW*/ !IsSigned, /*HasNSW*/ IsSigned); ICmp.setPredicate(Pred); ICmp.setOperand(0, VariantOp); ICmp.setOperand(1, NewCmpOp); @@ -2584,14 +2592,19 @@ static bool hoistSub(ICmpInst::Predicate Pred, Value *VariantLHS, Value *InvariantRHS, ICmpInst &ICmp, Loop &L, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU, AssumptionCache *AC, DominatorTree *DT) { - assert(ICmpInst::isSigned(Pred) && "Not supported yet!"); assert(!L.isLoopInvariant(VariantLHS) && "Precondition."); assert(L.isLoopInvariant(InvariantRHS) && "Precondition."); + bool IsSigned = ICmpInst::isSigned(Pred); + // Try to represent VariantLHS as sum of invariant and variant operands. 
using namespace PatternMatch; Value *VariantOp, *InvariantOp; - if (!match(VariantLHS, m_NSWSub(m_Value(VariantOp), m_Value(InvariantOp)))) + if (IsSigned && + !match(VariantLHS, m_NSWSub(m_Value(VariantOp), m_Value(InvariantOp)))) + return false; + if (!IsSigned && + !match(VariantLHS, m_NUWSub(m_Value(VariantOp), m_Value(InvariantOp)))) return false; bool VariantSubtracted = false; @@ -2613,16 +2626,26 @@ static bool hoistSub(ICmpInst::Predicate Pred, Value *VariantLHS, // "C1 - C2" does not overflow. auto &DL = L.getHeader()->getDataLayout(); SimplifyQuery SQ(DL, DT, AC, &ICmp); - if (VariantSubtracted) { + if (VariantSubtracted && IsSigned) { // C1 - LV < C2 --> LV > C1 - C2 if (computeOverflowForSignedSub(InvariantOp, InvariantRHS, SQ) != llvm::OverflowResult::NeverOverflows) return false; - } else { + } else if (VariantSubtracted && !IsSigned) { + // C1 - LV < C2 --> LV > C1 - C2 + if (computeOverflowForUnsignedSub(InvariantOp, InvariantRHS, SQ) != + llvm::OverflowResult::NeverOverflows) + return false; + } else if (!VariantSubtracted && IsSigned) { // LV - C1 < C2 --> LV < C1 + C2 if (computeOverflowForSignedAdd(InvariantOp, InvariantRHS, SQ) != llvm::OverflowResult::NeverOverflows) return false; + } else { // !VariantSubtracted && !IsSigned + // LV - C1 < C2 --> LV < C1 + C2 + if (computeOverflowForUnsignedAdd(InvariantOp, InvariantRHS, SQ) != + llvm::OverflowResult::NeverOverflows) + return false; } auto *Preheader = L.getLoopPreheader(); assert(Preheader && "Loop is not in simplify form?"); @@ -2630,9 +2653,9 @@ static bool hoistSub(ICmpInst::Predicate Pred, Value *VariantLHS, Value *NewCmpOp = VariantSubtracted ? Builder.CreateSub(InvariantOp, InvariantRHS, "invariant.op", - /*HasNUW*/ false, /*HasNSW*/ true) + /*HasNUW*/ !IsSigned, /*HasNSW*/ IsSigned) : Builder.CreateAdd(InvariantOp, InvariantRHS, "invariant.op", - /*HasNUW*/ false, /*HasNSW*/ true); + /*HasNUW*/ !IsSigned, /*HasNSW*/ IsSigned); ICmp.setPredicate(Pred); ICmp.setOperand(0, VariantOp); ICmp.setOperand(1, NewCmpOp); @@ -2650,10 +2673,6 @@ static bool hoistAddSub(Instruction &I, Loop &L, ICFLoopSafetyInfo &SafetyInfo, if (!match(&I, m_ICmp(Pred, m_Value(LHS), m_Value(RHS)))) return false; - // TODO: Support unsigned predicates? - if (!ICmpInst::isSigned(Pred)) - return false; - // Put variant operand to LHS position. 
if (L.isLoopInvariant(LHS)) { std::swap(LHS, RHS); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index cf00299812bb7f..d378c6c3a4b01c 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -937,7 +937,6 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::NoUnwind: case Attribute::NoSanitizeBounds: case Attribute::NoSanitizeCoverage: - case Attribute::NoSanitizeRealtime: case Attribute::NullPointerIsValid: case Attribute::OptimizeForDebugging: case Attribute::OptForFuzzing: diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index f1f2d522f1cbaa..8a8d8afece6cb4 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1210,39 +1210,31 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast(Src->getType())->getElementType(); + auto getIdentity = [&]() { + Intrinsic::ID ID = getReductionIntrinsicID(RdxKind); + unsigned Opc = getArithmeticReductionInstruction(ID); + return ConstantExpr::getBinOpIdentity(Opc, SrcVecEltTy); + }; switch (RdxKind) { case RecurKind::Add: - return Builder.CreateAddReduce(Src); case RecurKind::Mul: - return Builder.CreateMulReduce(Src); case RecurKind::And: - return Builder.CreateAndReduce(Src); case RecurKind::Or: - return Builder.CreateOrReduce(Src); case RecurKind::Xor: - return Builder.CreateXorReduce(Src); - case RecurKind::FMulAdd: - case RecurKind::FAdd: - return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy), - Src); - case RecurKind::FMul: - return Builder.CreateFMulReduce(ConstantFP::get(SrcVecEltTy, 1.0), Src); case RecurKind::SMax: - return Builder.CreateIntMaxReduce(Src, true); case RecurKind::SMin: - return Builder.CreateIntMinReduce(Src, true); case RecurKind::UMax: - return Builder.CreateIntMaxReduce(Src, false); case RecurKind::UMin: - return Builder.CreateIntMinReduce(Src, false); case RecurKind::FMax: - return Builder.CreateFPMaxReduce(Src); case RecurKind::FMin: - return Builder.CreateFPMinReduce(Src); case RecurKind::FMinimum: - return Builder.CreateFPMinimumReduce(Src); case RecurKind::FMaximum: - return Builder.CreateFPMaximumReduce(Src); + return Builder.CreateUnaryIntrinsic(getReductionIntrinsicID(RdxKind), Src); + case RecurKind::FMulAdd: + case RecurKind::FAdd: + return Builder.CreateFAddReduce(getIdentity(), Src); + case RecurKind::FMul: + return Builder.CreateFMulReduce(getIdentity(), Src); default: llvm_unreachable("Unhandled opcode"); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6babfd1eee9108..fa05b8dd22426f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7147,7 +7147,12 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, if (!OrigLoop->contains(CondI) || !CostCtx.SkipCostComputation.insert(CondI).second) continue; - Cost += CostCtx.getLegacyCost(CondI, VF); + InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF); + LLVM_DEBUG({ + dbgs() << "Cost of " << CondICost << " for VF " << VF + << ": exit condition instruction " << *CondI << "\n"; + }); + Cost += CondICost; for (Value *Op : CondI->operands()) { auto *OpI = dyn_cast(Op); if (!OpI || any_of(OpI->users(), 
[&ExitInstrs, this](User *U) { @@ -7250,10 +7255,9 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, /// not have corresponding recipes in \p Plan and are not marked to be ignored /// in \p CostCtx. This means the VPlan contains simplification that the legacy /// cost-model did not account for. -static bool -planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, - VPCostContext &CostCtx, Loop *TheLoop, - LoopVectorizationCostModel &CM) { +static bool planContainsAdditionalSimplifications(VPlan &Plan, + VPCostContext &CostCtx, + Loop *TheLoop) { // First collect all instructions for the recipes in Plan. auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * { if (auto *S = dyn_cast(R)) @@ -7284,16 +7288,13 @@ planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF, // Return true if the loop contains any instructions that are not also part of // the VPlan or are skipped for VPlan-based cost computations. This indicates // that the VPlan contains extra simplifications. - return any_of( - TheLoop->blocks(), [&SeenInstrs, VF, &CostCtx, &CM](BasicBlock *BB) { - return any_of(*BB, [&SeenInstrs, VF, &CostCtx, &CM](Instruction &I) { - if (isa(&I)) - return false; - return !SeenInstrs.contains(&I) && - !CostCtx.skipCostComputation(&I, true) && - !CM.canTruncateToMinimalBitwidth(&I, VF); - }); - }); + return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx](BasicBlock *BB) { + return any_of(*BB, [&SeenInstrs, &CostCtx](Instruction &I) { + if (isa(&I)) + return false; + return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); + }); + }); } #endif @@ -7364,8 +7365,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { precomputeCosts(BestPlan, BestFactor.Width, CostCtx); assert((BestFactor.Width == LegacyVF.Width || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), - BestFactor.Width, CostCtx, - OrigLoop, CM)) && + CostCtx, OrigLoop)) && " VPlan cost model and legacy cost model disagreed"); assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && "when vectorizing, the scalar cost must be computed."); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index edb2567fa057b3..3d41c978281351 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -260,6 +260,20 @@ static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { VF * getNumElements(ScalarTy)); } +/// Returns the number of elements of the given type \p Ty, not less than \p Sz, +/// which forms type, which splits by \p TTI into whole vector types during +/// legalization. +static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, + Type *Ty, unsigned Sz) { + if (!isValidElementType(Ty)) + return PowerOf2Ceil(Sz); + // Find the number of elements, which forms full vectors. + const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz)); + if (NumParts == 0 || NumParts == Sz) + return PowerOf2Ceil(Sz); + return PowerOf2Ceil(divideCeil(Sz, NumParts)) * NumParts; +} + static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl &Mask) { // The ShuffleBuilder implementation use shufflevector to splat an "element". 
@@ -1224,6 +1238,22 @@ static bool doesNotNeedToSchedule(ArrayRef VL) { (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); } +/// Returns true if widened type of \p Ty elements with size \p Sz represents +/// full vector type, i.e. adding extra element results in extra parts upon type +/// legalization. +static bool hasFullVectorsOnly(const TargetTransformInfo &TTI, Type *Ty, + unsigned Sz) { + if (Sz <= 1) + return false; + if (!isValidElementType(Ty) && !isa(Ty)) + return false; + if (has_single_bit(Sz)) + return true; + const unsigned NumParts = TTI.getRegUsageForType(getWidenedType(Ty, Sz)); + return NumParts > 0 && NumParts != Sz && has_single_bit(Sz / NumParts) && + Sz % NumParts == 0; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -2467,7 +2497,9 @@ class BoUpSLP { } // TODO: Check if we can remove a check for non-power-2 number of // scalars after full support of non-power-2 vectorization. - return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size()); + return UniqueValues.size() != 2 && + hasFullVectorsOnly(*R.TTI, (*UniqueValues.begin())->getType(), + UniqueValues.size()); }; // If the initial strategy fails for any of the operand indexes, then we @@ -2605,7 +2637,7 @@ class BoUpSLP { int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, Candidates[I].second, /*U1=*/nullptr, /*U2=*/nullptr, - /*Level=*/1, std::nullopt); + /*CurrLevel=*/1, std::nullopt); if (Score > BestScore) { BestScore = Score; Index = I; @@ -2864,6 +2896,14 @@ class BoUpSLP { /// avoid issues with def-use order. Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs); + /// Returns vectorized operand node, that matches the order of the scalars + /// operand number \p NodeIdx in entry \p E. + TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx); + const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, + unsigned NodeIdx) const { + return const_cast(this)->getMatchedVectorizedOperand(E, NodeIdx); + } + /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry /// \p E. /// \param PostponedPHIs true, if need to postpone emission of phi nodes to @@ -3268,8 +3308,9 @@ class BoUpSLP { SmallVectorImpl *AltScalars = nullptr) const; /// Return true if this is a non-power-of-2 node. - bool isNonPowOf2Vec() const { - bool IsNonPowerOf2 = !has_single_bit(Scalars.size()); + bool isNonPowOf2Vec(const TargetTransformInfo &TTI) const { + bool IsNonPowerOf2 = !hasFullVectorsOnly( + TTI, getValueType(Scalars.front()), Scalars.size()); assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && "Reshuffling not supported with non-power-of-2 vectors yet."); return IsNonPowerOf2; @@ -3447,7 +3488,7 @@ class BoUpSLP { if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); - assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) && + assert((!Last->isNonPowOf2Vec(*TTI) || Last->ReorderIndices.empty()) && "Reordering isn't implemented for non-power-of-2 nodes yet"); } return Last; @@ -4353,7 +4394,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { if (!isValidElementType(ScalarTy)) return std::nullopt; auto *VecTy = getWidenedType(ScalarTy, NumScalars); - int NumParts = TTI->getNumberOfParts(VecTy); + int NumParts = TTI->getRegUsageForType(VecTy); if (NumParts == 0 || NumParts >= NumScalars) NumParts = 1; SmallVector ExtractMask; @@ -4725,7 +4766,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // Check the order of pointer operands or that all pointers are the same. 
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (!Order.empty() && !has_single_bit(VL.size())) { + if (!Order.empty() && !hasFullVectorsOnly(*TTI, ScalarTy, Sz)) { assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " "supported with VectorizeNonPowerOf2"); return LoadsState::Gather; @@ -4779,12 +4820,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( }); }); const unsigned AbsoluteDiff = std::abs(*Diff); - if (IsPossibleStrided && (IsAnyPointerUsedOutGraph || - ((Sz > MinProfitableStridedLoads || - (AbsoluteDiff <= MaxProfitableLoadStride * Sz && - has_single_bit(AbsoluteDiff))) && - AbsoluteDiff > Sz) || - *Diff == -(static_cast(Sz) - 1))) { + if (IsPossibleStrided && + (IsAnyPointerUsedOutGraph || + ((Sz > MinProfitableStridedLoads || + (AbsoluteDiff <= MaxProfitableLoadStride * Sz && + hasFullVectorsOnly(*TTI, ScalarTy, AbsoluteDiff))) && + AbsoluteDiff > Sz) || + *Diff == -(static_cast(Sz) - 1))) { int Stride = *Diff / static_cast(Sz - 1); if (*Diff == Stride * static_cast(Sz - 1)) { Align Alignment = @@ -4812,16 +4854,68 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( } } } - auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) { + // Correctly identify compare the cost of loads + shuffles rather than + // strided/masked gather loads. Returns true if vectorized + shuffles + // representation is better than just gather. + auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment, + bool ProfitableGatherPointers) { + // Compare masked gather cost and loads + insert subvector costs. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, PointerOps, PointerOps.front(), + Instruction::GetElementPtr, CostKind, ScalarTy, VecTy); + // Estimate the cost of masked gather GEP. If not a splat, roughly + // estimate as a buildvector, otherwise estimate as splat. + APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements()); + VectorType *PtrVecTy = + getWidenedType(PointerOps.front()->getType()->getScalarType(), + VecTy->getNumElements()); + if (static_cast(count_if( + PointerOps, IsaPred)) < PointerOps.size() - 1 || + any_of(PointerOps, [&](Value *V) { + return getUnderlyingObject(V) != + getUnderlyingObject(PointerOps.front()); + })) + VectorGEPCost += TTI.getScalarizationOverhead( + PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind); + else + VectorGEPCost += + TTI.getScalarizationOverhead( + PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0), + /*Insert=*/true, /*Extract=*/false, CostKind) + + ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, std::nullopt, + CostKind); + // The cost of scalar loads. + InstructionCost ScalarLoadsCost = + std::accumulate(VL.begin(), VL.end(), InstructionCost(), + [&](InstructionCost C, Value *V) { + return C + TTI.getInstructionCost( + cast(V), CostKind); + }) + + ScalarGEPCost; + // The cost of masked gather. + InstructionCost MaskedGatherCost = + TTI.getGatherScatterOpCost( + Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind) + + (ProfitableGatherPointers ? 0 : VectorGEPCost); + InstructionCost GatherCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarLoadsCost; + // The list of loads is small or perform partial check already - directly + // compare masked gather cost and gather cost. 
+ constexpr unsigned ListLimit = 4; + if (!TryRecursiveCheck || VL.size() < ListLimit) + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; unsigned Sz = DL->getTypeSizeInBits(ScalarTy); - unsigned MinVF = getMinVF(Sz); - unsigned MaxVF = std::max(bit_floor(VL.size() / 2), MinVF); - MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF); - for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) { - unsigned VectorizedCnt = 0; + unsigned MinVF = getMinVF(2 * Sz); + DemandedElts.clearAllBits(); + // Iterate through possible vectorization factors and check if vectorized + + // shuffles is better than just gather. + for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) { SmallVector States; - for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; - Cnt += VF, ++VectorizedCnt) { + for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); SmallVector Order; SmallVector PointerOps; @@ -4829,8 +4923,10 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, /*TryRecursiveCheck=*/false); // Check that the sorted loads are consecutive. - if (LS == LoadsState::Gather) - break; + if (LS == LoadsState::Gather) { + DemandedElts.setBits(Cnt, Cnt + VF); + continue; + } // If need the reorder - consider as high-cost masked gather for now. if ((LS == LoadsState::Vectorize || LS == LoadsState::StridedVectorize) && @@ -4838,79 +4934,92 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( LS = LoadsState::ScatterVectorize; States.push_back(LS); } + if (DemandedElts.isAllOnes()) + // All loads gathered - try smaller VF. + continue; // Can be vectorized later as a serie of loads/insertelements. - if (VectorizedCnt == VL.size() / VF) { - // Compare masked gather cost and loads + insersubvector costs. 
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, PointerOps.front(), - Instruction::GetElementPtr, CostKind, ScalarTy, VecTy); - InstructionCost MaskedGatherCost = - TTI.getGatherScatterOpCost(Instruction::Load, VecTy, - cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, - CostKind) + - VectorGEPCost - ScalarGEPCost; - InstructionCost VecLdCost = 0; - auto *SubVecTy = getWidenedType(ScalarTy, VF); - for (auto [I, LS] : enumerate(States)) { - auto *LI0 = cast(VL[I * VF]); - switch (LS) { - case LoadsState::Vectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); - VecLdCost += TTI.getMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getAlign(), - LI0->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo()) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::StridedVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); - VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy, - LI0->getPointerOperand(), - /*VariableMask=*/false, - CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::ScatterVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( - TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::GetElementPtr, CostKind, - ScalarTy, SubVecTy); - VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, - LI0->getPointerOperand(), - /*VariableMask=*/false, - CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::Gather: - llvm_unreachable( - "Expected only consecutive, strided or masked gather loads."); - } - SmallVector ShuffleMask(VL.size()); - for (int Idx : seq(0, VL.size())) - ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + InstructionCost VecLdCost = 0; + if (!DemandedElts.isZero()) { + VecLdCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarGEPCost; + for (unsigned Idx : seq(VL.size())) + if (DemandedElts[Idx]) + VecLdCost += + TTI.getInstructionCost(cast(VL[Idx]), CostKind); + } + auto *SubVecTy = getWidenedType(ScalarTy, VF); + for (auto [I, LS] : enumerate(States)) { + auto *LI0 = cast(VL[I * VF]); + InstructionCost VectorGEPCost = + (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) + ? 
0 + : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), + LI0->getPointerOperand(), + Instruction::GetElementPtr, CostKind, ScalarTy, + SubVecTy) + .second; + if (LS == LoadsState::ScatterVectorize) { + if (static_cast( + count_if(PointerOps, IsaPred)) < + PointerOps.size() - 1 || + any_of(PointerOps, [&](Value *V) { + return getUnderlyingObject(V) != + getUnderlyingObject(PointerOps.front()); + })) + VectorGEPCost += TTI.getScalarizationOverhead( + SubVecTy, APInt::getAllOnes(VF), + /*Insert=*/true, /*Extract=*/false, CostKind); + else + VectorGEPCost += TTI.getScalarizationOverhead( + SubVecTy, APInt::getOneBitSet(VF, 0), + /*Insert=*/true, /*Extract=*/false, CostKind) + + ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, + std::nullopt, CostKind); + } + switch (LS) { + case LoadsState::Vectorize: + VecLdCost += + TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo()) + + VectorGEPCost; + break; + case LoadsState::StridedVectorize: + VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::ScatterVectorize: + VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::Gather: + // Gathers are already calculated - ignore. + continue; + } + SmallVector ShuffleMask(VL.size()); + for (int Idx : seq(0, VL.size())) + ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + if (I > 0) VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask, CostKind, I * VF, SubVecTy); - } - // If masked gather cost is higher - better to vectorize, so - // consider it as a gather node. It will be better estimated - // later. - if (MaskedGatherCost >= VecLdCost) - return true; } + // If masked gather cost is higher - better to vectorize, so + // consider it as a gather node. It will be better estimated + // later. + if (MaskedGatherCost >= VecLdCost && + VecLdCost - GatherCost < -SLPCostThreshold) + return true; } - return false; + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; }; // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just @@ -4931,7 +5040,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) { // Check if potential masked gather can be represented as series // of loads + insertsubvectors. - if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) { + if (TryRecursiveCheck && + CheckForShuffledLoads(CommonAlignment, ProfitableGatherPointers)) { // If masked gather cost is higher - better to vectorize, so // consider it as a gather node. It will be better estimated // later. @@ -5121,7 +5231,7 @@ static bool areTwoInsertFromSameBuildVector( std::optional BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. 
- if (TE.isNonPowOf2Vec()) + if (TE.isNonPowOf2Vec(*TTI)) return std::nullopt; // No need to reorder if need to shuffle reuses, still need to shuffle the @@ -5155,8 +5265,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { } } if (Sz == 2 && TE.getVectorFactor() == 4 && - TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(), - 2 * TE.getVectorFactor())) == 1) + TTI->getRegUsageForType(getWidenedType(TE.Scalars.front()->getType(), + 2 * TE.getVectorFactor())) == 1) return std::nullopt; if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, Sz)) { @@ -5505,7 +5615,7 @@ void BoUpSLP::reorderTopToBottom() { // Reorder the graph nodes according to their vectorization factor. for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; - VF /= 2) { + VF -= 2) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) continue; @@ -5678,7 +5788,7 @@ bool BoUpSLP::canReorderOperands( ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps) { // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (UserTE->isNonPowOf2Vec()) + if (UserTE->isNonPowOf2Vec(*TTI)) return false; for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { @@ -5853,7 +5963,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); const auto AllowsReordering = [&](const TreeEntry *TE) { // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. - if (TE->isNonPowOf2Vec()) + if (TE->isNonPowOf2Vec(*TTI)) return false; if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || @@ -6499,7 +6609,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case Instruction::ExtractElement: { bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. - if (!has_single_bit(VL.size())) + if (!hasFullVectorsOnly(*TTI, VL0->getType(), VL.size())) return TreeEntry::NeedToGather; if (Reuse || !CurrentOrder.empty()) return TreeEntry::Vectorize; @@ -6909,7 +7019,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ReuseShuffleIndices.clear(); } else { // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. - if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { + if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec(*TTI)) { LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " "for nodes with padding.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); @@ -6922,7 +7032,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return isa(V) || !isConstant(V); })) || - !llvm::has_single_bit(NumUniqueScalarValues)) { + !hasFullVectorsOnly(*TTI, UniqueValues.front()->getType(), + NumUniqueScalarValues)) { if (DoNotFail && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() && all_of(UniqueValues, [=](Value *V) { @@ -6930,7 +7041,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, areAllUsersVectorized(cast(V), UserIgnoreList); })) { - unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); + // Find the number of elements, which forms full vectors. 
+ unsigned PWSz = getFullVectorNumberOfElements( + *TTI, UniqueValues.front()->getType(), UniqueValues.size()); if (PWSz == VL.size()) { ReuseShuffleIndices.clear(); } else { @@ -6964,6 +7077,55 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } } + // Check if this is a duplicate of another entry. + if (TreeEntry *E = getTreeEntry(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); + if (!E->isSame(VL)) { + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *TEIt = find_if(It->getSecond(), + [&](TreeEntry *ME) { return ME->isSame(VL); }); + if (TEIt != It->getSecond().end()) + E = *TEIt; + else + E = nullptr; + } else { + E = nullptr; + } + } + if (!E) { + if (!doesNotNeedToBeScheduled(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return; + } + SmallPtrSet Nodes; + Nodes.insert(getTreeEntry(S.OpValue)); + for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue)) + Nodes.insert(E); + SmallPtrSet Values(VL.begin(), VL.end()); + if (any_of(Nodes, [&](const TreeEntry *E) { + return all_of(E->Scalars, + [&](Value *V) { return Values.contains(V); }); + })) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return; + } + } else { + // Record the reuse of the tree node. FIXME, currently this is only used + // to properly draw the graph rather than for the actual vectorization. + E->UserTreeIndices.push_back(UserTreeIdx); + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + << ".\n"); + return; + } + } + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without // ballooning over-budget. @@ -7095,55 +7257,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // We now know that this is a vector of instructions of the same type from // the same block. - // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { - auto It = MultiNodeScalars.find(S.OpValue); - if (It != MultiNodeScalars.end()) { - auto *TEIt = find_if(It->getSecond(), - [&](TreeEntry *ME) { return ME->isSame(VL); }); - if (TEIt != It->getSecond().end()) - E = *TEIt; - else - E = nullptr; - } else { - E = nullptr; - } - } - if (!E) { - if (!doesNotNeedToBeScheduled(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices); - return; - } - SmallPtrSet Nodes; - Nodes.insert(getTreeEntry(S.OpValue)); - for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue)) - Nodes.insert(E); - SmallPtrSet Values(VL.begin(), VL.end()); - if (any_of(Nodes, [&](const TreeEntry *E) { - return all_of(E->Scalars, - [&](Value *V) { return Values.contains(V); }); - })) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices); - return; - } - } else { - // Record the reuse of the tree node. 
FIXME, currently this is only used - // to properly draw the graph rather than for the actual vectorization. - E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue - << ".\n"); - return; - } - } - // Check that none of the instructions in the bundle are already in the tree. for (Value *V : VL) { if ((!IsScatterVectorizeUserTE && !isa(V)) || @@ -9141,7 +9254,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } assert(!CommonMask.empty() && "Expected non-empty common mask."); auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); - unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + unsigned NumParts = TTI.getRegUsageForType(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); @@ -9158,7 +9271,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } assert(!CommonMask.empty() && "Expected non-empty common mask."); auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size()); - unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + unsigned NumParts = TTI.getRegUsageForType(MaskVecTy); if (NumParts == 0 || NumParts >= Mask.size()) NumParts = 1; unsigned SliceSize = getPartNumElems(Mask.size(), NumParts); @@ -9362,22 +9475,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, unsigned Idx) const { - Value *Op = E->getOperand(Idx).front(); - if (const TreeEntry *TE = getTreeEntry(Op)) { - if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.EdgeIdx == Idx && EI.UserTE == E; - }) != TE->UserTreeIndices.end()) - return TE; - auto MIt = MultiNodeScalars.find(Op); - if (MIt != MultiNodeScalars.end()) { - for (const TreeEntry *TE : MIt->second) { - if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.EdgeIdx == Idx && EI.UserTE == E; - }) != TE->UserTreeIndices.end()) - return TE; - } - } - } + if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx)) + return VE; const auto *It = find_if(VectorizableTree, [&](const std::unique_ptr &TE) { return TE->isGather() && @@ -9678,7 +9777,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); - unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + unsigned NumOfParts = TTI->getRegUsageForType(SrcVecTy); SmallVector InsertMask(NumElts, PoisonMaskElem); unsigned OffsetBeg = *getElementIndex(VL.front()); @@ -10894,7 +10993,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { // Keep original scalar if number of externally used instructions in // the same entry is not power of 2. It may help to do some extra // vectorization for now. - KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount); + KeepScalar = + ScalarUsesCount <= 1 || + !hasFullVectorsOnly(*TTI, EU.Scalar->getType(), ScalarUsesCount); } if (KeepScalar) { ExternalUsesAsOriginalScalar.insert(EU.Scalar); @@ -11587,13 +11688,14 @@ BoUpSLP::isGatherShuffledEntry( if (TE == VectorizableTree.front().get()) return {}; // FIXME: Gathering for non-power-of-2 nodes not implemented yet. 
- if (TE->isNonPowOf2Vec()) + if (TE->isNonPowOf2Vec(*TTI)) return {}; Mask.assign(VL.size(), PoisonMaskElem); assert(TE->UserTreeIndices.size() == 1 && "Expected only single user of the gather node."); - assert(VL.size() % NumParts == 0 && - "Number of scalars must be divisible by NumParts."); + // Number of scalars must be divisible by NumParts. + if (VL.size() % NumParts != 0) + return {}; unsigned SliceSize = getPartNumElems(VL.size(), NumParts); SmallVector> Res; for (unsigned Part : seq(NumParts)) { @@ -12521,10 +12623,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } }; -Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, - bool PostponedPHIs) { - ValueList &VL = E->getOperand(NodeIdx); - const unsigned VF = VL.size(); +BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, + unsigned NodeIdx) { + ArrayRef VL = E->getOperand(NodeIdx); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { @@ -12532,109 +12633,113 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, if (It != VL.end()) S = getSameOpcode(*It, *TLI); } - if (S.getOpcode()) { - auto CheckSameVE = [&](const TreeEntry *VE) { - return VE->isSame(VL) && - (any_of(VE->UserTreeIndices, - [E, NodeIdx](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) || - any_of(VectorizableTree, - [E, NodeIdx, VE](const std::unique_ptr &TE) { - return TE->isOperandGatherNode({E, NodeIdx}) && - VE->isSame(TE->Scalars); - })); + if (!S.getOpcode()) + return nullptr; + auto CheckSameVE = [&](const TreeEntry *VE) { + return VE->isSame(VL) && + (any_of(VE->UserTreeIndices, + [E, NodeIdx](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) || + any_of(VectorizableTree, + [E, NodeIdx, VE](const std::unique_ptr &TE) { + return TE->isOperandGatherNode( + {const_cast(E), NodeIdx}) && + VE->isSame(TE->Scalars); + })); + }; + TreeEntry *VE = getTreeEntry(S.OpValue); + if (VE && CheckSameVE(VE)) + return VE; + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { + return TE != VE && CheckSameVE(TE); + }); + if (I != It->getSecond().end()) + return *I; + } + return nullptr; +} + +Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, + bool PostponedPHIs) { + ValueList &VL = E->getOperand(NodeIdx); + const unsigned VF = VL.size(); + if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) { + auto FinalShuffle = [&](Value *V, ArrayRef Mask) { + // V may be affected by MinBWs. + // We want ShuffleInstructionBuilder to correctly support REVEC. The key + // factor is the number of elements, not their type. + Type *ScalarTy = cast(V->getType())->getElementType(); + unsigned NumElements = getNumElements(VL.front()->getType()); + ShuffleInstructionBuilder ShuffleBuilder( + NumElements != 1 ? 
FixedVectorType::get(ScalarTy, NumElements) + : ScalarTy, + Builder, *this); + ShuffleBuilder.add(V, Mask); + SmallVector> SubVectors( + E->CombinedEntriesWithIndices.size()); + transform(E->CombinedEntriesWithIndices, SubVectors.begin(), + [&](const auto &P) { + return std::make_pair(VectorizableTree[P.first].get(), + P.second); + }); + return ShuffleBuilder.finalize(std::nullopt, SubVectors); }; - TreeEntry *VE = getTreeEntry(S.OpValue); - bool IsSameVE = VE && CheckSameVE(VE); - if (!IsSameVE) { - auto It = MultiNodeScalars.find(S.OpValue); - if (It != MultiNodeScalars.end()) { - auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) { - return TE != VE && CheckSameVE(TE); - }); - if (I != It->getSecond().end()) { - VE = *I; - IsSameVE = true; - } - } - } - if (IsSameVE) { - auto FinalShuffle = [&](Value *V, ArrayRef Mask) { - // V may be affected by MinBWs. - // We want ShuffleInstructionBuilder to correctly support REVEC. The key - // factor is the number of elements, not their type. - Type *ScalarTy = cast(V->getType())->getElementType(); - unsigned NumElements = getNumElements(VL.front()->getType()); - ShuffleInstructionBuilder ShuffleBuilder( - NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements) - : ScalarTy, - Builder, *this); - ShuffleBuilder.add(V, Mask); - SmallVector> SubVectors( - E->CombinedEntriesWithIndices.size()); - transform(E->CombinedEntriesWithIndices, SubVectors.begin(), - [&](const auto &P) { - return std::make_pair(VectorizableTree[P.first].get(), - P.second); - }); - return ShuffleBuilder.finalize(std::nullopt, SubVectors); - }; - Value *V = vectorizeTree(VE, PostponedPHIs); - if (VF * getNumElements(VL[0]->getType()) != - cast(V->getType())->getNumElements()) { - if (!VE->ReuseShuffleIndices.empty()) { - // Reshuffle to get only unique values. - // If some of the scalars are duplicated in the vectorization - // tree entry, we do not vectorize them but instead generate a - // mask for the reuses. But if there are several users of the - // same entry, they may have different vectorization factors. - // This is especially important for PHI nodes. In this case, we - // need to adapt the resulting instruction for the user - // vectorization factor and have to reshuffle it again to take - // only unique elements of the vector. Without this code the - // function incorrectly returns reduced vector instruction with - // the same elements, not with the unique ones. - - // block: - // %phi = phi <2 x > { .., %entry} {%shuffle, %block} - // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> - // ... (use %2) - // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} - // br %block - SmallVector Mask(VF, PoisonMaskElem); - for (auto [I, V] : enumerate(VL)) { - if (isa(V)) - continue; - Mask[I] = VE->findLaneForValue(V); - } - V = FinalShuffle(V, Mask); - } else { - assert(VF < cast(V->getType())->getNumElements() && - "Expected vectorization factor less " - "than original vector size."); - SmallVector UniformMask(VF, 0); - std::iota(UniformMask.begin(), UniformMask.end(), 0); - V = FinalShuffle(V, UniformMask); + Value *V = vectorizeTree(VE, PostponedPHIs); + if (VF * getNumElements(VL[0]->getType()) != + cast(V->getType())->getNumElements()) { + if (!VE->ReuseShuffleIndices.empty()) { + // Reshuffle to get only unique values. + // If some of the scalars are duplicated in the vectorization + // tree entry, we do not vectorize them but instead generate a + // mask for the reuses. 
But if there are several users of the + // same entry, they may have different vectorization factors. + // This is especially important for PHI nodes. In this case, we + // need to adapt the resulting instruction for the user + // vectorization factor and have to reshuffle it again to take + // only unique elements of the vector. Without this code the + // function incorrectly returns reduced vector instruction with + // the same elements, not with the unique ones. + + // block: + // %phi = phi <2 x > { .., %entry} {%shuffle, %block} + // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> + // ... (use %2) + // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} + // br %block + SmallVector Mask(VF, PoisonMaskElem); + for (auto [I, V] : enumerate(VL)) { + if (isa(V)) + continue; + Mask[I] = VE->findLaneForValue(V); } - } - // Need to update the operand gather node, if actually the operand is not a - // vectorized node, but the buildvector/gather node, which matches one of - // the vectorized nodes. - if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.UserTE == E && EI.EdgeIdx == NodeIdx; - }) == VE->UserTreeIndices.end()) { - auto *It = find_if( - VectorizableTree, [&](const std::unique_ptr &TE) { - return TE->isGather() && - TE->UserTreeIndices.front().UserTE == E && - TE->UserTreeIndices.front().EdgeIdx == NodeIdx; - }); - assert(It != VectorizableTree.end() && "Expected gather node operand."); - (*It)->VectorizedValue = V; - } - return V; + V = FinalShuffle(V, Mask); + } else { + assert(VF < cast(V->getType())->getNumElements() && + "Expected vectorization factor less " + "than original vector size."); + SmallVector UniformMask(VF, 0); + std::iota(UniformMask.begin(), UniformMask.end(), 0); + V = FinalShuffle(V, UniformMask); + } + } + // Need to update the operand gather node, if actually the operand is not a + // vectorized node, but the buildvector/gather node, which matches one of + // the vectorized nodes. + if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.UserTE == E && EI.EdgeIdx == NodeIdx; + }) == VE->UserTreeIndices.end()) { + auto *It = + find_if(VectorizableTree, [&](const std::unique_ptr &TE) { + return TE->isGather() && TE->UserTreeIndices.front().UserTE == E && + TE->UserTreeIndices.front().EdgeIdx == NodeIdx; + }); + assert(It != VectorizableTree.end() && "Expected gather node operand."); + (*It)->VectorizedValue = V; } + return V; } // Find the corresponding gather entry and vectorize it. 
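
The reuse-shuffle handling above boils down to one lane lookup per requested scalar. A self-contained sketch of that mask construction, with toy integers standing in for llvm::Value pointers, kPoison for PoisonMaskElem, and a linear search for TreeEntry::findLaneForValue (all names here are illustrative):

    #include <cstdio>
    #include <vector>

    constexpr int kPoison = -1;

    // For each scalar the user wants, find which lane of the already
    // vectorized entry holds it; missing scalars keep a poison element.
    static std::vector<int> buildReuseMask(const std::vector<int> &EntryScalars,
                                           const std::vector<int> &WantedScalars) {
      std::vector<int> Mask(WantedScalars.size(), kPoison);
      for (size_t I = 0; I < WantedScalars.size(); ++I)
        for (size_t Lane = 0; Lane < EntryScalars.size(); ++Lane)
          if (EntryScalars[Lane] == WantedScalars[I]) {
            Mask[I] = static_cast<int>(Lane);
            break;
          }
      return Mask;
    }

    int main() {
      // Entry vectorized as {a=10, b=20}; the PHI user needs <b, b, a, a>.
      for (int M : buildReuseMask({10, 20}, {20, 20, 10, 10}))
        std::printf("%d ", M); // prints: 1 1 0 0
      std::printf("\n");
      return 0;
    }

Run on the comment's own example, this reproduces the <1, 1, 0, 0> shuffle mask shown in the %phi / %shuffle snippet above.
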
@@ -12729,7 +12834,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, SmallVector> Entries; Type *OrigScalarTy = GatheredScalars.front()->getType(); auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size()); - unsigned NumParts = TTI->getNumberOfParts(VecTy); + unsigned NumParts = TTI->getRegUsageForType(VecTy); if (NumParts == 0 || NumParts >= GatheredScalars.size()) NumParts = 1; if (!all_of(GatheredScalars, IsaPred)) { @@ -13137,7 +13242,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } bool IsReverseOrder = isReverseOrder(E->ReorderIndices); - auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) { + auto FinalShuffle = [&](Value *V, const TreeEntry *E) { ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); if (E->getOpcode() == Instruction::Store && E->State == TreeEntry::Vectorize) { @@ -13197,7 +13302,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { PH->getParent()->getFirstInsertionPt()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; if (PostponedPHIs) @@ -13249,7 +13354,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (const TreeEntry *TE = getTreeEntry(V)) V = TE->VectorizedValue; setInsertPointAfterBundle(E); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; return V; } @@ -13259,7 +13364,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *Ptr = LI->getPointerOperand(); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - NewV = FinalShuffle(NewV, E, VecTy); + NewV = FinalShuffle(NewV, E); E->VectorizedValue = NewV; return NewV; } @@ -13474,7 +13579,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) ? InVec : Builder.CreateCast(VecOpcode, InVec, VecTy); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13518,7 +13623,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { propagateIRFlags(V, E->Scalars, VL0); // Do not cast for cmps. VecTy = cast(V->getType()); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13571,7 +13676,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { assert(getNumElements(Cond->getType()) == TrueNumElements && "Cannot vectorize Instruction::Select"); Value *V = Builder.CreateSelect(Cond, True, False); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13593,7 +13698,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13611,7 +13716,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } Value *V = Builder.CreateFreeze(Op); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13655,7 +13760,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { auto *CI = dyn_cast(Op); return CI && CI->getValue().countr_one() >= It->second.first; })) { - V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy); + V = FinalShuffle(I == 0 ? 
RHS : LHS, E); E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -13688,7 +13793,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { I->setHasNoUnsignedWrap(/*b=*/false); } - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13780,7 +13885,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } Value *V = propagateMetadata(NewLI, E->Scalars); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -13794,7 +13899,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { if (VecValue->getType() != VecTy) VecValue = Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0)); - VecValue = FinalShuffle(VecValue, E, VecTy); + VecValue = FinalShuffle(VecValue, E); Value *Ptr = SI->getPointerOperand(); Instruction *ST; @@ -13859,7 +13964,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { V = propagateMetadata(I, GEPs); } - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -13941,7 +14046,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); propagateIRFlags(V, E->Scalars, VL0); - V = FinalShuffle(V, E, VecTy); + V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; @@ -14039,6 +14144,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { "Expected same type as operand."); if (auto *I = dyn_cast(LHS)) LHS = propagateMetadata(I, E->Scalars); + LHS = FinalShuffle(LHS, E); E->VectorizedValue = LHS; ++NumVectorInstructions; return LHS; @@ -15974,7 +16080,7 @@ void BoUpSLP::computeMinimumValueSizes() { [&](Value *V) { return AnalyzedMinBWVals.contains(V); })) return 0u; - unsigned NumParts = TTI->getNumberOfParts( + unsigned NumParts = TTI->getRegUsageForType( getWidenedType(TreeRootIT, VF * ScalarTyNumElements)); // The maximum bit width required to represent all the values that can be @@ -16031,7 +16137,7 @@ void BoUpSLP::computeMinimumValueSizes() { // use - ignore it. if (NumParts > 1 && NumParts == - TTI->getNumberOfParts(getWidenedType( + TTI->getRegUsageForType(getWidenedType( IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF))) return 0u; @@ -16892,7 +16998,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, for (unsigned I = NextInst; I < MaxInst; ++I) { unsigned ActualVF = std::min(MaxInst - I, VF); - if (!has_single_bit(ActualVF)) + if (!hasFullVectorsOnly(*TTI, ScalarTy, ActualVF)) continue; if (MaxVFOnly && ActualVF < MaxVF) @@ -19148,7 +19254,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Undefs come last. 
assert(U1 && U2 && "The only thing left should be undef & undef."); - continue; } return false; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f84317ba51257a..c9cee652d2d326 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1802,18 +1802,18 @@ void VPReductionRecipe::execute(VPTransformState &State) { (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, NewVecOp); PrevInChain = NewRed; + NextInChain = NewRed; } else { PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) + NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), + NewRed, PrevInChain); + else + NextInChain = State.Builder.CreateBinOp( + (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, + PrevInChain); } - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), - NewRed, PrevInChain); - } else if (IsOrdered) - NextInChain = NewRed; - else - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); State.set(this, NextInChain, Part, /*IsScalar*/ true); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ee7c7cea0b7670..9796ee64f6ef90 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -878,6 +878,17 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // value with the others blended into it. unsigned StartIndex = 0; + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + // If a value's mask is used only by the blend then is can be deadcoded. + // TODO: Find the most expensive mask that can be deadcoded, or a mask + // that's used by multiple blends where it can be removed from them all. 
+      VPValue *Mask = Blend->getMask(I);
+      if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
+        StartIndex = I;
+        break;
+      }
+    }
+
     SmallVector<VPValue *, 4> OperandsWithMask;
     OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
@@ -956,6 +967,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
                        m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
       X == X1 && Y == Y1) {
     R.getVPSingleValue()->replaceAllUsesWith(X);
+    R.eraseFromParent();
     return;
   }
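[Editor's note: the VPlanTransforms.cpp hunk above changes which incoming value the blend is rewritten around: instead of always folding toward index 0, it prefers the first incoming value whose mask has no user other than the blend (and is not a known-false constant), so that dropping the blend's use leaves that mask computation dead. A minimal standalone sketch of the selection rule follows; the types and names here are hypothetical stand-ins, not the VPlan API, which works on VPValue users and m_False() matches instead.]

#include <cstddef>
#include <vector>

// Hypothetical stand-in for one of a blend's incoming (value, mask) pairs.
struct Incoming {
  int MaskUsers;    // how many recipes use this incoming value's mask
  bool MaskIsFalse; // whether the mask is a known-false constant
};

// Mirrors the patch's heuristic: default to index 0, but prefer the first
// incoming value whose mask is used only by this blend, since removing the
// blend's use of that mask makes it dead code.
static size_t pickBlendStartIndex(const std::vector<Incoming> &Incomings) {
  for (size_t I = 0; I != Incomings.size(); ++I)
    if (Incomings[I].MaskUsers == 1 && !Incomings[I].MaskIsFalse)
      return I;
  return 0;
}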
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
new file mode 100644
index 00000000000000..e20d24c27eb8b4
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
@@ -0,0 +1,1469 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefixes=RV32ZVFH
+; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfhmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefixes=RV32ZVFHMIN
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefixes=RV64ZVFH
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefixes=RV64ZVFHMIN
+
+define void @fptosi() {
+; RV32ZVFH-LABEL: 'fptosi'
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV32ZVFHMIN-LABEL: 'fptosi'
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFH-LABEL: 'fptosi'
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFHMIN-LABEL: 'fptosi'
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
+  %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
+  %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
+  %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
+  %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
+  %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
+  %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
+  %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
+  %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
+  %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
+  %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
+  %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
+  %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
+  %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
+  %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
+  %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
+  %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
+  %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
+  %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
+  %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
+  %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
+  %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
+  %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
+  %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
+  %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
+  %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
+  %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
+  %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
+  %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
+  %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
+  %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
+  %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
+  %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
+  %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
+  %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
+  %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
+  %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
+  %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
+  %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
+  %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
+  %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
+  %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
+  %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
+  %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
+  %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
+  %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
+  %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
+  %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
+  %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
+  %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
+  %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
+  %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
+  %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
+  %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
+  %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
+  %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
+  %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
+  %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
+  %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
+  %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
+  %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
+  %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
+  %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
+  %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
+  %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
+  %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
+  %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
+  %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
+  %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+  %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
+  ret void
+}
+
+define void @fptoui() {
+; RV32ZVFH-LABEL: 'fptoui'
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV32ZVFHMIN-LABEL: 'fptoui'
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFH-LABEL: 'fptoui'
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFHMIN-LABEL: 'fptoui' +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui undef to +; 
RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8> + %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16> + %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32> + %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64> + %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1> + %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8> + %v4f16_v4i16 = fptoui <4 x half> undef to <4 x 
+  %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
+  %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
+  %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
+  %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
+  %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
+  %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
+  %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
+  %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
+  %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
+  %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
+  %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
+  %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
+  %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
+  %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
+  %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
+  %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
+  %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
+  %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
+  %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
+  %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
+  %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
+  %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
+  %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
+  %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
+  %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
+  %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
+  %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
+  %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
+  %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
+  %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
+  %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
+  %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
+  %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
+  %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
+  %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
+  %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
+  %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
+  %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
+  %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
+  %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
+  %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
+  %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
+  %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
+  %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
+  %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
+  %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
+  %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
+  %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
+  %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
+  %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
+  %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
+  %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
+  %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
+  %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
+  %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
+  %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
+  %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
+  %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
+  %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
+  %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
+  %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
+  %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+  %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
+  ret void
+}
+
+define void @sitofp() {
+; RV32ZVFH-LABEL: 'sitofp'
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV32ZVFHMIN-LABEL: 'sitofp'
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFH-LABEL: 'sitofp'
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV64ZVFHMIN-LABEL: 'sitofp'
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
+  %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
+  %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
+  %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
+  %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
+  %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
+  %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
+  %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
+  %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
+  %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
+  %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
+  %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
+  %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
+  %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
+  %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
+  %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
+  %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
+  %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
+  %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
+  %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
+  %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
+  %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
+  %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
+  %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
+  %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
+  %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
+  %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
+  %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
+  %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
+  %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
+  %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
+  %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
+  %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
+  %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
+  %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
+  %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+  %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+  %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+  %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+  %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+  %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+  %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+  %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+  %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+  %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+  %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+  %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+  %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+  %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+  %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+  %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+  %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+  %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+  %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+  %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+  %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+  %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+  %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+  %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+  %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+  %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+  %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+  %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+  %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+  %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+  %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+  %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+  %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+  %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+  %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+  ret void
+}
+
+define void @uitofp() {
+; RV32ZVFH-LABEL: 'uitofp'
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; RV32ZVFHMIN-LABEL: 'uitofp'
+; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef
to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 
x i32> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %nxv16i32_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFH-LABEL: 'uitofp' +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = 
uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to +; RV64ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64ZVFHMIN-LABEL: 'uitofp' +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to 
<2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = 
uitofp <128 x i8> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp 
undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> + %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> + %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> + %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> + %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> + %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> + %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> + %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> + %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> + %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> + %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> + %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> + %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> + %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> + %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> + %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> + %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> + %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> + %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> + %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> + %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> + %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> + %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> + %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> + %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> + %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> + %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> + %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> + %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> + %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> + %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> + %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> + %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> + %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> + %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> + %nxv1i8_nxv1f16 = uitofp undef to + %nxv1i16_nxv1f16 = uitofp undef to + %nxv1i32_nxv1f16 = uitofp undef to + %nxv1i64_nxv1f16 = uitofp undef to + %nxv1i1_nxv1f16 = uitofp undef to + %nxv2i8_nxv2f16 = uitofp undef to + %nxv2i16_nxv2f16 = uitofp undef to + %nxv2i32_nxv2f16 = uitofp undef to + %nxv2i64_nxv2f16 = uitofp undef to + %nxv2i1_nxv2f16 = uitofp undef to + %nxv4i8_nxv4f16 = uitofp undef to + %nxv4i16_nxv4f16 = uitofp undef to + 
%nxv4i32_nxv4f16 = uitofp undef to + %nxv4i64_nxv4f16 = uitofp undef to + %nxv4i1_nxv4f16 = uitofp undef to + %nxv8i8_nxv8f16 = uitofp undef to + %nxv8i16_nxv8f16 = uitofp undef to + %nxv8i32_nxv8f16 = uitofp undef to + %nxv8i64_nxv8f16 = uitofp undef to + %nxv8i1_nxv8f16 = uitofp undef to + %nxv16i8_nxv16f16 = uitofp undef to + %nxv16i16_nxv16f16 = uitofp undef to + %nxv16i32_nxv16f16 = uitofp undef to + %nxv16i64_nxv16f16 = uitofp undef to + %nxv16i1_nxv16f16 = uitofp undef to + %nxv32i8_nxv32f16 = uitofp undef to + %nxv32i16_nxv32f16 = uitofp undef to + %nxv32i32_nxv32f16 = uitofp undef to + %nxv32i64_nxv32f16 = uitofp undef to + %nxv32i1_nxv32f16 = uitofp undef to + %nxv64i8_nxv64f16 = uitofp undef to + %nxv64i16_nxv64f16 = uitofp undef to + %nxv64i32_nxv64f16 = uitofp undef to + %nxv64i64_nxv64f16 = uitofp undef to + %nxv64i1_nxv64f16 = uitofp undef to + ret void +} diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index e90fab9fbc8c46..ccc9101e7b0cdd 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -1718,652 +1718,442 @@ define void @fptrunc() { define void @fptosi() { ; RV32-LABEL: 'fptosi' -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> 
undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> -; RV32-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64> ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: 
%v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %nxv4f16_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi undef to 
; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_nxv32i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = 
fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64f32_nxv64i64 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptosi undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'fptosi' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x 
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %v2f16_v2i8 = fptosi <2 x half> undef to <2 x i8>
   %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
   %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
-  %v2f16_v2i16 = fptosi <2 x half> undef to <2 x i16>
   %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
   %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
-  %v2f16_v2i32 = fptosi <2 x half> undef to <2 x i32>
   %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
   %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
-  %v2f16_v2i64 = fptosi <2 x half> undef to <2 x i64>
   %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
   %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
-  %v2f16_v2i1 = fptosi <2 x half> undef to <2 x i1>
   %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
   %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
-  %v4f16_v4i8 = fptosi <4 x half> undef to <4 x i8>
   %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
   %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
-  %v4f16_v4i16 = fptosi <4 x half> undef to <4 x i16>
   %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
   %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
-  %v4f16_v4i32 = fptosi <4 x half> undef to <4 x i32>
   %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
   %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
-  %v4f16_v4i64 = fptosi <4 x half> undef to <4 x i64>
   %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
   %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
-  %v4f16_v4i1 = fptosi <4 x half> undef to <4 x i1>
   %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
   %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
-  %v8f16_v8i8 = fptosi <8 x half> undef to <8 x i8>
   %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
   %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
-  %v8f16_v8i16 = fptosi <8 x half> undef to <8 x i16>
   %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
   %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
-  %v8f16_v8i32 = fptosi <8 x half> undef to <8 x i32>
   %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
   %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
-  %v8f16_v8i64 = fptosi <8 x half> undef to <8 x i64>
   %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
   %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
-  %v8f16_v8i1 = fptosi <8 x half> undef to <8 x i1>
   %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
   %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
-  %v16f16_v16i8 = fptosi <16 x half> undef to <16 x i8>
   %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
   %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
-  %v16f16_v16i16 = fptosi <16 x half> undef to <16 x i16>
   %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
   %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
-  %v16f16_v16i32 = fptosi <16 x half> undef to <16 x i32>
   %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
   %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
-  %v16f16_v16i64 = fptosi <16 x half> undef to <16 x i64>
   %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
   %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
-  %v16f16_v16i1 = fptosi <16 x half> undef to <16 x i1>
   %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
   %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
-  %v32f16_v32i8 = fptosi <32 x half> undef to <32 x i8>
   %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
   %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
-  %v32f16_v32i16 = fptosi <32 x half> undef to <32 x i16>
   %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
   %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
-  %v32f16_v32i32 = fptosi <32 x half> undef to <32 x i32>
   %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
   %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
-  %v32f16_v32i64 = fptosi <32 x half> undef to <32 x i64>
   %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
   %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
-  %v32f16_v32i1 = fptosi <32 x half> undef to <32 x i1>
   %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
   %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
-  %v64f16_v64i8 = fptosi <64 x half> undef to <64 x i8>
   %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
   %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
-  %v64f16_v64i16 = fptosi <64 x half> undef to <64 x i16>
   %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
   %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
-  %v64f16_v64i32 = fptosi <64 x half> undef to <64 x i32>
   %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
   %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
-  %v64f16_v64i64 = fptosi <64 x half> undef to <64 x i64>
   %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
   %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
-  %v64f16_v64i1 = fptosi <64 x half> undef to <64 x i1>
   %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
   %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
-  %v128f16_v128i8 = fptosi <128 x half> undef to <128 x i8>
   %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
   %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
-  %v128f16_v128i16 = fptosi <128 x half> undef to <128 x i16>
   %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
   %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
-  %v128f16_v128i32 = fptosi <128 x half> undef to <128 x i32>
   %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
   %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
-  %v128f16_v128i64 = fptosi <128 x half> undef to <128 x i64>
   %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
   %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
-  %v128f16_v128i1 = fptosi <128 x half> undef to <128 x i1>
   %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
   %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-  %nxv1f16_nxv1i8 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i8>
   %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
   %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-  %nxv1f16_nxv1i16 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i16>
   %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
   %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-  %nxv1f16_nxv1i32 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i32>
   %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
   %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-  %nxv1f16_nxv1i64 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i64>
   %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
   %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
-  %nxv1f16_nxv1i1 = fptosi <vscale x 1 x half> undef to <vscale x 1 x i1>
   %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
   %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-  %nxv2f16_nxv2i8 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i8>
   %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
   %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-  %nxv2f16_nxv2i16 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i16>
   %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
   %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-  %nxv2f16_nxv2i32 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i32>
   %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
   %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-  %nxv2f16_nxv2i64 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i64>
   %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
   %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
-  %nxv2f16_nxv2i1 = fptosi <vscale x 2 x half> undef to <vscale x 2 x i1>
   %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
   %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-  %nxv4f16_nxv4i8 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i8>
   %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
   %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-  %nxv4f16_nxv4i16 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i16>
   %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
   %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-  %nxv4f16_nxv4i32 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i32>
   %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
   %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-  %nxv4f16_nxv4i64 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i64>
   %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
   %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
-  %nxv4f16_nxv4i1 = fptosi <vscale x 4 x half> undef to <vscale x 4 x i1>
   %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
   %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-  %nxv8f16_nxv8i8 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i8>
   %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
   %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-  %nxv8f16_nxv8i16 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i16>
   %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
   %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-  %nxv8f16_nxv8i32 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i32>
   %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
   %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-  %nxv8f16_nxv8i64 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i64>
   %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
   %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-  %nxv8f16_nxv8i1 = fptosi <vscale x 8 x half> undef to <vscale x 8 x i1>
   %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
   %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-  %nxv16f16_nxv16i8 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i8>
   %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
   %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-  %nxv16f16_nxv16i16 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i16>
   %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
   %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-  %nxv16f16_nxv16i32 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i32>
   %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
   %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-  %nxv16f16_nxv16i64 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i64>
   %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
   %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-  %nxv16f16_nxv16i1 = fptosi <vscale x 16 x half> undef to <vscale x 16 x i1>
   %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
   %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-  %nxv32f16_nxv32i8 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i8>
   %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
   %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-  %nxv32f16_nxv32i16 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i16>
   %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
   %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-  %nxv32f16_nxv32i32 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i32>
   %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
   %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-  %nxv32f16_nxv32i64 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i64>
   %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
   %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-  %nxv32f16_nxv32i1 = fptosi <vscale x 32 x half> undef to <vscale x 32 x i1>
   %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
   %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-  %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
   %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
   %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-  %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
   %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
   %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-  %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
   %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
   %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-  %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
   %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
   %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-  %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
   %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
   %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
@@ -2372,652 +2162,442 @@ define void @fptosi() {
 define void @fptoui() {
 ; RV32-LABEL: 'fptoui'
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = fptoui <vscale x 1 x half> undef to <vscale x 1 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui <vscale x 2 x half> undef to <vscale x 2 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui <vscale x 4 x half> undef to <vscale x 4 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui <vscale x 8 x half> undef to <vscale x 8 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui <vscale x 16 x half> undef to <vscale x 16 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui <vscale x 32 x half> undef to <vscale x 32 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; RV64-LABEL: 'fptoui'
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
cost of 3 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost 
of 7 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64> ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f16_nxv1i1 = 
fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f16_nxv2i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f16_nxv4i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f64_nxv4i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f16_nxv4i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %nxv8f32_nxv8i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f64_nxv8i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f16_nxv8i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f16_nxv8i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f32_nxv8i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8f64_nxv8i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f16_nxv8i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f64_nxv8i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f32_nxv16i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f64_nxv16i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f16_nxv16i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16f32_nxv16i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f64_nxv16i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16f16_nxv16i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f16_nxv16i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16f32_nxv16i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16f64_nxv16i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32f32_nxv32i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32f16_nxv32i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %nxv32f32_nxv32i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f64_nxv32i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f64_nxv32i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32f16_nxv32i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32f32_nxv32i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32f16_nxv32i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32f32_nxv32i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32f64_nxv32i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64f16_nxv64i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64f32_nxv64i8 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f32_nxv64i16 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f64_nxv64i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64f16_nxv64i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f64_nxv64i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64f16_nxv64i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64f32_nxv64i64 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64f16_nxv64i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64f32_nxv64i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64f64_nxv64i1 = fptoui undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; - %v2f16_v2i8 = fptoui <2 x half> undef to <2 x i8> %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8> %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8> - %v2f16_v2i16 = fptoui <2 x half> undef to <2 x i16> %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16> %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16> - %v2f16_v2i32 = fptoui <2 x half> undef to <2 x i32> %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32> %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32> - %v2f16_v2i64 = fptoui <2 x half> undef to <2 x i64> %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64> %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> - %v2f16_v2i1 = fptoui <2 x half> undef to <2 x i1> %v2f32_v2i1 = fptoui 
<2 x float> undef to <2 x i1> %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> - %v4f16_v4i8 = fptoui <4 x half> undef to <4 x i8> %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> - %v4f16_v4i16 = fptoui <4 x half> undef to <4 x i16> %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16> - %v4f16_v4i32 = fptoui <4 x half> undef to <4 x i32> %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32> %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32> - %v4f16_v4i64 = fptoui <4 x half> undef to <4 x i64> %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64> %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> - %v4f16_v4i1 = fptoui <4 x half> undef to <4 x i1> %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> - %v8f16_v8i8 = fptoui <8 x half> undef to <8 x i8> %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> - %v8f16_v8i16 = fptoui <8 x half> undef to <8 x i16> %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16> - %v8f16_v8i32 = fptoui <8 x half> undef to <8 x i32> %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32> %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32> - %v8f16_v8i64 = fptoui <8 x half> undef to <8 x i64> %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64> %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> - %v8f16_v8i1 = fptoui <8 x half> undef to <8 x i1> %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> - %v16f16_v16i8 = fptoui <16 x half> undef to <16 x i8> %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> - %v16f16_v16i16 = fptoui <16 x half> undef to <16 x i16> %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16> - %v16f16_v16i32 = fptoui <16 x half> undef to <16 x i32> %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32> %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32> - %v16f16_v16i64 = fptoui <16 x half> undef to <16 x i64> %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64> %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> - %v16f16_v16i1 = fptoui <16 x half> undef to <16 x i1> %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> - %v32f16_v32i8 = fptoui <32 x half> undef to <32 x i8> %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> - %v32f16_v32i16 = fptoui <32 x half> undef to <32 x i16> %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16> - %v32f16_v32i32 = fptoui <32 x half> undef to <32 x i32> %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32> %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32> - %v32f16_v32i64 = fptoui <32 x half> undef to <32 x i64> %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64> %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> - %v32f16_v32i1 = fptoui <32 x half> undef to <32 x i1> %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> - %v64f16_v64i8 = fptoui <64 x half> undef to <64 x i8> %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> - 
%v64f16_v64i16 = fptoui <64 x half> undef to <64 x i16> %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16> - %v64f16_v64i32 = fptoui <64 x half> undef to <64 x i32> %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32> %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32> - %v64f16_v64i64 = fptoui <64 x half> undef to <64 x i64> %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64> %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> - %v64f16_v64i1 = fptoui <64 x half> undef to <64 x i1> %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> - %v128f16_v128i8 = fptoui <128 x half> undef to <128 x i8> %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> - %v128f16_v128i16 = fptoui <128 x half> undef to <128 x i16> %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16> - %v128f16_v128i32 = fptoui <128 x half> undef to <128 x i32> %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32> %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32> - %v128f16_v128i64 = fptoui <128 x half> undef to <128 x i64> %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64> %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> - %v128f16_v128i1 = fptoui <128 x half> undef to <128 x i1> %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> - %nxv1f16_nxv1i8 = fptoui undef to %nxv1f32_nxv1i8 = fptoui undef to %nxv1f64_nxv1i8 = fptoui undef to - %nxv1f16_nxv1i16 = fptoui undef to %nxv1f32_nxv1i16 = fptoui undef to %nxv1f64_nxv1i16 = fptoui undef to - %nxv1f16_nxv1i32 = fptoui undef to %nxv1f32_nxv1i32 = fptoui undef to %nxv1f64_nxv1i32 = fptoui undef to - %nxv1f16_nxv1i64 = fptoui undef to %nxv1f32_nxv1i64 = fptoui undef to %nxv1f64_nxv1i64 = fptoui undef to - %nxv1f16_nxv1i1 = fptoui undef to %nxv1f32_nxv1i1 = fptoui undef to %nxv1f64_nxv1i1 = fptoui undef to - %nxv2f16_nxv2i8 = fptoui undef to %nxv2f32_nxv2i8 = fptoui undef to %nxv2f64_nxv2i8 = fptoui undef to - %nxv2f16_nxv2i16 = fptoui undef to %nxv2f32_nxv2i16 = fptoui undef to %nxv2f64_nxv2i16 = fptoui undef to - %nxv2f16_nxv2i32 = fptoui undef to %nxv2f32_nxv2i32 = fptoui undef to %nxv2f64_nxv2i32 = fptoui undef to - %nxv2f16_nxv2i64 = fptoui undef to %nxv2f32_nxv2i64 = fptoui undef to %nxv2f64_nxv2i64 = fptoui undef to - %nxv2f16_nxv2i1 = fptoui undef to %nxv2f32_nxv2i1 = fptoui undef to %nxv2f64_nxv2i1 = fptoui undef to - %nxv4f16_nxv4i8 = fptoui undef to %nxv4f32_nxv4i8 = fptoui undef to %nxv4f64_nxv4i8 = fptoui undef to - %nxv4f16_nxv4i16 = fptoui undef to %nxv4f32_nxv4i16 = fptoui undef to %nxv4f64_nxv4i16 = fptoui undef to - %nxv4f16_nxv4i32 = fptoui undef to %nxv4f32_nxv4i32 = fptoui undef to %nxv4f64_nxv4i32 = fptoui undef to - %nxv4f16_nxv4i64 = fptoui undef to %nxv4f32_nxv4i64 = fptoui undef to %nxv4f64_nxv4i64 = fptoui undef to - %nxv4f16_nxv4i1 = fptoui undef to %nxv4f32_nxv4i1 = fptoui undef to %nxv4f64_nxv4i1 = fptoui undef to - %nxv8f16_nxv8i8 = fptoui undef to %nxv8f32_nxv8i8 = fptoui undef to %nxv8f64_nxv8i8 = fptoui undef to - %nxv8f16_nxv8i16 = fptoui undef to %nxv8f32_nxv8i16 = fptoui undef to %nxv8f64_nxv8i16 = fptoui undef to - %nxv8f16_nxv8i32 = fptoui undef to %nxv8f32_nxv8i32 = fptoui undef to %nxv8f64_nxv8i32 = fptoui undef to - %nxv8f16_nxv8i64 = fptoui undef to 
%nxv8f32_nxv8i64 = fptoui undef to %nxv8f64_nxv8i64 = fptoui undef to - %nxv8f16_nxv8i1 = fptoui undef to %nxv8f32_nxv8i1 = fptoui undef to %nxv8f64_nxv8i1 = fptoui undef to - %nxv16f16_nxv16i8 = fptoui undef to %nxv16f32_nxv16i8 = fptoui undef to %nxv16f64_nxv16i8 = fptoui undef to - %nxv16f16_nxv16i16 = fptoui undef to %nxv16f32_nxv16i16 = fptoui undef to %nxv16f64_nxv16i16 = fptoui undef to - %nxv16f16_nxv16i32 = fptoui undef to %nxv16f32_nxv16i32 = fptoui undef to %nxv16f64_nxv16i32 = fptoui undef to - %nxv16f16_nxv16i64 = fptoui undef to %nxv16f32_nxv16i64 = fptoui undef to %nxv16f64_nxv16i64 = fptoui undef to - %nxv16f16_nxv16i1 = fptoui undef to %nxv16f32_nxv16i1 = fptoui undef to %nxv16f64_nxv16i1 = fptoui undef to - %nxv32f16_nxv32i8 = fptoui undef to %nxv32f32_nxv32i8 = fptoui undef to %nxv32f64_nxv32i8 = fptoui undef to - %nxv32f16_nxv32i16 = fptoui undef to %nxv32f32_nxv32i16 = fptoui undef to %nxv32f64_nxv32i16 = fptoui undef to - %nxv32f16_nxv32i32 = fptoui undef to %nxv32f32_nxv32i32 = fptoui undef to %nxv32f64_nxv32i32 = fptoui undef to - %nxv32f16_nxv32i64 = fptoui undef to %nxv32f32_nxv32i64 = fptoui undef to %nxv32f64_nxv32i64 = fptoui undef to - %nxv32f16_nxv32i1 = fptoui undef to %nxv32f32_nxv32i1 = fptoui undef to %nxv32f64_nxv32i1 = fptoui undef to - %nxv64f16_nxv64i8 = fptoui undef to %nxv64f32_nxv64i8 = fptoui undef to %nxv64f64_nxv64i8 = fptoui undef to - %nxv64f16_nxv64i16 = fptoui undef to %nxv64f32_nxv64i16 = fptoui undef to %nxv64f64_nxv64i16 = fptoui undef to - %nxv64f16_nxv64i32 = fptoui undef to %nxv64f32_nxv64i32 = fptoui undef to %nxv64f64_nxv64i32 = fptoui undef to - %nxv64f16_nxv64i64 = fptoui undef to %nxv64f32_nxv64i64 = fptoui undef to %nxv64f64_nxv64i64 = fptoui undef to - %nxv64f16_nxv64i1 = fptoui undef to %nxv64f32_nxv64i1 = fptoui undef to %nxv64f64_nxv64i1 = fptoui undef to @@ -3026,652 +2606,442 @@ define void @fptoui() { define void @sitofp() { ; RV32-LABEL: 'sitofp' -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x 
half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = 
sitofp <8 x i16> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 
= sitofp <32 x i64> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = 
sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64i64_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = sitofp undef to ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; RV64-LABEL: 'sitofp' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x 
double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
- %v2i8_v2f16 = sitofp <2 x i8> undef to <2 x half>
  %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
  %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
- %v2i16_v2f16 = sitofp <2 x i16> undef to <2 x half>
  %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
  %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
- %v2i32_v2f16 = sitofp <2 x i32> undef to <2 x half>
  %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
  %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
- %v2i64_v2f16 = sitofp <2 x i64> undef to <2 x half>
  %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
  %v2i64_v2f64 = sitofp <2 x i16> undef to <2 x double>
- %v2i1_v2f16 = sitofp <2 x i1> undef to <2 x half>
  %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
  %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
- %v4i8_v4f16 = sitofp <4 x i8> undef to <4 x half>
  %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
  %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
- %v4i16_v4f16 = sitofp <4 x i16> undef to <4 x half>
  %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
  %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
- %v4i32_v4f16 = sitofp <4 x i32> undef to <4 x half>
  %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
  %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
- %v4i64_v4f16 = sitofp <4 x i64> undef to <4 x half>
  %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
  %v4i64_v4f64 = sitofp <4 x i16> undef to <4 x double>
- %v4i1_v4f16 = sitofp <4 x i1> undef to <4 x half>
  %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
  %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
- %v8i8_v8f16 = sitofp <8 x i8> undef to <8 x half>
  %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
  %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
- %v8i16_v8f16 = sitofp <8 x i16> undef to <8 x half>
  %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
  %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
- %v8i32_v8f16 = sitofp <8 x i32> undef to <8 x half>
  %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
  %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
- %v8i64_v8f16 = sitofp <8 x i64> undef to <8 x half>
  %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
  %v8i64_v8f64 = sitofp <8 x i16> undef to <8 x double>
- %v8i1_v8f16 = sitofp <8 x i1> undef to <8 x half>
  %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
  %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
- %v16i8_v16f16 = sitofp <16 x i8> undef to <16 x half>
  %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
  %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
- %v16i16_v16f16 = sitofp <16 x i16> undef to <16 x half>
  %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
  %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
- %v16i32_v16f16 = sitofp <16 x i32> undef to <16 x half>
  %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
  %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
- %v16i64_v16f16 = sitofp <16 x i64> undef to <16 x half>
  %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
  %v16i64_v16f64 = sitofp <16 x i16> undef to <16 x double>
- %v16i1_v16f16 = sitofp <16 x i1> undef to <16 x half>
  %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
  %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
- %v32i8_v32f16 = sitofp <32 x i8> undef to <32 x half>
  %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
  %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
- %v32i16_v32f16 = sitofp <32 x i16> undef to <32 x half>
  %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
  %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
- %v32i32_v32f16 = sitofp <32 x i32> undef to <32 x half>
  %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
  %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
- %v32i64_v32f16 = sitofp <32 x i64> undef to <32 x half>
  %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
  %v32i64_v32f64 = sitofp <32 x i16> undef to <32 x double>
- %v32i1_v32f16 = sitofp <32 x i1> undef to <32 x half>
  %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
  %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
- %v64i8_v64f16 = sitofp <64 x i8> undef to <64 x half>
  %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
  %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
- %v64i16_v64f16 = sitofp <64 x i16> undef to <64 x half>
  %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
  %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
- %v64i32_v64f16 = sitofp <64 x i32> undef to <64 x half>
  %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
  %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
- %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half>
  %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
  %v64i64_v64f64 = sitofp <64 x i16> undef to <64 x double>
- %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half>
  %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
  %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
- %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half>
  %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
  %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
- %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half>
  %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
  %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
- %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half>
  %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
  %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
- %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half>
  %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
  %v128i64_v128f64 = sitofp <128 x i16> undef to <128 x double>
- %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half>
  %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
  %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
- %nxv1i8_nxv1f16 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
  %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
  %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
- %nxv1i16_nxv1f16 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
  %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
  %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
- %nxv1i32_nxv1f16 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
  %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
  %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
- %nxv1i64_nxv1f16 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
  %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
  %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
- %nxv1i1_nxv1f16 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
  %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
  %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
- %nxv2i8_nxv2f16 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
  %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
  %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
- %nxv2i16_nxv2f16 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
  %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
  %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
- %nxv2i32_nxv2f16 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
  %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
  %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
- %nxv2i64_nxv2f16 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
  %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
  %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
- %nxv2i1_nxv2f16 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
  %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
  %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
- %nxv4i8_nxv4f16 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
  %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
  %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
- %nxv4i16_nxv4f16 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
  %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
  %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
- %nxv4i32_nxv4f16 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
  %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
  %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
- %nxv4i64_nxv4f16 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
  %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
  %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
- %nxv4i1_nxv4f16 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
  %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
  %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
- %nxv8i8_nxv8f16 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
  %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
  %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
- %nxv8i16_nxv8f16 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
  %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
  %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
- %nxv8i32_nxv8f16 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
  %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
  %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
- %nxv8i64_nxv8f16 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
  %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
  %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
- %nxv8i1_nxv8f16 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
  %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
  %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
- %nxv16i8_nxv16f16 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
  %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
  %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
- %nxv16i16_nxv16f16 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
  %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
  %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
- %nxv16i32_nxv16f16 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
  %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
  %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
- %nxv16i64_nxv16f16 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
  %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
  %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
- %nxv16i1_nxv16f16 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
  %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
  %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
- %nxv32i8_nxv32f16 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
  %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
  %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
- %nxv32i16_nxv32f16 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
  %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
  %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
- %nxv32i32_nxv32f16 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
  %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
  %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
- %nxv32i64_nxv32f16 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
  %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
  %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
- %nxv32i1_nxv32f16 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
  %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
  %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
- %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
  %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
  %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
- %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
  %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
  %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
- %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
  %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
  %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
- %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
  %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
  %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
- %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
  %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
  %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
@@ -3680,652 +3050,442 @@ define void @sitofp() {
 define void @uitofp() {
 ; RV32-LABEL: 'uitofp'
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f16 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
 ; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; RV64-LABEL: 'uitofp'
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1f16 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f16 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x half>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
 ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-;
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv4i64_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i1_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i64_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i1_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i8_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i16_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i32_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i64_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv16i64_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f16 = 
uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv16i1_nxv16f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv16i1_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %nxv32i8_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i8_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i16_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i16_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i16_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i32_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i32_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv32i64_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv32i64_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv32i1_nxv32f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv32i1_nxv32f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv32i1_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv64i8_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %nxv64i8_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %nxv64i8_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i16_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i16_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv64i32_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i32_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv64i64_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %nxv64i64_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv64i1_nxv64f16 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv64i1_nxv64f32 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv64i1_nxv64f64 = uitofp undef to ; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
ret void ; - %v2i8_v2f16 = uitofp <2 x i8> undef to <2 x half> %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> - %v2i16_v2f16 = uitofp <2 x i16> undef to <2 x half> %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> - %v2i32_v2f16 = uitofp <2 x i32> undef to <2 x half> %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> - %v2i64_v2f16 = uitofp <2 x i64> undef to <2 x half> %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> %v2i64_v2f64 = uitofp <2 x i16> undef to <2 x double> - %v2i1_v2f16 = uitofp <2 x i1> undef to <2 x half> %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> - %v4i8_v4f16 = uitofp <4 x i8> undef to <4 x half> %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> - %v4i16_v4f16 = uitofp <4 x i16> undef to <4 x half> %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> - %v4i32_v4f16 = uitofp <4 x i32> undef to <4 x half> %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> - %v4i64_v4f16 = uitofp <4 x i64> undef to <4 x half> %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> %v4i64_v4f64 = uitofp <4 x i16> undef to <4 x double> - %v4i1_v4f16 = uitofp <4 x i1> undef to <4 x half> %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> - %v8i8_v8f16 = uitofp <8 x i8> undef to <8 x half> %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> - %v8i16_v8f16 = uitofp <8 x i16> undef to <8 x half> %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> - %v8i32_v8f16 = uitofp <8 x i32> undef to <8 x half> %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> - %v8i64_v8f16 = uitofp <8 x i64> undef to <8 x half> %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> %v8i64_v8f64 = uitofp <8 x i16> undef to <8 x double> - %v8i1_v8f16 = uitofp <8 x i1> undef to <8 x half> %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> - %v16i8_v16f16 = uitofp <16 x i8> undef to <16 x half> %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> - %v16i16_v16f16 = uitofp <16 x i16> undef to <16 x half> %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double> - %v16i32_v16f16 = uitofp <16 x i32> undef to <16 x half> %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> - %v16i64_v16f16 = uitofp <16 x i64> undef to <16 x half> %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> %v16i64_v16f64 = uitofp <16 x i16> undef to <16 x double> - %v16i1_v16f16 = uitofp <16 x i1> undef to <16 x half> %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> - %v32i8_v32f16 = uitofp <32 x i8> undef to <32 x half> %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> - %v32i16_v32f16 = uitofp <32 x i16> undef to <32 x half> %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> %v32i16_v32f64 = uitofp <32 x 
i16> undef to <32 x double> - %v32i32_v32f16 = uitofp <32 x i32> undef to <32 x half> %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> - %v32i64_v32f16 = uitofp <32 x i64> undef to <32 x half> %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> %v32i64_v32f64 = uitofp <32 x i16> undef to <32 x double> - %v32i1_v32f16 = uitofp <32 x i1> undef to <32 x half> %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> - %v64i8_v64f16 = uitofp <64 x i8> undef to <64 x half> %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> - %v64i16_v64f16 = uitofp <64 x i16> undef to <64 x half> %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double> - %v64i32_v64f16 = uitofp <64 x i32> undef to <64 x half> %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> - %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> %v64i64_v64f64 = uitofp <64 x i16> undef to <64 x double> - %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> - %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> - %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double> - %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> - %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> %v128i64_v128f64 = uitofp <128 x i16> undef to <128 x double> - %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> - %nxv1i8_nxv1f16 = uitofp undef to %nxv1i8_nxv1f32 = uitofp undef to %nxv1i8_nxv1f64 = uitofp undef to - %nxv1i16_nxv1f16 = uitofp undef to %nxv1i16_nxv1f32 = uitofp undef to %nxv1i16_nxv1f64 = uitofp undef to - %nxv1i32_nxv1f16 = uitofp undef to %nxv1i32_nxv1f32 = uitofp undef to %nxv1i32_nxv1f64 = uitofp undef to - %nxv1i64_nxv1f16 = uitofp undef to %nxv1i64_nxv1f32 = uitofp undef to %nxv1i64_nxv1f64 = uitofp undef to - %nxv1i1_nxv1f16 = uitofp undef to %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to - %nxv2i8_nxv2f16 = uitofp undef to %nxv2i8_nxv2f32 = uitofp undef to %nxv2i8_nxv2f64 = uitofp undef to - %nxv2i16_nxv2f16 = uitofp undef to %nxv2i16_nxv2f32 = uitofp undef to %nxv2i16_nxv2f64 = uitofp undef to - %nxv2i32_nxv2f16 = uitofp undef to %nxv2i32_nxv2f32 = uitofp undef to %nxv2i32_nxv2f64 = uitofp undef to - %nxv2i64_nxv2f16 = uitofp undef to %nxv2i64_nxv2f32 = uitofp undef to %nxv2i64_nxv2f64 = uitofp undef to - %nxv2i1_nxv2f16 = uitofp undef to %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to - %nxv4i8_nxv4f16 = uitofp undef to %nxv4i8_nxv4f32 = uitofp undef to %nxv4i8_nxv4f64 = uitofp undef to - %nxv4i16_nxv4f16 = uitofp undef to %nxv4i16_nxv4f32 = uitofp undef to 
%nxv4i16_nxv4f64 = uitofp undef to - %nxv4i32_nxv4f16 = uitofp undef to %nxv4i32_nxv4f32 = uitofp undef to %nxv4i32_nxv4f64 = uitofp undef to - %nxv4i64_nxv4f16 = uitofp undef to %nxv4i64_nxv4f32 = uitofp undef to %nxv4i64_nxv4f64 = uitofp undef to - %nxv4i1_nxv4f16 = uitofp undef to %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to - %nxv8i8_nxv8f16 = uitofp undef to %nxv8i8_nxv8f32 = uitofp undef to %nxv8i8_nxv8f64 = uitofp undef to - %nxv8i16_nxv8f16 = uitofp undef to %nxv8i16_nxv8f32 = uitofp undef to %nxv8i16_nxv8f64 = uitofp undef to - %nxv8i32_nxv8f16 = uitofp undef to %nxv8i32_nxv8f32 = uitofp undef to %nxv8i32_nxv8f64 = uitofp undef to - %nxv8i64_nxv8f16 = uitofp undef to %nxv8i64_nxv8f32 = uitofp undef to %nxv8i64_nxv8f64 = uitofp undef to - %nxv8i1_nxv8f16 = uitofp undef to %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to - %nxv16i8_nxv16f16 = uitofp undef to %nxv16i8_nxv16f32 = uitofp undef to %nxv16i8_nxv16f64 = uitofp undef to - %nxv16i16_nxv16f16 = uitofp undef to %nxv16i16_nxv16f32 = uitofp undef to %nxv16i16_nxv16f64 = uitofp undef to - %nxv16i32_nxv16f16 = uitofp undef to %nxv16i32_nxv16f32 = uitofp undef to %nxv16i32_nxv16f64 = uitofp undef to - %nxv16i64_nxv16f16 = uitofp undef to %nxv16i64_nxv16f32 = uitofp undef to %nxv16i64_nxv16f64 = uitofp undef to - %nxv16i1_nxv16f16 = uitofp undef to %nxv16i1_nxv16f32 = uitofp undef to %nxv16i1_nxv16f64 = uitofp undef to - %nxv32i8_nxv32f16 = uitofp undef to %nxv32i8_nxv32f32 = uitofp undef to %nxv32i8_nxv32f64 = uitofp undef to - %nxv32i16_nxv32f16 = uitofp undef to %nxv32i16_nxv32f32 = uitofp undef to %nxv32i16_nxv32f64 = uitofp undef to - %nxv32i32_nxv32f16 = uitofp undef to %nxv32i32_nxv32f32 = uitofp undef to %nxv32i32_nxv32f64 = uitofp undef to - %nxv32i64_nxv32f16 = uitofp undef to %nxv32i64_nxv32f32 = uitofp undef to %nxv32i64_nxv32f64 = uitofp undef to - %nxv32i1_nxv32f16 = uitofp undef to %nxv32i1_nxv32f32 = uitofp undef to %nxv32i1_nxv32f64 = uitofp undef to - %nxv64i8_nxv64f16 = uitofp undef to %nxv64i8_nxv64f32 = uitofp undef to %nxv64i8_nxv64f64 = uitofp undef to - %nxv64i16_nxv64f16 = uitofp undef to %nxv64i16_nxv64f32 = uitofp undef to %nxv64i16_nxv64f64 = uitofp undef to - %nxv64i32_nxv64f16 = uitofp undef to %nxv64i32_nxv64f32 = uitofp undef to %nxv64i32_nxv64f64 = uitofp undef to - %nxv64i64_nxv64f16 = uitofp undef to %nxv64i64_nxv64f32 = uitofp undef to %nxv64i64_nxv64f64 = uitofp undef to - %nxv64i1_nxv64f16 = uitofp undef to %nxv64i1_nxv64f32 = uitofp undef to %nxv64i1_nxv64f64 = uitofp undef to diff --git a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll index 2dc6737a3d8a07..6c1bfb72c85967 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 
-disable-output -cost-kind=code-size -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F @@ -11,8 +11,8 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 ; -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE4,SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { @@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_eq' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = 
icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_eq' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 @@ -251,6 +274,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ne' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ne' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 @@ -463,6 +509,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 @@ -652,6 +721,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_uge' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_uge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_uge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 @@ -864,6 +979,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sgt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sgt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 @@ -1076,6 +1214,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ugt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; 
SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ugt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 @@ -1288,6 +1449,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sle' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x 
i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_sle'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
@@ -1477,6 +1661,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
}

define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_ule'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_ule'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_ule'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
@@ -1689,6 +1919,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_slt'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_slt'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
@@ -1901,6 +2154,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_ult'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_ult'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
@@ -2090,6 +2366,52 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
}

define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'scmp_int'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'scmp_int'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; AVX1-LABEL: 'scmp_int'
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2256,6 +2578,52 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
}

define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'ucmp_int'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'ucmp_int'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; AVX1-LABEL: 'ucmp_int'
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2421,3 +2789,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
ret i32 undef
}

+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE4: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
index 726a6dd782a4f7..efa903ea2819c8 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
@@ -2,8 +2,8 @@
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
@@ -12,7 +12,7 @@
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
;
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42
; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1

define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
@@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_eq'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_eq'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
@@ -274,6 +297,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_ne'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_ne'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
@@ -509,6 +555,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_sge'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_sge'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
@@ -721,6 +790,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
}

define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_uge'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_uge'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_uge'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
@@ -956,6 +1071,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_sgt'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_sgt'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
@@ -1191,6 +1329,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_ugt'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_ugt'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
@@ -1426,6 +1587,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_sle'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_sle'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
@@ -1638,6 +1822,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
}

define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_ule'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_ule'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_ule'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
@@ -1873,6 +2103,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_slt'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_slt'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
@@ -2108,6 +2361,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
+; SSE41-LABEL: 'cmp_int_ult'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
; SSE42-LABEL: 'cmp_int_ult'
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
@@ -2320,28 +2596,51 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
}

define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
-; SSE42-LABEL: 'scmp_int'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 =
call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; SSE2-LABEL: 'scmp_int' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; 
SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE4-LABEL: 'scmp_int' +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x 
i64> %argv2i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'scmp_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8) @@ -2532,28 +2831,51 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i } define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { -; SSE42-LABEL: 'ucmp_int' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) -; SSE42-NEXT: Cost Model: Found 
an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; SSE2-LABEL: 'ucmp_int' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) 
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE4-LABEL: 'ucmp_int' +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> 
@llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64) +; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; AVX1-LABEL: 'ucmp_int' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8) diff --git a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll index 0deaad5991fb2f..4fc7c68be26f78 100644 --- a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F @@ -11,8 +11,8 @@ ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 ; -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE4,SSE42 +; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42 ; RUN: opt < %s -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, 
<64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { @@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_eq' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_eq' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8 @@ -251,6 +274,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x ; SSE2-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ne' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ne' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8 @@ -463,6 +509,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8 @@ -652,6 +721,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_uge' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_uge' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, 
%argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_uge' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8 @@ -864,6 +979,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sgt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp 
sgt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sgt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8 @@ -1076,6 +1214,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ugt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ugt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8 @@ -1288,6 +1449,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_sle' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_sle' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8 @@ -1477,6 +1661,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 } define i32 @cmp_int_ule(i8 %arg8, <16 x 
i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) { +; SSE2-LABEL: 'cmp_int_ule' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; SSE41-LABEL: 'cmp_int_ule' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_ule' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8 @@ -1689,6 +1919,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_slt' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16 +; 
SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; ; SSE42-LABEL: 'cmp_int_slt' ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8 @@ -1901,6 +2154,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; +; SSE41-LABEL: 'cmp_int_ult' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x 
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ult'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
@@ -2090,6 +2366,52 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'scmp_int'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'scmp_int'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2256,6 +2578,52 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 }
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'ucmp_int'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'ucmp_int'
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'ucmp_int'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2421,3 +2789,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
   ret i32 undef
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE4: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/icmp.ll b/llvm/test/Analysis/CostModel/X86/icmp.ll
index 599895c2b5705a..d8959a67145d63 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp.ll
@@ -2,8 +2,8 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
@@ -12,7 +12,7 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+xop,+avx2 | FileCheck %s -check-prefixes=XOPAVX2
 ;
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
 
 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
@@ -3125,51 +3125,28 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE41-LABEL: 'scmp_int'
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'scmp_int'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-LABEL: 'scmp_int'
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -3429,51 +3406,28 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE41-LABEL: 'ucmp_int'
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'ucmp_int'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-LABEL: 'ucmp_int'
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'ucmp_int'
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll
index 835622276ef279..4402289ac170d9 100644
--- a/llvm/test/Bitcode/attributes.ll
+++ b/llvm/test/Bitcode/attributes.ll
@@ -511,12 +511,6 @@ define void @f92() sanitize_realtime
   ret void;
 }
 
-; CHECK: define void @f93() #54
-define void @f93() nosanitize_realtime
-{
-  ret void;
-}
-
 ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]]
 define void @f87() fn_ret_thunk_extern { ret void }
 
@@ -612,7 +606,6 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) {
 ; CHECK: attributes #51 = { uwtable(sync) }
 ; CHECK: attributes #52 = { nosanitize_bounds }
 ; CHECK: attributes #53 = { sanitize_realtime }
-; CHECK: attributes #54 = { nosanitize_realtime }
 ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern }
 ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile }
 ; CHECK: attributes [[OPTDEBUG]] = { optdebug }
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index c401cde8e146e7..fd60c49a4be39b 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1562,7 +1562,7 @@ exit:
   ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
 
   call void @f.nobuiltin() builtin
-  ; CHECK: call void @f.nobuiltin() #54
+  ; CHECK: call void @f.nobuiltin() #53
 
   call fastcc noalias ptr @f.noalias() noinline
   ; CHECK: call fastcc noalias ptr @f.noalias() #12
@@ -1992,9 +1992,6 @@ declare void @f.sanitize_numerical_stability() sanitize_numerical_stability
 declare void @f.sanitize_realtime() sanitize_realtime
 ; CHECK: declare void @f.sanitize_realtime() #52
 
-declare void @f.nosanitize_realtime() nosanitize_realtime
-; CHECK: declare void @f.nosanitize_realtime() #53
-
 ; CHECK: declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan))
 declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan))
 
@@ -2118,8 +2115,7 @@ define float @nofpclass_callsites(float %arg) {
 ; CHECK: attributes #50 = { allockind("alloc,uninitialized") }
 ; CHECK: attributes #51 = { sanitize_numerical_stability }
 ; CHECK: attributes #52 = { sanitize_realtime }
-; CHECK: attributes #53 = { nosanitize_realtime }
-; CHECK: attributes #54 = { builtin }
+; CHECK: attributes #53 = { builtin }
 
 ;; Metadata
 
diff --git a/llvm/test/CodeGen/AVR/jmp.ll b/llvm/test/CodeGen/AVR/jmp.ll
new file mode 100644
index 00000000000000..95dfff4836b4e8
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/jmp.ll
@@ -0,0 +1,25 @@
+; RUN: llc -filetype=obj -mtriple=avr < %s | llvm-objdump -dr --no-show-raw-insn - | FileCheck %s
+
+define i8 @foo(i8 %a) {
+bb0:
+  %0 = tail call i8 @bar(i8 %a)
+  %1 = icmp eq i8 %0, 123
+  br i1 %1, label %bb1, label %bb2
+
+bb1:
+  ret i8 100
+
+bb2:
+  ret i8 200
+}
+
+declare i8 @bar(i8);
+
+; CHECK: rcall .-2
+; CHECK-NEXT: 00000000: R_AVR_13_PCREL bar
+; CHECK-NEXT: cpi r24, 0x7b
+; CHECK-NEXT: brne .+4
+; CHECK-NEXT: ldi r24, 0x64
+; CHECK-NEXT: ret
+; CHECK-NEXT: ldi r24, 0xc8
+; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/BPF/atomics.ll b/llvm/test/CodeGen/BPF/atomics.ll
index 0c16c49f2a873b..c17b94af5f7bd9 100644
--- a/llvm/test/CodeGen/BPF/atomics.ll
+++ b/llvm/test/CodeGen/BPF/atomics.ll
@@ -1,10 +1,11 @@
-; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck --check-prefixes=CHECK,CHECK-V2 %s
-; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefixes=CHECK,CHECK-V3 %s
+; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck %s
+; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding -mcpu=v3 | FileCheck --check-prefix=CHECK-V3 %s
 
 ; CHECK-LABEL: test_load_add_32
-; CHECK-V2: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)
+; CHECK: lock *(u32 *)(r1 + 0) += r2
+; CHECK: encoding: [0xc3,0x21
 ; CHECK-V3: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
-; CHECK: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00]
+; CHECK-V3: encoding: [0xc3,0x21,0x00,0x00,0x01,0x00,0x00,0x00]
 define void @test_load_add_32(ptr %p, i32 zeroext %v) {
 entry:
   atomicrmw add ptr %p, i32 %v seq_cst
@@ -12,8 +13,10 @@ entry:
 }
 
 ; CHECK-LABEL: test_load_add_64
-; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
-; CHECK: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00]
+; CHECK: lock *(u64 *)(r1 + 0) += r2
+; CHECK: encoding: [0xdb,0x21
+; CHECK-V3: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK-V3: encoding: [0xdb,0x21,0x00,0x00,0x01,0x00,0x00,0x00]
 define void @test_load_add_64(ptr %p, i64 zeroext %v) {
 entry:
   atomicrmw add ptr %p, i64 %v seq_cst
diff --git a/llvm/test/CodeGen/BPF/atomics_2.ll b/llvm/test/CodeGen/BPF/atomics_2.ll
index c670ddb05b6a77..6371e3b875638e 100644
--- a/llvm/test/CodeGen/BPF/atomics_2.ll
+++ b/llvm/test/CodeGen/BPF/atomics_2.ll
@@ -224,7 +224,7 @@ entry:
 }
 
 ; CHECK-LABEL: test_atomic_xor_64
-; CHECK: r2 = atomic_fetch_xor((u64 *)(r1 + 0), r2)
+; CHECK: atomic_fetch_xor((u64 *)(r1 + 0), r2)
 ; CHECK: encoding: [0xdb,0x21,0x00,0x00,0xa1,0x00,0x00,0x00]
 ; CHECK: w0 = 0
 define dso_local i32 @test_atomic_xor_64(ptr nocapture %p, i64 %v) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/BPF/objdump_atomics.ll b/llvm/test/CodeGen/BPF/objdump_atomics.ll
index c4cb16b2c36418..fcc889ba300e39 100644
--- a/llvm/test/CodeGen/BPF/objdump_atomics.ll
+++ b/llvm/test/CodeGen/BPF/objdump_atomics.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: test_load_add_32
 ; CHECK: c3 21
-; CHECK: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
+; CHECK: lock *(u32 *)(r1 + 0) += w2
 define void @test_load_add_32(ptr %p, i32 zeroext %v) {
 entry:
   atomicrmw add ptr %p, i32 %v seq_cst
@@ -11,7 +11,7 @@ entry:
 
 ; CHECK-LABEL: test_load_add_64
 ; CHECK: db 21
-; CHECK: r2 = atomic_fetch_add((u64 *)(r1 + 0), r2)
+; CHECK: lock *(u64 *)(r1 + 0) += r2
 define void @test_load_add_64(ptr %p, i64 zeroext %v) {
 entry:
   atomicrmw add ptr %p, i64 %v seq_cst
diff --git a/llvm/test/CodeGen/BPF/xadd.ll b/llvm/test/CodeGen/BPF/xadd.ll
new file mode 100644
index 00000000000000..5aeeb9baf7b892
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/xadd.ll
@@ -0,0 +1,59 @@
+; RUN: not llc -march=bpfel < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=bpfeb < %s 2>&1 | FileCheck %s
+
+; This file is generated with the source command and source
+; $ clang -target bpf -O2 -g -S -emit-llvm t.c
+; $ cat t.c
+; int test(int *ptr) {
+;    int r;
+;    __sync_fetch_and_add(ptr, 4);
+;    r = __sync_fetch_and_add(ptr, 6);
+;    return r;
+; }
+
+; ModuleID = 't.c'
+source_filename = "t.c"
+target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
+target triple = "bpf"
+
+; Function Attrs: nounwind
+define dso_local i32 @test(ptr nocapture %ptr) local_unnamed_addr #0 !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata ptr %ptr, metadata !13, metadata !DIExpression()), !dbg !15
+  %0 = atomicrmw add ptr %ptr, i32 4 seq_cst, !dbg !16
+  %1 = atomicrmw add ptr %ptr, i32 6 seq_cst, !dbg !17
+; CHECK: in function test i32 (ptr): Invalid usage of the XADD return value
+  call void @llvm.dbg.value(metadata i32 %1, metadata !14, metadata !DIExpression()), !dbg !18
+  ret i32 %1, !dbg !19
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "t.c", directory: "/home/yhs/work/tests/llvm/sync/test1")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 342605) (llvm/trunk 342612)"}
+!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !11}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!12 = !{!13, !14}
+!13 = !DILocalVariable(name: "ptr", arg: 1, scope: !7, file: !1, line: 1, type: !11)
+!14 = !DILocalVariable(name: "r", scope: !7, file: !1, line: 2, type: !10)
+!15 = !DILocation(line: 1, column: 15, scope: !7)
+!16 = !DILocation(line: 3, column: 4, scope: !7)
+!17 = !DILocation(line: 4, column: 8, scope: !7)
+!18 = !DILocation(line: 2, column: 8, scope: !7)
+!19 = !DILocation(line: 5, column: 4, scope: !7)
diff --git a/llvm/test/CodeGen/BPF/xadd_legal.ll b/llvm/test/CodeGen/BPF/xadd_legal.ll
index 88f04d85a779f8..9b07afade3fee9 100644
--- a/llvm/test/CodeGen/BPF/xadd_legal.ll
+++ b/llvm/test/CodeGen/BPF/xadd_legal.ll
@@ -19,7 +19,7 @@ define dso_local i32 @test(ptr nocapture %ptr, i64 %a) {
 entry:
   %conv = trunc i64 %a to i32
   %0 = atomicrmw add ptr %ptr, i32 %conv seq_cst
-; CHECK-64: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)
+; CHECK-64: lock *(u32 *)(r1 + 0) += r2
 ; CHECK-32: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2)
   %1 = load i32, ptr %ptr, align 4
   ret i32 %1
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f3236..3efdd08bbea4c4 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -338,14 +338,12 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; LA64-NEXT: srli.d $a1, $a0, 1
 ; LA64-NEXT: lu12i.w $a2, 349525
 ; LA64-NEXT: ori $a2, $a2, 1365
-; LA64-NEXT: lu32i.d $a2, 349525
-; LA64-NEXT: lu52i.d $a2, $a2, 1365
+; LA64-NEXT: bstrins.d $a2, $a2, 62, 32
 ; LA64-NEXT: and $a1, $a1, $a2
 ; LA64-NEXT: sub.d $a0, $a0, $a1
 ; LA64-NEXT: lu12i.w $a1, 209715
 ; LA64-NEXT: ori $a1, $a1, 819
-; LA64-NEXT: lu32i.d $a1, 209715
-; LA64-NEXT: lu52i.d $a1, $a1, 819
+; LA64-NEXT: bstrins.d $a1, $a1, 61, 32
 ; LA64-NEXT: and $a2, $a0, $a1
 ; LA64-NEXT: srli.d $a0, $a0, 2
 ; LA64-NEXT: and $a0, $a0, $a1
@@ -354,13 +352,11 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; LA64-NEXT: add.d $a0, $a0, $a1
 ; LA64-NEXT: lu12i.w $a1, 61680
 ; LA64-NEXT: ori $a1, $a1, 3855
-; LA64-NEXT: lu32i.d $a1, -61681
-; LA64-NEXT: lu52i.d $a1, $a1, 240
+; LA64-NEXT: bstrins.d $a1, $a1, 59, 32
 ; LA64-NEXT: and $a0, $a0, $a1
 ; LA64-NEXT: lu12i.w $a1, 4112
 ; LA64-NEXT: ori $a1, $a1, 257
-; LA64-NEXT: lu32i.d $a1, 65793
-; LA64-NEXT: lu52i.d $a1, $a1, 16
+; LA64-NEXT: bstrins.d $a1, $a1, 56, 32
 ; LA64-NEXT: mul.d $a0, $a0, $a1
 ; LA64-NEXT: srli.d $a0, $a0, 56
 ; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/imm.ll b/llvm/test/CodeGen/LoongArch/imm.ll
index f84fddaec66b9f..aca508e99fb960 100644
--- a/llvm/test/CodeGen/LoongArch/imm.ll
+++ b/llvm/test/CodeGen/LoongArch/imm.ll
@@ -47,8 +47,7 @@ define i64 @imm0008000000000fff() {
 ; CHECK-LABEL: imm0008000000000fff:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: ori $a0, $zero, 4095
-; CHECK-NEXT: lu32i.d $a0, -524288
-; CHECK-NEXT: lu52i.d $a0, $a0, 0
+; CHECK-NEXT: bstrins.d $a0, $a0, 51, 51
 ; CHECK-NEXT: ret
   ret i64 2251799813689343
 }
@@ -164,3 +163,59 @@ define i64 @imm0008000080000800() {
 ; CHECK-NEXT: ret
   ret i64 2251801961170944
 }
+
+define i64 @imm14000000a() {
+; CHECK-LABEL: imm14000000a:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a0, $zero, 10
+; CHECK-NEXT: bstrins.d $a0, $a0, 32, 29
+; CHECK-NEXT: ret
+  ret i64 5368709130
+}
+
+define i64 @imm0fff000000000fff() {
+; CHECK-LABEL: imm0fff000000000fff:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a0, $zero, 4095
+; CHECK-NEXT: bstrins.d $a0, $a0, 59, 48
+; CHECK-NEXT: ret
+  ret i64 1152640029630140415
+}
+
+define i64 @immffecffffffffffec() {
+; CHECK-LABEL: immffecffffffffffec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi.w $a0, $zero, -20
+; CHECK-NEXT: bstrins.d $a0, $a0, 52, 48
+; CHECK-NEXT: ret
+  ret i64 -5348024557502484
+}
+
+define i64 @imm1c000000700000() {
+; CHECK-LABEL: imm1c000000700000:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lu12i.w $a0, 1792
+; CHECK-NEXT: bstrins.d $a0, $a0, 52, 30
+; CHECK-NEXT: ret
+  ret i64 7881299355238400
+}
+
+define i64 @immf0f0f0f0f0f0f0f0() {
+; CHECK-LABEL: immf0f0f0f0f0f0f0f0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lu12i.w $a0, -61681
+; CHECK-NEXT: ori $a0, $a0, 240
+; CHECK-NEXT: bstrins.d $a0, $a0, 59, 32
+; CHECK-NEXT: ret
+  ret i64 -1085102592571150096
+}
+
+define i64 @imm110000014000000a() {
+; CHECK-LABEL: imm110000014000000a:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a0, $zero, 10
+; CHECK-NEXT: lu52i.d $a0, $a0, 272
+; CHECK-NEXT: bstrins.d $a0, $a0, 32, 29
+; CHECK-NEXT: ret
+  ret i64 1224979104013484042
+}
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
index 772ae8d81a88bf..9654542f877459 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll
@@ -973,9 +973,8 @@ define i64 @ld_sd_constant(i64 %a) nounwind {
 ; LA64NOPIC-LABEL: ld_sd_constant:
 ; LA64NOPIC: # %bb.0:
 ; LA64NOPIC-NEXT: lu12i.w $a1, -136485
-; LA64NOPIC-NEXT: ori $a1, $a1, 3823
-; LA64NOPIC-NEXT: lu32i.d $a1, -147729
-; LA64NOPIC-NEXT: lu52i.d $a2, $a1, -534
+; LA64NOPIC-NEXT: ori $a2, $a1, 3823
+; LA64NOPIC-NEXT: bstrins.d $a2, $a2, 61, 32
 ; LA64NOPIC-NEXT: ld.d $a1, $a2, 0
 ; LA64NOPIC-NEXT: st.d $a0, $a2, 0
 ; LA64NOPIC-NEXT: move $a0, $a1
@@ -984,9 +983,8 @@ define i64 @ld_sd_constant(i64 %a) nounwind {
 ; LA64PIC-LABEL: ld_sd_constant:
 ; LA64PIC: # %bb.0:
 ; LA64PIC-NEXT: lu12i.w $a1, -136485
-; LA64PIC-NEXT: ori $a1, $a1, 3823
-; LA64PIC-NEXT: lu32i.d $a1, -147729
-; LA64PIC-NEXT: lu52i.d $a2, $a1, -534
+; LA64PIC-NEXT: ori $a2, $a1, 3823
+; LA64PIC-NEXT: bstrins.d $a2, $a2, 61, 32
 ; LA64PIC-NEXT: ld.d $a1, $a2, 0
 ; LA64PIC-NEXT: st.d $a0, $a2, 0
 ; LA64PIC-NEXT: move $a0, $a1
diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
index 1e7a79beb62c61..323858c7613a67 100644
--- a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
+++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
@@ -1128,8 +1128,7 @@ define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
 ; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
 ; LA64-NEXT: lu12i.w $a1, 279556
 ; LA64-NEXT: ori $a1, $a1, 1088
-; LA64-NEXT: lu32i.d $a1, 17472
-; LA64-NEXT: lu52i.d $a1, $a1, 1092
+; LA64-NEXT: bstrins.d $a1, $a1, 62, 32
 ; LA64-NEXT: add.d $a0, $a0, $a1
 ; LA64-NEXT: ret
 ;
@@ -1142,8 +1141,7 @@ define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
 ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
 ; LA64-LARGE-NEXT: lu12i.w $a1, 279556
 ; LA64-LARGE-NEXT: ori $a1, $a1, 1088
-; LA64-LARGE-NEXT: lu32i.d $a1, 17472
-; LA64-LARGE-NEXT: lu52i.d $a1, $a1, 1092
+; LA64-LARGE-NEXT: bstrins.d $a1, $a1, 62, 32
 ; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
 ; LA64-LARGE-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
index 2bb39395c1d1b6..7500b5ae09359a 100644
--- a/llvm/test/CodeGen/LoongArch/sextw-removal.ll
+++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
@@ -323,21 +323,17 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
 ; CHECK-NEXT: sra.w $a0, $a0, $a1
 ; CHECK-NEXT: lu12i.w $a1, 349525
-; CHECK-NEXT: ori $a1, $a1, 1365
-; CHECK-NEXT: lu32i.d $a1, 349525
-; CHECK-NEXT: lu52i.d $fp, $a1, 1365
+; CHECK-NEXT: ori $fp, $a1, 1365
+; CHECK-NEXT: bstrins.d $fp, $fp, 62, 32
 ; CHECK-NEXT: lu12i.w $a1, 209715
-; CHECK-NEXT: ori $a1, $a1, 819
-; CHECK-NEXT: lu32i.d $a1, 209715
-; CHECK-NEXT: lu52i.d $s0, $a1, 819
+; CHECK-NEXT: ori $s0, $a1, 819
+; CHECK-NEXT: bstrins.d $s0, $s0, 61, 32
 ; CHECK-NEXT: lu12i.w $a1, 61680
-; CHECK-NEXT: ori $a1, $a1, 3855
-; CHECK-NEXT: lu32i.d $a1, -61681
-; CHECK-NEXT: lu52i.d $s1, $a1, 240
+; CHECK-NEXT: ori $s1, $a1, 3855
+; CHECK-NEXT: bstrins.d $s1, $s1, 59, 32
 ; CHECK-NEXT: lu12i.w $a1, 4112
-; CHECK-NEXT: ori $a1, $a1, 257
-; CHECK-NEXT: lu32i.d $a1, 65793
-; CHECK-NEXT: lu52i.d $s2, $a1, 16
+; CHECK-NEXT: ori $s2, $a1, 257
+; CHECK-NEXT: bstrins.d $s2, $s2, 56, 32
 ; CHECK-NEXT: .p2align 4, , 16
 ; CHECK-NEXT: .LBB6_1: # %bb2
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -374,21 +370,17 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; NORMV-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
 ; NORMV-NEXT: sra.w $a0, $a0, $a1
 ; NORMV-NEXT: lu12i.w $a1, 349525
-; NORMV-NEXT: ori $a1, $a1, 1365
-; NORMV-NEXT: lu32i.d $a1, 349525
-; NORMV-NEXT: lu52i.d $fp, $a1, 1365
+; NORMV-NEXT: ori $fp, $a1, 1365
+; NORMV-NEXT: bstrins.d $fp, $fp, 62, 32
 ; NORMV-NEXT: lu12i.w $a1, 209715
-; NORMV-NEXT: ori $a1, $a1, 819
-; NORMV-NEXT: lu32i.d $a1, 209715
-; NORMV-NEXT: lu52i.d $s0, $a1, 819
+; NORMV-NEXT: ori $s0, $a1, 819
+; NORMV-NEXT: bstrins.d $s0, $s0, 61, 32
 ; NORMV-NEXT: lu12i.w $a1, 61680
-; NORMV-NEXT: ori $a1, $a1, 3855
-; NORMV-NEXT: lu32i.d $a1, -61681
-; NORMV-NEXT: lu52i.d $s1, $a1, 240
+; NORMV-NEXT: ori $s1, $a1, 3855
+; NORMV-NEXT: bstrins.d $s1, $s1, 59, 32
 ; NORMV-NEXT: lu12i.w $a1, 4112
-; NORMV-NEXT: ori $a1, $a1, 257
-; NORMV-NEXT: lu32i.d $a1, 65793
-; NORMV-NEXT: lu52i.d $s2, $a1, 16
+; NORMV-NEXT: ori $s2, $a1, 257
+; NORMV-NEXT: bstrins.d $s2, $s2, 56, 32
 ; NORMV-NEXT: .p2align 4, , 16
 ; NORMV-NEXT: .LBB6_1: # %bb2
 ; NORMV-NEXT: # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
index c88b2bf596ca23..cccb69d2e6986a 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-half.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
@@ -161,10 +161,8 @@ define i32 @caller_half_in_regs() nounwind {
 ; RV64IF: # %bb.0:
 ; RV64IF-NEXT: addi sp, sp, -16
 ; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64IF-NEXT: lui a0, 1048564
-; RV64IF-NEXT: fmv.w.x fa5, a0
-; RV64IF-NEXT: fmv.x.w a1, fa5
 ; RV64IF-NEXT: li a0, 1
+; RV64IF-NEXT: lui a1, 1048564
 ; RV64IF-NEXT: call callee_half_in_regs
 ; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64IF-NEXT: addi sp, sp, 16
@@ -511,9 +509,8 @@ define half @callee_half_ret() nounwind {
 ;
 ; RV64IF-LABEL: callee_half_ret:
 ; RV64IF: # %bb.0:
-; RV64IF-NEXT: lui a0, %hi(.LCPI4_0)
-; RV64IF-NEXT: flw fa5, %lo(.LCPI4_0)(a0)
-; RV64IF-NEXT: fmv.x.w a0, fa5
+; RV64IF-NEXT: lui a0, 1048564
+; RV64IF-NEXT: addiw a0, a0, -1024
 ; RV64IF-NEXT: ret
 ;
 ; RV32-ILP32F-LABEL: callee_half_ret:
diff --git a/llvm/test/CodeGen/RISCV/float-imm.ll b/llvm/test/CodeGen/RISCV/float-imm.ll
index c38416d994ba57..69a506cd850f2c 100644
--- a/llvm/test/CodeGen/RISCV/float-imm.ll
+++ b/llvm/test/CodeGen/RISCV/float-imm.ll
@@ -24,8 +24,8 @@ define float @float_imm() nounwind {
 ;
 ; RV64ZFINX-LABEL: float_imm:
 ; RV64ZFINX: # %bb.0:
-; RV64ZFINX-NEXT: lui a0, %hi(.LCPI0_0)
%lo(.LCPI0_0)(a0) +; RV64ZFINX-NEXT: lui a0, 263313 +; RV64ZFINX-NEXT: addiw a0, a0, -37 ; RV64ZFINX-NEXT: ret ret float 3.14159274101257324218750 } diff --git a/llvm/test/CodeGen/RISCV/half-imm.ll b/llvm/test/CodeGen/RISCV/half-imm.ll index 9c11010540e15d..4c39885176f01a 100644 --- a/llvm/test/CodeGen/RISCV/half-imm.ll +++ b/llvm/test/CodeGen/RISCV/half-imm.ll @@ -15,10 +15,10 @@ ; RUN: -target-abi lp64f < %s | FileCheck -check-prefixes=CHECKIZFHMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -target-abi ilp32 < %s \ -; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN,RV32IZHINXMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+zhinxmin -verify-machineinstrs \ ; RUN: -target-abi lp64 < %s \ -; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN %s +; RUN: | FileCheck -check-prefixes=CHECKIZHINXMIN,RV64IZHINXMIN %s ; TODO: constant pool shouldn't be necessary for RV32IZfh and RV64IZfh define half @half_imm() nounwind { @@ -30,14 +30,14 @@ define half @half_imm() nounwind { ; ; RV32IZHINX-LABEL: half_imm: ; RV32IZHINX: # %bb.0: -; RV32IZHINX-NEXT: lui a0, %hi(.LCPI0_0) -; RV32IZHINX-NEXT: lh a0, %lo(.LCPI0_0)(a0) +; RV32IZHINX-NEXT: lui a0, 4 +; RV32IZHINX-NEXT: addi a0, a0, 512 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: half_imm: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: lui a0, %hi(.LCPI0_0) -; RV64IZHINX-NEXT: lh a0, %lo(.LCPI0_0)(a0) +; RV64IZHINX-NEXT: lui a0, 4 +; RV64IZHINX-NEXT: addiw a0, a0, 512 ; RV64IZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: half_imm: @@ -46,11 +46,17 @@ define half @half_imm() nounwind { ; CHECKIZFHMIN-NEXT: flh fa0, %lo(.LCPI0_0)(a0) ; CHECKIZFHMIN-NEXT: ret ; -; CHECKIZHINXMIN-LABEL: half_imm: -; CHECKIZHINXMIN: # %bb.0: -; CHECKIZHINXMIN-NEXT: lui a0, %hi(.LCPI0_0) -; CHECKIZHINXMIN-NEXT: lh a0, %lo(.LCPI0_0)(a0) -; CHECKIZHINXMIN-NEXT: ret +; RV32IZHINXMIN-LABEL: half_imm: +; RV32IZHINXMIN: # %bb.0: +; RV32IZHINXMIN-NEXT: lui a0, 4 +; RV32IZHINXMIN-NEXT: addi a0, a0, 512 +; RV32IZHINXMIN-NEXT: ret +; +; RV64IZHINXMIN-LABEL: half_imm: +; RV64IZHINXMIN: # %bb.0: +; RV64IZHINXMIN-NEXT: lui a0, 4 +; RV64IZHINXMIN-NEXT: addiw a0, a0, 512 +; RV64IZHINXMIN-NEXT: ret ret half 3.0 } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 5d5807cbadbad5..4be680e272e5b9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -524,8 +524,7 @@ define float @vreduce_fadd_v7f32_neutralstart_fast(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vmv.s.x v10, zero ; CHECK-NEXT: vfredusum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll index ae3dce497c6d07..90a856605c70d8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll @@ -19,12 +19,10 @@ define <2 x half> @vfabs_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, 
ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -39,12 +37,10 @@ define <2 x half> @vfabs_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfabs_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fabs.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v @@ -61,12 +57,10 @@ define <4 x half> @vfabs_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -81,12 +75,10 @@ define <4 x half> @vfabs_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfabs_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fabs.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v @@ -103,12 +95,10 @@ define <8 x half> @vfabs_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -123,12 +113,10 @@ define <8 x half> @vfabs_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfabs_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; 
ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fabs.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v @@ -145,12 +133,10 @@ define <16 x half> @vfabs_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vfabs_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -165,12 +151,10 @@ define <16 x half> @vfabs_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfabs_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fabs.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll index fbc4c56a911340..019923ffdfdedf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll @@ -19,12 +19,9 @@ define <2 x half> @vfneg_vv_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v2f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -39,12 +36,9 @@ define <2 x half> @vfneg_vv_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfneg_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v @@ -61,12 +55,9 @@ define <4 x half> @vfneg_vv_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v4f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v 
v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -81,12 +72,9 @@ define <4 x half> @vfneg_vv_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfneg_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v @@ -103,12 +91,9 @@ define <8 x half> @vfneg_vv_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v8f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -123,12 +108,9 @@ define <8 x half> @vfneg_vv_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ; ZVFHMIN-LABEL: vfneg_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v @@ -145,12 +127,9 @@ define <16 x half> @vfneg_vv_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ; ZVFHMIN-LABEL: vfneg_vv_v16f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12, v0.t -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -165,12 +144,9 @@ define <16 x half> @vfneg_vv_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ; ZVFHMIN-LABEL: vfneg_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui 
a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.ll new file mode 100644 index 00000000000000..7f70b0ed224ec0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s + +define void @avl_not_dominated(<vscale x 2 x i32> %v, ptr %p) { +; CHECK-LABEL: avl_not_dominated: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: slli a1, a1, 32 +; CHECK-NEXT: srli a1, a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: ret + %w = add <vscale x 2 x i32> %v, splat (i32 1) + %evl = extractelement <vscale x 2 x i32> %v, i32 0 + call void @llvm.vp.store(<vscale x 2 x i32> %w, ptr %p, <vscale x 2 x i1> splat(i1 true), i32 %evl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.mir new file mode 100644 index 00000000000000..5a223580821b75 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/reduce-vl-peephole.mir @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -run-pass=riscv-vector-peephole \ +# RUN: -verify-machineinstrs | FileCheck %s +--- +name: avl_not_dominated +body: | + bb.0: + ; CHECK-LABEL: name: avl_not_dominated + ; CHECK: %evl:gprnox0 = ADDI $x0, 1 + ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, %evl, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: PseudoVSE32_V_M1 %x, $noreg, %evl, 5 /* e32 */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + %evl:gprnox0 = ADDI $x0, 1 + PseudoVSE32_V_M1 %x:vr, $noreg, %evl, 5 /* e32 */ +...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll index 6e34d59a2d9894..e8a7d790758596 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll @@ -19,13 +19,12 @@ define <vscale x 1 x half> @vfsgnj_vv_nxv1f16(<vscale x 1 x half> %va, %v = call <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl) ret <vscale x 1 x half> %v @@ -40,13 +39,12 @@ define <vscale x 1 x half> @vfsgnj_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.copysign.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> splat (i1 true), i32 %evl) ret <vscale x 1 x half> %v @@ -63,13 +61,12 @@ define <vscale x 2 x half> @vfsgnj_vv_nxv2f16(<vscale x 2 x half> %va, %v = call <vscale x 2 x half> @llvm.vp.copysign.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl) ret <vscale x 2 x half> %v @@ -84,13 +81,12 @@ define <vscale x 2 x half> @vfsgnj_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v9, v9, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.copysign.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> splat (i1 true), i32 %evl) ret <vscale x 2 x half> %v @@ -107,13 +103,12 @@ define <vscale x 4 x half> @vfsgnj_vv_nxv4f16(<vscale x 4 x half> %va, %v = call <vscale x 4 x half> @llvm.vp.copysign.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl) ret <vscale x 4 x half> %v @@ -128,13 +123,12 @@ define <vscale x 4 x half> @vfsgnj_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v10, v12, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.copysign.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> splat (i1 true), i32 %evl) ret <vscale x 4 x half> %v @@ -151,13 +145,12 @@ define <vscale x 8 x half> @vfsgnj_vv_nxv8f16(<vscale x 8 x half> %va, %v = call <vscale x 8 x half> @llvm.vp.copysign.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl) ret <vscale x 8 x half> %v @@ -172,13 +165,12 @@ define <vscale x 8 x half> @vfsgnj_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v12, v16, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v10, v10, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; 
ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v10 ; ZVFHMIN-NEXT: ret %v = call <vscale x 8 x half> @llvm.vp.copysign.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> splat (i1 true), i32 %evl) ret <vscale x 8 x half> %v @@ -195,13 +187,12 @@ define <vscale x 16 x half> @vfsgnj_vv_nxv16f16(<vscale x 16 x half> %va, %v = call <vscale x 16 x half> @llvm.vp.copysign.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl) ret <vscale x 16 x half> %v @@ -216,13 +207,12 @@ define <vscale x 16 x half> @vfsgnj_vv_nxv16f16_unmasked(<vscale x 16 x half> %v ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v12, v12, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v12 ; ZVFHMIN-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.copysign.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl) ret <vscale x 16 x half> %v @@ -239,48 +229,12 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16(<vscale x 32 x half> %va, %v = call <vscale x 32 x half> @llvm.vp.copysign.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x half> %v @@ -295,48 +249,12 @@ define <vscale x 32 x half> @vfsgnj_vv_nxv32f16_unmasked(<vscale x 32 x half> %v ; ; ZVFHMIN-LABEL: vfsgnj_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 3 -; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a2 -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v16, v24, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v16, v16, a1 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v16 ; ZVFHMIN-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.copysign.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> splat (i1 true), i32 %evl) ret <vscale x 32 x half> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll index 0f7e3f1e0ea5a2..b9be6eb1fa3737 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll @@ -19,12 +19,10 @@ define <vscale x 1 x half> @vfabs_vv_nxv1f16(<vscale x 1 x half> %va, %v = call <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl) ret <vscale x 1 x half> %v @@ -39,12 +37,10 @@ define <vscale x 1 x half> @vfabs_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fabs.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) ret <vscale x 1 x half> %v @@ -61,12 +57,10 @@ define <vscale x 2 x half> @vfabs_vv_nxv2f16(<vscale x 2 x half> %va, %v = call <vscale x 2 x half> @llvm.vp.fabs.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl) ret <vscale x 2 x half> %v @@ -81,12 +75,10 @@ define <vscale x 2 x half> @vfabs_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfabs.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fabs.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl) ret <vscale x 2 x half> %v @@ -103,12 +95,10 @@ define <vscale x 4 x half> @vfabs_vv_nxv4f16(<vscale x 4 x half> %va, %v = call <vscale x 4 x half> @llvm.vp.fabs.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl) ret <vscale x 4 x half> %v @@ -123,12 +113,10 @@ define <vscale x 4 x half> @vfabs_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfabs.v v10, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fabs.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl) ret <vscale x 4 x half> %v @@ -145,12 +133,10 @@ define <vscale x 8 x half> @vfabs_vv_nxv8f16(<vscale x 8 x half> %va, %v = call <vscale x 8 x half> @llvm.vp.fabs.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl) ret <vscale x 8 x half> %v @@ -165,12 +151,10 @@ define <vscale x 8 x half> @vfabs_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i ; ; ZVFHMIN-LABEL: vfabs_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfabs.v v12, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 8 x half> @llvm.vp.fabs.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl) ret <vscale x 8 x half> %v @@ -187,12 +171,10 @@ define <vscale x 16 x half> @vfabs_vv_nxv16f16(<vscale x 16 x half> %va, %v = call <vscale x 16 x half> @llvm.vp.fabs.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl) ret <vscale x 16 x half> %v @@ -207,12 +189,10 @@ define <vscale x 16 x half> @vfabs_vv_nxv16f16_unmasked(<vscale x 16 x half> %va ; ; ZVFHMIN-LABEL: vfabs_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, 
a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fabs.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl) ret <vscale x 16 x half> %v @@ -229,32 +209,10 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16(<vscale x 32 x half> %va, %v = call <vscale x 32 x half> @llvm.vp.fabs.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x half> %v @@ -269,32 +227,10 @@ define <vscale x 32 x half> @vfabs_vv_nxv32f16_unmasked(<vscale x 32 x half> %va ; ; ZVFHMIN-LABEL: vfabs_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fabs.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl) ret <vscale x 32 x half> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll index 69ea7ce33cf6b6..af2668a9b0c545 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll @@ -19,12 +19,9 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16(<vscale x 1 x half> %va, %v = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> %m, i32 %evl) ret <vscale x 1 x half> %v @@ -39,12 +36,9 @@ define <vscale x 1 x half> @vfneg_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl) ret <vscale x 1 x half> %v @@ -61,12 +55,9 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16(<vscale x 2 x half> %va, %v = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl) ret <vscale x 2 x half> %v @@ -81,12 +72,9 @@ define <vscale x 2 x half> @vfneg_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vfneg.v v9, v9 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl) ret <vscale x 2 x half> %v @@ -103,12 +91,9 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16(<vscale x 4 x half> %va, %v = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> %m, i32 %evl) ret <vscale x 4 x half> %v @@ -123,12 +108,9 @@ define <vscale x 4 x half> @vfneg_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v 
v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfneg.v v10, v10 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl) ret <vscale x 4 x half> %v @@ -145,12 +127,9 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16(<vscale x 8 x half> %va, %v = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 %evl) ret <vscale x 8 x half> %v @@ -165,12 +144,9 @@ define <vscale x 8 x half> @vfneg_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, i ; ; ZVFHMIN-LABEL: vfneg_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfneg.v v12, v12 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl) ret <vscale x 8 x half> %v @@ -187,12 +163,9 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16(<vscale x 16 x half> %va, %v = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 %evl) ret <vscale x 16 x half> %v @@ -207,12 +180,9 @@ define <vscale x 16 x half> @vfneg_vv_nxv16f16_unmasked(<vscale x 16 x half> %va ; ; ZVFHMIN-LABEL: vfneg_vv_nxv16f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl) ret <vscale x 16 x half> %v @@ -229,32 +199,9 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16(<vscale x 32 x half> %va, %v = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x half> %v @@ -269,32 +216,9 @@ define <vscale x 32 x half> @vfneg_vv_nxv32f16_unmasked(<vscale x 32 x half> %va ; ; ZVFHMIN-LABEL: vfneg_vv_nxv32f16_unmasked: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a1, a2, 1 -; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: sltu a4, a0, a3 -; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: and a3, a4, a3 -; ZVFHMIN-NEXT: srli a2, a2, 2 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v16 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 -; ZVFHMIN-NEXT: # %bb.1: -; ZVFHMIN-NEXT: mv a0, a1 -; ZVFHMIN-NEXT: .LBB11_2: -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfneg.v v16, v16 -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: lui a1, 8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 ; ZVFHMIN-NEXT: ret %v = call <vscale x 32 x half> @llvm.vp.fneg.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl) ret <vscale x 32 x half> %v diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index adba502335f86c..c0b14d2064d5eb 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -290,3 +290,21 @@ define <8 x i16> @trunc_sat_u_v8i16(<8 x
half> %x) { %a = fptoui <8 x half> %x to <8 x i16> ret <8 x i16> %a } + +define <8 x i16> @trunc_sat_s_v8i16_sat(<8 x half> %x) { +; CHECK-LABEL: trunc_sat_s_v8i16_sat: +; CHECK: .functype trunc_sat_s_v8i16_sat (v128) -> (v128) +; CHECK-NEXT: i16x8.trunc_sat_f16x8_s $push[[R:[0-9]+]]=, $0 +; CHECK-NEXT: return $pop[[R]]{{$}} + %a = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> %x) + ret <8 x i16> %a +} + +define <8 x i16> @trunc_sat_u_v8i16_sat(<8 x half> %x) { +; CHECK-LABEL: trunc_sat_u_v8i16_sat: +; CHECK: .functype trunc_sat_u_v8i16_sat (v128) -> (v128) +; CHECK-NEXT: i16x8.trunc_sat_f16x8_u $push[[R:[0-9]+]]=, $0 +; CHECK-NEXT: return $pop[[R]]{{$}} + %a = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %x) + ret <8 x i16> %a +} diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index 032168e28421b9..23b1043c700165 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -33,7 +33,7 @@ declare void @use32(ptr) ;. define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca -; DYNAMIC-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -42,33 +42,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg
[[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca -; ZERO-BASED-SHADOW-SAME: () #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -77,30 +77,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], 
[[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg 
[[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; entry: %x = alloca i32, align 4 @@ -131,17 +131,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. -; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. -; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -149,16 +147,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; DYNAMIC-SHADOW: [[META10]] = !{null} -; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; DYNAMIC-SHADOW: [[META9]] = !{null} +; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. 
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -166,14 +163,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META10]] = !{null} -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META9]] = !{null} +; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. 
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll index dc2d11cb4b3538..9cebe2e845f772 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/basic.ll @@ -9,6 +9,8 @@ ; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-recover=1 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=RECOVER-ZERO-BASED-SHADOW +; CHECK: @llvm.used = appending global [1 x ptr] [ptr @hwasan.module_ctor] +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @hwasan.module_ctor, ptr @hwasan.module_ctor }] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux" @@ -30,7 +32,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; CHECK-NEXT: br label [[TMP13]] @@ -66,7 +68,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1:![0-9]+]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP13]] @@ -86,7 +88,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -106,10 +108,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -118,13 +120,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -143,7 +145,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -163,10 +165,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 96", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -175,13 +177,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] 
= add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -210,7 +212,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; CHECK-NEXT: br label [[TMP13]] @@ -246,7 +248,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP13]] @@ -266,7 +268,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -286,10 +288,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 
[[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -298,13 +300,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -323,7 +325,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -343,10 +345,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 97", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -355,13 +357,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -390,7 +392,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; CHECK-NEXT: br label [[TMP13]] @@ -426,7 +428,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP13]] @@ -446,7 +448,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -466,10 +468,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +480,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -503,7 +505,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -523,10 +525,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 98", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -535,13 +537,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +572,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; CHECK-NEXT: br label [[TMP13]] @@ -606,7 +608,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP13]] @@ -626,7 +628,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -646,10 +648,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -658,13 +660,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -683,7 +685,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -703,10 +705,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 99", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -715,13 +717,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -750,7 +752,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; CHECK-NEXT: br label [[TMP13]] @@ -786,7 +788,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP13]] @@ -806,7 +808,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -826,10 +828,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -838,13 +840,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -863,7 +865,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -883,10 +885,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 100", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -895,13 +897,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1011,7 +1013,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; CHECK-NEXT: br label [[TMP13]] @@ -1047,7 +1049,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1067,7 +1069,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1087,10 +1089,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1099,13 +1101,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1124,7 +1126,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1144,10 +1146,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 112", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1156,13 +1158,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1191,7 +1193,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; CHECK-NEXT: br label [[TMP13]] @@ -1227,7 +1229,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1247,7 +1249,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1267,10 +1269,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof 
[[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1279,13 +1281,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1304,7 +1306,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1324,10 +1326,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 113", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1336,13 +1338,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { 
; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1371,7 +1373,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; CHECK-NEXT: br label [[TMP13]] @@ -1407,7 +1409,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1427,7 +1429,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1447,10 +1449,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1459,13 +1461,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1484,7 +1486,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1504,10 +1506,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 114", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label 
[[TMP21:%.*]] @@ -1516,13 +1518,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1551,7 +1553,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; CHECK-NEXT: br label [[TMP13]] @@ -1587,7 +1589,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1607,7 +1609,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1627,10 +1629,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1639,13 +1641,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1664,7 +1666,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1684,10 +1686,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect 
"ebreak\0Aaddiw x0, x11, 115", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1696,13 +1698,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1731,7 +1733,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; CHECK: 12: ; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; CHECK-NEXT: br label [[TMP13]] @@ -1767,7 +1769,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP8]] ; FASTPATH-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 ; FASTPATH-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP6]], [[TMP10]] -; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] ; FASTPATH: 12: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP3]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP13]] @@ -1787,7 +1789,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-DYNAMIC-SHADOW: 8: ; ABORT-DYNAMIC-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-DYNAMIC-SHADOW-NEXT: br label [[TMP9]] @@ -1807,10 +1809,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], 
align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1819,13 +1821,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1844,7 +1846,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; ABORT-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; ABORT-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; ABORT-ZERO-BASED-SHADOW: 8: ; ABORT-ZERO-BASED-SHADOW-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; ABORT-ZERO-BASED-SHADOW-NEXT: br label [[TMP9]] @@ -1864,10 +1866,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof 
[[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "ebreak\0Aaddiw x0, x11, 116", "{x10}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1876,13 +1878,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -2058,43 +2060,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret 
i8 [[B]] @@ -2106,43 +2108,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 0f74736dc232ea..4bd23ea76c159b 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -34,7 +34,7 @@ declare void @use32(ptr) ;. 
define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-LABEL: define void @test_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; DYNAMIC-SHADOW-NEXT: entry: ; DYNAMIC-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow) ; DYNAMIC-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -43,33 +43,33 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; DYNAMIC-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: 
[[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP11]], !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP18]], !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG14]] +; DYNAMIC-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; ; ZERO-BASED-SHADOW-LABEL: define void @test_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG8:![0-9]+]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0:[0-9]+]] personality ptr @__hwasan_personality_thunk !dbg [[DBG7:![0-9]+]] { ; ZERO-BASED-SHADOW-NEXT: entry: ; ZERO-BASED-SHADOW-NEXT: [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null) ; ZERO-BASED-SHADOW-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) @@ -78,30 +78,30 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 -; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META11:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META13:![0-9]+]]) -; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG14:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg 
[[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG14]] -; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG15:![0-9]+]] -; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, i1 false), !dbg [[DBG15]] -; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG15]] +; ZERO-BASED-SHADOW-NEXT: #dbg_value(!DIArgList(ptr [[X]], ptr [[X]]), [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref), [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG13:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = shl i64 [[TMP3]], 56, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], [[TMP6]], !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[X_HWASAN:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP3]] to i8, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], 72057594037927935, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 4, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG13]] +; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] +; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 72057594037927935, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP17]], 4, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr, !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP19]], i8 [[TMP15]], i64 1, 
i1 false), !dbg [[DBG14]] +; ZERO-BASED-SHADOW-NEXT: ret void, !dbg [[DBG14]] ; entry: %x = alloca i32, align 4 @@ -112,13 +112,13 @@ entry: define void @test_vscale_alloca() sanitize_hwaddress { ; DYNAMIC-SHADOW-LABEL: define void @test_vscale_alloca( -; DYNAMIC-SHADOW-SAME: ) #[[ATTR1]] { +; DYNAMIC-SHADOW-SAME: ) #[[ATTR0]] { ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; DYNAMIC-SHADOW-NEXT: ret void ; ; ZERO-BASED-SHADOW-LABEL: define void @test_vscale_alloca( -; ZERO-BASED-SHADOW-SAME: ) #[[ATTR1]] { +; ZERO-BASED-SHADOW-SAME: ) #[[ATTR0]] { ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca , align 32 ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X]]) ; ZERO-BASED-SHADOW-NEXT: ret void @@ -150,17 +150,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !23 = !DILocation(line: 7, column: 5, scope: !15) !24 = !DILocation(line: 8, column: 1, scope: !15) ;. -; DYNAMIC-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; DYNAMIC-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; DYNAMIC-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; DYNAMIC-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; DYNAMIC-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; DYNAMIC-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; DYNAMIC-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. -; ZERO-BASED-SHADOW: attributes #[[ATTR0:[0-9]+]] = { nobuiltin } -; ZERO-BASED-SHADOW: attributes #[[ATTR1]] = { nobuiltin sanitize_hwaddress } -; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nounwind } -; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -; ZERO-BASED-SHADOW: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; ZERO-BASED-SHADOW: attributes #[[ATTR0]] = { sanitize_hwaddress } +; ZERO-BASED-SHADOW: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; ZERO-BASED-SHADOW: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; ZERO-BASED-SHADOW: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } ;. 
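The attribute hunks above carry the substance of this change: the pass apparently no longer marks instrumented functions nobuiltin, so the old groups { nobuiltin } and { nobuiltin sanitize_hwaddress } collapse into a lone { sanitize_hwaddress }, functions without the sanitizer attribute lose their attribute group entirely (the test_load_noattr hunks earlier), and every #[[ATTR1]] reference in the generated checks renumbers to #[[ATTR0]]. A minimal sketch of that renumbering, assuming a plain aarch64 module; the function @f and its body are placeholders, not taken from the patch:

target triple = "aarch64--linux-android10000"

define void @f(ptr %p) sanitize_hwaddress {
  ; `opt -passes=hwasan -S` used to tag this function with
  ;   attributes #1 = { nobuiltin sanitize_hwaddress }
  ; with the patched pass the group is expected to stay
  ;   attributes #0 = { sanitize_hwaddress }
  ; which is exactly the ATTR1 -> ATTR0 shift in the checks above.
  store i8 0, ptr %p, align 1
  ret void
}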
; DYNAMIC-SHADOW: [[META0]] = !{ptr @hwasan.note} ; DYNAMIC-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -168,16 +166,15 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[META3]] = !{} ; DYNAMIC-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; DYNAMIC-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; DYNAMIC-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; DYNAMIC-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; DYNAMIC-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; DYNAMIC-SHADOW: [[META10]] = !{null} -; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; DYNAMIC-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; DYNAMIC-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; DYNAMIC-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; DYNAMIC-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; DYNAMIC-SHADOW: [[META9]] = !{null} +; DYNAMIC-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; DYNAMIC-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. 
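Apart from the one deleted line, this metadata block is pure renumbering. The check that disappears corresponds to a module flag the pass evidently no longer emits; a sketch of the relevant nodes, with the neighboring flags taken from the checks above and the grouping into !llvm.module.flags assumed:

!llvm.module.flags = !{!4, !5, !6}
!4 = !{i32 7, !"Dwarf Version", i32 4}
!5 = !{i32 2, !"Debug Info Version", i32 3}
!6 = !{i32 4, !"nosanitize_hwaddress", i32 1}

Once !6 is no longer in the output, every later node shifts down one slot, which accounts for all of the [[META11]] -> [[META10]] and [[DBG8]] -> [[DBG7]] churn in this file, and plausibly for the matching [[PROF2]] -> [[PROF1]] shift throughout basic.ll; the debug info content itself is unchanged.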
; ZERO-BASED-SHADOW: [[META0]] = !{ptr @hwasan.note} ; ZERO-BASED-SHADOW: [[META1:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META2:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META3:![0-9]+]], splitDebugInlining: false, nameTableKind: None) @@ -185,14 +182,13 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[META3]] = !{} ; ZERO-BASED-SHADOW: [[META4:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; ZERO-BASED-SHADOW: [[META5:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{i32 4, !"nosanitize_hwaddress", i32 1} -; ZERO-BASED-SHADOW: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; ZERO-BASED-SHADOW: [[DBG8]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META9:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) -; ZERO-BASED-SHADOW: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META10]] = !{null} -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: [[DBG8]], file: [[META2]], line: 5, type: [[META12:![0-9]+]]) -; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; ZERO-BASED-SHADOW: [[META13]] = !DILocation(line: 0, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 7, column: 5, scope: [[DBG8]]) -; ZERO-BASED-SHADOW: [[DBG15]] = !DILocation(line: 8, column: 1, scope: [[DBG8]]) +; ZERO-BASED-SHADOW: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: [[META2]], file: [[META2]], line: 4, type: [[META8:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META1]], retainedNodes: [[META3]]) +; ZERO-BASED-SHADOW: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META9]] = !{null} +; ZERO-BASED-SHADOW: [[META10]] = !DILocalVariable(name: "x", scope: [[DBG7]], file: [[META2]], line: 5, type: [[META11:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META12]] = !DILocation(line: 0, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 7, column: 5, scope: [[DBG7]]) +; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: [[DBG7]]) ;. diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll b/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll deleted file mode 100644 index eeb51aeda1000b..00000000000000 --- a/llvm/test/Instrumentation/HWAddressSanitizer/attrinfer.ll +++ /dev/null @@ -1,14 +0,0 @@ -; Standard library functions get inferred attributes, some of which are not -; correct when building for HWASan. 
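That comment summarizes why the test existed: libcall declarations such as frexpf arrive with inferred attributes, and the old pass defended against the ones that are too strong under HWASan by stripping them and adding nobuiltin, which is what the CHECK lines of this deleted file pin down. With nobuiltin no longer added (see the attribute hunks above), the test has nothing left to verify, hence the deletion. An illustrative sketch of the hazard the workaround addressed, with a hypothetical caller that is not part of the deleted file:

declare float @frexpf(float noundef, ptr nocapture noundef) #0

define float @caller(ptr %exp) sanitize_hwaddress {
  ; memory(argmem: write) promises this call writes only through %exp.
  ; An interposed sanitizer version of frexpf also inspects shadow
  ; memory and may call into the runtime, so optimizations justified by
  ; the attribute (e.g. moving or deleting stores around the call)
  ; could become unsound; the old pass therefore dropped it.
  %r = call float @frexpf(float 1.0, ptr %exp)
  ret float %r
}

attributes #0 = { nounwind memory(argmem: write) }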
- -; RUN: opt < %s -passes=hwasan -S | FileCheck %s --check-prefixes=CHECK - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-android10000" - -declare float @frexpf(float noundef, ptr nocapture noundef) local_unnamed_addr #0 - -attributes #0 = { mustprogress nofree nounwind willreturn memory(argmem: write) "frame-pointer"="non-leaf" "hwasan-abi"="interceptor" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fix-cortex-a53-835769,+fp-armv8,+neon,+outline-atomics,+tagged-globals,+v8a" } - -; CHECK-NOT: memory(argmem: write) -; CHECK: nobuiltin diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll index 1e74f2891a2e3c..4212293f42545e 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -42,7 +42,7 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2:![0-9]+]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1:![0-9]+]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 0) ; FASTPATH-NEXT: br label [[TMP9]] @@ -70,10 +70,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -82,13 +82,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof 
[[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -115,10 +115,10 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2:![0-9]+]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1:![0-9]+]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2336", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -127,13 +127,13 @@ define i8 @test_load8(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -174,7 +174,7 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 1) ; FASTPATH-NEXT: br label [[TMP9]] @@ -202,10 +202,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -214,13 +214,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -247,10 +247,10 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2337", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -259,13 +259,13 @@ define i16 @test_load16(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] 
= icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -306,7 +306,7 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 2) ; FASTPATH-NEXT: br label [[TMP9]] @@ -334,10 +334,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -346,13 +346,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -379,10 +379,10 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label 
[[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2338", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -391,13 +391,13 @@ define i32 @test_load32(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -438,7 +438,7 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 3) ; FASTPATH-NEXT: br label [[TMP9]] @@ -466,10 +466,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -478,13 +478,13 @@ define i64 @test_load64(ptr %a) 
sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -511,10 +511,10 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2339", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -523,13 +523,13 @@ define i64 @test_load64(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -570,7 +570,7 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: 
[[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 4) ; FASTPATH-NEXT: br label [[TMP9]] @@ -598,10 +598,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -610,13 +610,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -643,10 +643,10 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2340", "{x0}"(i64 [[TMP0]]) ; 
RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -655,13 +655,13 @@ define i128 @test_load128(ptr %a) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -771,7 +771,7 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 16) ; FASTPATH-NEXT: br label [[TMP9]] @@ -799,10 +799,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -811,13 +811,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: 
[[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -844,10 +844,10 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2352", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -856,13 +856,13 @@ define void @test_store8(ptr %a, i8 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 0 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -903,7 +903,7 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 17) ; FASTPATH-NEXT: br label [[TMP9]] @@ -931,10 +931,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr 
[[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -943,13 +943,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -976,10 +976,10 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2353", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -988,13 +988,13 @@ define void @test_store16(ptr %a, i16 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label 
[[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1035,7 +1035,7 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 18) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1063,10 +1063,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1075,13 +1075,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ 
-1108,10 +1108,10 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2354", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1120,13 +1120,13 @@ define void @test_store32(ptr %a, i32 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 3 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1167,7 +1167,7 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 19) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1195,10 +1195,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; 
RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1207,13 +1207,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1240,10 +1240,10 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2355", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1252,13 +1252,13 @@ define void @test_store64(ptr %a, i64 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 7 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof 
[[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1299,7 +1299,7 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; FASTPATH-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; FASTPATH-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; FASTPATH-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF2]] +; FASTPATH-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; FASTPATH: 8: ; FASTPATH-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[A]], i32 20) ; FASTPATH-NEXT: br label [[TMP9]] @@ -1327,10 +1327,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DOTHWASAN_SHADOW]], i64 [[TMP4]] ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 8: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 10: ; RECOVER-DYNAMIC-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1339,13 +1339,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 16: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-DYNAMIC-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-DYNAMIC-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-DYNAMIC-SHADOW: 21: ; RECOVER-DYNAMIC-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-DYNAMIC-SHADOW: 22: @@ -1372,10 +1372,10 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP5]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP7:%.*]] = icmp ne i8 [[TMP2]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 
[[TMP7]], label [[TMP8:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 8: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP6]], 15 -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 10: ; RECOVER-ZERO-BASED-SHADOW-NEXT: call void asm sideeffect "brk #2356", "{x0}"(i64 [[TMP0]]) ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP21:%.*]] @@ -1384,13 +1384,13 @@ define void @test_store128(ptr %a, i128 %b) sanitize_hwaddress { ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i8 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = icmp uge i8 [[TMP14]], [[TMP6]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP15]], label [[TMP10]], label [[TMP16:%.*]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 16: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP17:%.*]] = or i64 [[TMP3]], 15 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP2]], [[TMP19]] -; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF2]] +; RECOVER-ZERO-BASED-SHADOW-NEXT: br i1 [[TMP20]], label [[TMP10]], label [[TMP21]], !prof [[PROF1]] ; RECOVER-ZERO-BASED-SHADOW: 21: ; RECOVER-ZERO-BASED-SHADOW-NEXT: br label [[TMP22]] ; RECOVER-ZERO-BASED-SHADOW: 22: @@ -1542,43 +1542,43 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_noattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_noattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; 
ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_noattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] @@ -1590,43 +1590,43 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] ; ; NOFASTPATH-LABEL: define i8 @test_load_notmyattr -; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; NOFASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; NOFASTPATH-NEXT: entry: ; NOFASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; NOFASTPATH-NEXT: ret i8 [[B]] ; ; FASTPATH-LABEL: define i8 @test_load_notmyattr -; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; FASTPATH-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; FASTPATH-NEXT: entry: ; FASTPATH-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; FASTPATH-NEXT: ret i8 [[B]] ; ; ABORT-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-DYNAMIC-SHADOW-NEXT: entry: ; ABORT-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-DYNAMIC-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-DYNAMIC-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-DYNAMIC-SHADOW-NEXT: entry: ; RECOVER-DYNAMIC-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-DYNAMIC-SHADOW-NEXT: ret i8 [[B]] ; ; ABORT-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; ABORT-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; ABORT-ZERO-BASED-SHADOW-NEXT: entry: ; ABORT-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; ABORT-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] ; ; RECOVER-ZERO-BASED-SHADOW-LABEL: define i8 @test_load_notmyattr -; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; RECOVER-ZERO-BASED-SHADOW-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { ; RECOVER-ZERO-BASED-SHADOW-NEXT: entry: ; RECOVER-ZERO-BASED-SHADOW-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; RECOVER-ZERO-BASED-SHADOW-NEXT: ret i8 [[B]] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll index f72fc0a9720e4a..980189c5607f31 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/fixed-shadow.ll @@ -194,7 +194,7 @@ entry: define i8 @test_load_noattr(ptr %a) { ; CHECK-LABEL: define i8 @test_load_noattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4 ; CHECK-NEXT: ret i8 [[B]] @@ -206,7 +206,7 @@ entry: define i8 @test_load_notmyattr(ptr %a) sanitize_address { ; CHECK-LABEL: define i8 @test_load_notmyattr -; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1:[0-9]+]] { 
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[A]], align 4
; CHECK-NEXT: ret i8 [[B]]
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll
index 2635dfb75ed98f..00614b603fe799 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/hwasan-pass-second-run.ll
@@ -18,7 +18,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: @__hwasan_shadow = external global [0 x i8]
;.
define i8 @test_load8(ptr %a) sanitize_hwaddress {
-; CHECK: Function Attrs: nobuiltin sanitize_hwaddress
+; CHECK: Function Attrs: sanitize_hwaddress
; CHECK-LABEL: define i8 @test_load8
; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
@@ -33,7 +33,7 @@ entry:
ret i8 %b
}
;.
-; CHECK: attributes #[[ATTR0]] = { nobuiltin sanitize_hwaddress }
+; CHECK: attributes #[[ATTR0]] = { sanitize_hwaddress }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
;.
; CHECK: [[META0]] = !{ptr @hwasan.note}
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
index 919eacb2951f5e..c0e370f20213aa 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
@@ -11,5 +11,5 @@ entry:
ret void
}

-; CHECK: attributes #0 = { nobuiltin sanitize_hwaddress uwtable }
+; CHECK: attributes #0 = { sanitize_hwaddress uwtable }
attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable }
diff --git a/llvm/test/MC/AArch64/SVE/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch-negative.s
new file mode 100644
index 00000000000000..e3029c16ffc8a6
--- /dev/null
+++ b/llvm/test/MC/AArch64/SVE/directive-arch-negative.s
@@ -0,0 +1,8 @@
+// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s
+
+// Check that setting +nosve implies +nosve2
+.arch armv9-a+nosve
+
+adclb z0.s, z1.s, z31.s
+// CHECK: error: instruction requires: sve2
+// CHECK-NEXT: adclb z0.s, z1.s, z31.s
diff --git a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s
index 661f13974d0bc8..31118f7490d00d 100644
--- a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s
+++ b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s
@@ -1,7 +1,12 @@
// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s

-.arch_extension nosve
+.arch_extension sve2+nosve

ptrue p0.b, pow2
// CHECK: error: instruction requires: sve or sme
// CHECK-NEXT: ptrue p0.b, pow2
+
+// Check that setting +nosve implies +nosve2
+adclb z0.s, z1.s, z31.s
+// CHECK: error: instruction requires: sve2
+// CHECK-NEXT: adclb z0.s, z1.s, z31.s
diff --git a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s
index 82acc1b0b0be9b..6ba537ca70609e 100644
--- a/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s
+++ b/llvm/test/MC/AArch64/SVE/directive-cpu-negative.s
@@ -1,6 +1,11 @@
// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s

-.cpu generic+sve+nosve
+.cpu generic+sve2+nosve
ptrue p0.b, pow2
// CHECK: error: instruction requires: sve or sme
// CHECK-NEXT: ptrue p0.b, pow2
+
+// Check that setting +nosve implies +nosve2
+adclb z0.s, z1.s, z31.s
+// CHECK: error: instruction requires: sve2
+// CHECK-NEXT: adclb z0.s, z1.s, z31.s
diff --git a/llvm/test/MC/AArch64/directive-arch-negative.s b/llvm/test/MC/AArch64/directive-arch-negative.s
index f60759899aa6c9..406507d5fc8f4d 100644
--- a/llvm/test/MC/AArch64/directive-arch-negative.s
+++ b/llvm/test/MC/AArch64/directive-arch-negative.s
@@ -12,10 +12,13 @@
# CHECK-NEXT: aese v0.8h, v1.8h
# CHECK-NEXT: ^

-// We silently ignore invalid features.
.arch armv8+foo
aese v0.8h, v1.8h

+# CHECK: error: unsupported architectural extension: foo
+# CHECK-NEXT: .arch armv8+foo
+# CHECK-NEXT: ^
+
# CHECK: error: invalid operand for instruction
# CHECK-NEXT: aese v0.8h, v1.8h
# CHECK-NEXT: ^
diff --git a/llvm/test/MC/AArch64/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/directive-arch_extension-negative.s
index 1c1cfc9d33e3ed..1843af56555461 100644
--- a/llvm/test/MC/AArch64/directive-arch_extension-negative.s
+++ b/llvm/test/MC/AArch64/directive-arch_extension-negative.s
@@ -4,7 +4,7 @@
// RUN: -filetype asm -o - %s 2>&1 | FileCheck %s

.arch_extension axp64
-// CHECK: error: unknown architectural extension: axp64
+// CHECK: error: unsupported architectural extension: axp64
// CHECK-NEXT: .arch_extension axp64

crc32cx w0, w1, x3
@@ -49,6 +49,8 @@ fminnm d0, d0, d1
// CHECK: [[@LINE-1]]:1: error: instruction requires: fp
// CHECK-NEXT: fminnm d0, d0, d1

+// nofp implied nosimd, so reinstate it
+.arch_extension simd
addp v0.4s, v0.4s, v0.4s
// CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: neon
.arch_extension nosimd
@@ -70,6 +72,8 @@ casa w5, w7, [x20]
// CHECK: [[@LINE-1]]:1: error: instruction requires: lse
// CHECK-NEXT: casa w5, w7, [x20]

+// nolse implied nolse128, so reinstate it
+.arch_extension lse128
swpp x0, x2, [x3]
// CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: lse128
.arch_extension nolse128
@@ -84,6 +88,8 @@ cfp rctx, x0
// CHECK: [[@LINE-1]]:5: error: CFPRCTX requires: predres
// CHECK-NEXT: cfp rctx, x0

+// nopredres implied nopredres2, so reinstate it
+.arch_extension predres2
cosp rctx, x0
// CHECK-NOT: [[@LINE-1]]:6: error: COSP requires: predres2
.arch_extension nopredres2
@@ -133,6 +139,8 @@ ldapr x0, [x1]
// CHECK: [[@LINE-1]]:1: error: instruction requires: rcpc
// CHECK-NEXT: ldapr x0, [x1]

+// norcpc implied norcpc3, so reinstate it
+.arch_extension rcpc3
stilp w24, w0, [x16, #-8]!
// CHECK-NOT: [[@LINE-1]]:1: error: instruction requires: rcpc3
.arch_extension norcpc3
@@ -169,6 +177,8 @@ cpyfp [x0]!, [x1]!, x2!
// CHECK: [[@LINE-1]]:1: error: instruction requires: mops
// CHECK-NEXT: cpyfp [x0]!, [x1]!, x2!

+// nolse128 implied nod128, so reinstate it
+.arch_extension d128
// This needs to come before `.arch_extension nothe` as it uses an instruction
// that requires both `the` and `d128`
sysp #0, c2, c0, #0, x0, x1
@@ -204,6 +214,8 @@ umax x0, x1, x2
// CHECK: [[@LINE-1]]:1: error: instruction requires: cssc
// CHECK-NEXT: umax x0, x1, x2

+// noras implied norasv2, so reinstate it
+.arch_extension rasv2
mrs x0, ERXGSR_EL1
// CHECK-NOT: [[@LINE-1]]:9: error: expected readable system register
.arch_extension norasv2
diff --git a/llvm/test/MC/AVR/inst-brbc.s b/llvm/test/MC/AVR/inst-brbc.s
index 4d7d684da4468a..3ef3664cf07bfc 100644
--- a/llvm/test/MC/AVR/inst-brbc.s
+++ b/llvm/test/MC/AVR/inst-brbc.s
@@ -3,7 +3,6 @@
; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s

foo:
- brbc 3, .+8
brbc 0, .-16

.short 0xf759
@@ -11,14 +10,16 @@ foo:
.short 0xf74c
.short 0xf4c7

-; CHECK: brvc .Ltmp0+8 ; encoding: [0bAAAAA011,0b111101AA]
-; CHECK: ; fixup A - offset: 0, value: .Ltmp0+8, kind: fixup_7_pcrel
-; CHECK: brcc .Ltmp1-16 ; encoding: [0bAAAAA000,0b111101AA]
-; CHECK: ; fixup A - offset: 0, value: .Ltmp1-16, kind: fixup_7_pcrel
+; CHECK: brvc (.Ltmp0+8)+2 ; encoding: [0bAAAAA011,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel
+;
+; CHECK: brcc (.Ltmp1-16)+2 ; encoding: [0bAAAAA000,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-16)+2, kind: fixup_7_pcrel

-; INST: 23 f4 brvc .+8
-; INST: c0 f7 brsh .-16
-; INST: 59 f7 brne .-42
-; INST: 52 f7 brpl .-44
-; INST: 4c f7 brge .-46
-; INST: c7 f4 brid .+48
+; INST-LABEL: <foo>:
+; INST-NEXT: 23 f4 brvc .+8
+; INST-NEXT: c0 f7 brsh .-16
+; INST-NEXT: 59 f7 brne .-42
+; INST-NEXT: 52 f7 brpl .-44
+; INST-NEXT: 4c f7 brge .-46
+; INST-NEXT: c7 f4 brid .+48
diff --git a/llvm/test/MC/AVR/inst-brbs.s b/llvm/test/MC/AVR/inst-brbs.s
index 7987feeec654a1..f15a779a53654f 100644
--- a/llvm/test/MC/AVR/inst-brbs.s
+++ b/llvm/test/MC/AVR/inst-brbs.s
@@ -3,7 +3,6 @@
; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s

foo:
- brbs 3, .+8
brbs 0, .-12

.short 0xf359
@@ -11,14 +10,15 @@ foo:
.short 0xf34c
.short 0xf077

-; CHECK: brvs .Ltmp0+8 ; encoding: [0bAAAAA011,0b111100AA]
-; CHECK: ; fixup A - offset: 0, value: .Ltmp0+8, kind: fixup_7_pcrel
-; CHECK: brcs .Ltmp1-12 ; encoding: [0bAAAAA000,0b111100AA]
-; CHECK: ; fixup A - offset: 0, value: .Ltmp1-12, kind: fixup_7_pcrel
+; CHECK: brvs (.Ltmp0+8)+2 ; encoding: [0bAAAAA011,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel
+; CHECK: brcs (.Ltmp1-12)+2 ; encoding: [0bAAAAA000,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-12)+2, kind: fixup_7_pcrel

-; INST: 23 f0 brvs .+8
-; INST: d0 f3 brlo .-12
-; INST: 59 f3 breq .-42
-; INST: 52 f3 brmi .-44
-; INST: 4c f3 brlt .-46
-; INST: 77 f0 brie .+28
+; INST-LABEL: <foo>:
+; INST-NEXT: 23 f0 brvs .+8
+; INST-NEXT: d0 f3 brlo .-12
+; INST-NEXT: 59 f3 breq .-42
+; INST-NEXT: 52 f3 brmi .-44
+; INST-NEXT: 4c f3 brlt .-46
+; INST-NEXT: 77 f0 brie .+28
diff --git a/llvm/test/MC/AVR/inst-brcc.s b/llvm/test/MC/AVR/inst-brcc.s
new file mode 100644
index 00000000000000..d9218bc61e787f
--- /dev/null
+++ b/llvm/test/MC/AVR/inst-brcc.s
@@ -0,0 +1,28 @@
+; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s
+;
+; RUN: llvm-mc -filetype=obj -triple avr < %s \
+; RUN: | llvm-objdump -d - \
+; RUN: | FileCheck --check-prefix=INST %s
+
+foo:
+ brcc .+66
+ brcc .-22
+ brbc 0, .+66
+ brbc 0, bar
+
+bar:
+
+; CHECK: brcc (.Ltmp0+66)+2 ; encoding:
[0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+66)+2, kind: fixup_7_pcrel +; CHECK: brcc (.Ltmp1-22)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-22)+2, kind: fixup_7_pcrel +; CHECK: brcc (.Ltmp2+66)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+66)+2, kind: fixup_7_pcrel +; CHECK: brcc bar ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 08 f5 brsh .+66 +; INST-NEXT: a8 f7 brsh .-22 +; INST-NEXT: 08 f5 brsh .+66 +; INST-NEXT: 00 f4 brsh .+0 diff --git a/llvm/test/MC/AVR/inst-brcs.s b/llvm/test/MC/AVR/inst-brcs.s new file mode 100644 index 00000000000000..0012cb31f61269 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brcs.s @@ -0,0 +1,28 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brcs .+8 + brcs .+4 + brbs 0, .+8 + brbs 0, bar + +bar: + +; CHECK: brcs (.Ltmp0+8)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel +; CHECK: brcs (.Ltmp1+4)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+4)+2, kind: fixup_7_pcrel +; CHECK: brcs (.Ltmp2+8)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+8)+2, kind: fixup_7_pcrel +; CHECK: brcs bar ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 20 f0 brlo .+8 +; INST-NEXT: 10 f0 brlo .+4 +; INST-NEXT: 20 f0 brlo .+8 +; INST-NEXT: 00 f0 brlo .+0 diff --git a/llvm/test/MC/AVR/inst-breq.s b/llvm/test/MC/AVR/inst-breq.s new file mode 100644 index 00000000000000..f82010f02ba617 --- /dev/null +++ b/llvm/test/MC/AVR/inst-breq.s @@ -0,0 +1,28 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + breq .-18 + breq .-12 + brbs 1, .-18 + brbs 1, bar + +bar: + +; CHECK: breq (.Ltmp0-18)+2 ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-18)+2, kind: fixup_7_pcrel +; CHECK: breq (.Ltmp1-12)+2 ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-12)+2, kind: fixup_7_pcrel +; CHECK: brbs 1, (.Ltmp2-18)+2 ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2-18)+2, kind: fixup_7_pcrel +; CHECK: brbs 1, bar ; encoding: [0bAAAAA001,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: b9 f3 breq .-18 +; INST-NEXT: d1 f3 breq .-12 +; INST-NEXT: b9 f3 breq .-18 +; INST-NEXT: 01 f0 breq .+0 diff --git a/llvm/test/MC/AVR/inst-brge.s b/llvm/test/MC/AVR/inst-brge.s new file mode 100644 index 00000000000000..1121284a114689 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brge.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brge .+50 + brge .+42 + brge bar + +bar: + +; CHECK: brge (.Ltmp0+50)+2 ; encoding: [0bAAAAA100,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+50)+2, kind: fixup_7_pcrel +; CHECK: 
brge (.Ltmp1+42)+2 ; encoding: [0bAAAAA100,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+42)+2, kind: fixup_7_pcrel +; CHECK: brge bar ; encoding: [0bAAAAA100,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: cc f4 brge .+50 +; INST-NEXT: ac f4 brge .+42 +; INST-NEXT: 04 f4 brge .+0 diff --git a/llvm/test/MC/AVR/inst-brhc.s b/llvm/test/MC/AVR/inst-brhc.s new file mode 100644 index 00000000000000..eb16ac2ef7a64e --- /dev/null +++ b/llvm/test/MC/AVR/inst-brhc.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brhc .+12 + brhc .+14 + brhc bar + +bar: + +; CHECK: brhc (.Ltmp0+12)+2 ; encoding: [0bAAAAA101,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+12)+2, kind: fixup_7_pcrel +; CHECK: brhc (.Ltmp1+14)+2 ; encoding: [0bAAAAA101,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+14)+2, kind: fixup_7_pcrel +; CHECK: brhc bar ; encoding: [0bAAAAA101,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 35 f4 brhc .+12 +; INST-NEXT: 3d f4 brhc .+14 +; INST-NEXT: 05 f4 brhc .+0 diff --git a/llvm/test/MC/AVR/inst-brhs.s b/llvm/test/MC/AVR/inst-brhs.s new file mode 100644 index 00000000000000..77c49596b3b0b8 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brhs.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brhs .-66 + brhs .+14 + brhs bar + +bar: + +; CHECK: brhs (.Ltmp0-66)+2 ; encoding: [0bAAAAA101,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-66)+2, kind: fixup_7_pcrel +; CHECK: brhs (.Ltmp1+14)+2 ; encoding: [0bAAAAA101,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+14)+2, kind: fixup_7_pcrel +; CHECK: brhs bar ; encoding: [0bAAAAA101,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: fd f2 brhs .-66 +; INST-NEXT: 3d f0 brhs .+14 +; INST-NEXT: 05 f0 brhs .+0 diff --git a/llvm/test/MC/AVR/inst-brid.s b/llvm/test/MC/AVR/inst-brid.s new file mode 100644 index 00000000000000..70d0ea83c49b2a --- /dev/null +++ b/llvm/test/MC/AVR/inst-brid.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brid .+42 + brid .+62 + brid bar + +bar: + +; CHECK: brid (.Ltmp0+42)+2 ; encoding: [0bAAAAA111,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+42)+2, kind: fixup_7_pcrel +; CHECK: brid (.Ltmp1+62)+2 ; encoding: [0bAAAAA111,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+62)+2, kind: fixup_7_pcrel +; CHECK: brid bar ; encoding: [0bAAAAA111,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: af f4 brid .+42 +; INST-NEXT: ff f4 brid .+62 +; INST-NEXT: 07 f4 brid .+0 diff --git a/llvm/test/MC/AVR/inst-brie.s b/llvm/test/MC/AVR/inst-brie.s new file mode 100644 index 00000000000000..717c686e2ed44e --- /dev/null +++ b/llvm/test/MC/AVR/inst-brie.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple 
avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brie .+20 + brie .+40 + brie bar + +bar: + +; CHECK: brie (.Ltmp0+20)+2 ; encoding: [0bAAAAA111,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+20)+2, kind: fixup_7_pcrel +; CHECK: brie (.Ltmp1+40)+2 ; encoding: [0bAAAAA111,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+40)+2, kind: fixup_7_pcrel +; CHECK: brie bar ; encoding: [0bAAAAA111,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 57 f0 brie .+20 +; INST-NEXT: a7 f0 brie .+40 +; INST-NEXT: 07 f0 brie .+0 diff --git a/llvm/test/MC/AVR/inst-brlo.s b/llvm/test/MC/AVR/inst-brlo.s new file mode 100644 index 00000000000000..4b56d66ffdfe00 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brlo.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brlo .+12 + brlo .+28 + brlo bar + +bar: + +; CHECK: brlo (.Ltmp0+12)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+12)+2, kind: fixup_7_pcrel +; CHECK: brlo (.Ltmp1+28)+2 ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+28)+2, kind: fixup_7_pcrel +; CHECK: brlo bar ; encoding: [0bAAAAA000,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 30 f0 brlo .+12 +; INST-NEXT: 70 f0 brlo .+28 +; INST-NEXT: 00 f0 brlo .+0 diff --git a/llvm/test/MC/AVR/inst-brlt.s b/llvm/test/MC/AVR/inst-brlt.s new file mode 100644 index 00000000000000..8a7c543f9444b1 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brlt.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brlt .+16 + brlt .+2 + brlt bar + +bar: + +; CHECK: brlt (.Ltmp0+16)+2 ; encoding: [0bAAAAA100,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+16)+2, kind: fixup_7_pcrel +; CHECK: brlt (.Ltmp1+2)+2 ; encoding: [0bAAAAA100,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+2)+2, kind: fixup_7_pcrel +; CHECK: brlt bar ; encoding: [0bAAAAA100,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 44 f0 brlt .+16 +; INST-NEXT: 0c f0 brlt .+2 +; INST-NEXT: 04 f0 brlt .+0 diff --git a/llvm/test/MC/AVR/inst-brmi.s b/llvm/test/MC/AVR/inst-brmi.s new file mode 100644 index 00000000000000..878612d294dd95 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brmi.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brmi .+66 + brmi .+58 + brmi bar + +bar: + +; CHECK: brmi (.Ltmp0+66)+2 ; encoding: [0bAAAAA010,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+66)+2, kind: fixup_7_pcrel +; CHECK: brmi (.Ltmp1+58)+2 ; encoding: [0bAAAAA010,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+58)+2, kind: fixup_7_pcrel +; CHECK: brmi bar ; encoding: [0bAAAAA010,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 0a f1 brmi .+66 +; INST-NEXT: ea f0 brmi .+58 +; INST-NEXT: 02 f0 brmi .+0 diff --git 
a/llvm/test/MC/AVR/inst-brne.s b/llvm/test/MC/AVR/inst-brne.s new file mode 100644 index 00000000000000..9d6bee4b754d95 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brne.s @@ -0,0 +1,28 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brne .+10 + brne .+2 + brbc 1, .+10 + brbc 1, bar + +bar: + +; CHECK: brne (.Ltmp0+10)+2 ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+10)+2, kind: fixup_7_pcrel +; CHECK: brne (.Ltmp1+2)+2 ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+2)+2, kind: fixup_7_pcrel +; CHECK: brbc 1, (.Ltmp2+10)+2 ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+10)+2, kind: fixup_7_pcrel +; CHECK: brbc 1, bar ; encoding: [0bAAAAA001,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 29 f4 brne .+10 +; INST-NEXT: 09 f4 brne .+2 +; INST-NEXT: 29 f4 brne .+10 +; INST-NEXT: 01 f4 brne .+0 diff --git a/llvm/test/MC/AVR/inst-brpl.s b/llvm/test/MC/AVR/inst-brpl.s new file mode 100644 index 00000000000000..393365ee35339e --- /dev/null +++ b/llvm/test/MC/AVR/inst-brpl.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brpl .-12 + brpl .+18 + brpl bar + +bar: + +; CHECK: brpl (.Ltmp0-12)+2 ; encoding: [0bAAAAA010,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-12)+2, kind: fixup_7_pcrel +; CHECK: brpl (.Ltmp1+18)+2 ; encoding: [0bAAAAA010,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+18)+2, kind: fixup_7_pcrel +; CHECK: brpl bar ; encoding: [0bAAAAA010,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: d2 f7 brpl .-12 +; INST-NEXT: 4a f4 brpl .+18 +; INST-NEXT: 02 f4 brpl .+0 diff --git a/llvm/test/MC/AVR/inst-brsh.s b/llvm/test/MC/AVR/inst-brsh.s new file mode 100644 index 00000000000000..0bacd64d3d8d05 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brsh.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brsh .+32 + brsh .+70 + brsh bar + +bar: + +; CHECK: brsh (.Ltmp0+32)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+32)+2, kind: fixup_7_pcrel +; CHECK: brsh (.Ltmp1+70)+2 ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+70)+2, kind: fixup_7_pcrel +; CHECK: brsh bar ; encoding: [0bAAAAA000,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 80 f4 brsh .+32 +; INST-NEXT: 18 f5 brsh .+70 +; INST-NEXT: 00 f4 brsh .+0 diff --git a/llvm/test/MC/AVR/inst-brtc.s b/llvm/test/MC/AVR/inst-brtc.s new file mode 100644 index 00000000000000..eb4ee211628721 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brtc.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brtc .+52 + brtc .+50 + brtc bar + +bar: + +; CHECK: brtc (.Ltmp0+52)+2 ; encoding: 
[0bAAAAA110,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+52)+2, kind: fixup_7_pcrel +; CHECK: brtc (.Ltmp1+50)+2 ; encoding: [0bAAAAA110,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+50)+2, kind: fixup_7_pcrel +; CHECK: brtc bar ; encoding: [0bAAAAA110,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: d6 f4 brtc .+52 +; INST-NEXT: ce f4 brtc .+50 +; INST-NEXT: 06 f4 brtc .+0 diff --git a/llvm/test/MC/AVR/inst-brts.s b/llvm/test/MC/AVR/inst-brts.s new file mode 100644 index 00000000000000..ccd794a9225894 --- /dev/null +++ b/llvm/test/MC/AVR/inst-brts.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brts .+18 + brts .+22 + brts bar + +bar: + +; CHECK: brts (.Ltmp0+18)+2 ; encoding: [0bAAAAA110,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+18)+2, kind: fixup_7_pcrel +; CHECK: brts (.Ltmp1+22)+2 ; encoding: [0bAAAAA110,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+22)+2, kind: fixup_7_pcrel +; CHECK: brts bar ; encoding: [0bAAAAA110,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 4e f0 brts .+18 +; INST-NEXT: 5e f0 brts .+22 +; INST-NEXT: 06 f0 brts .+0 diff --git a/llvm/test/MC/AVR/inst-brvc.s b/llvm/test/MC/AVR/inst-brvc.s new file mode 100644 index 00000000000000..573f779c0dcd6a --- /dev/null +++ b/llvm/test/MC/AVR/inst-brvc.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brvc .-28 + brvc .-62 + brvc bar + +bar: + +; CHECK: brvc (.Ltmp0-28)+2 ; encoding: [0bAAAAA011,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-28)+2, kind: fixup_7_pcrel +; CHECK: brvc (.Ltmp1-62)+2 ; encoding: [0bAAAAA011,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-62)+2, kind: fixup_7_pcrel +; CHECK: brvc bar ; encoding: [0bAAAAA011,0b111101AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 93 f7 brvc .-28 +; INST-NEXT: 0b f7 brvc .-62 +; INST-NEXT: 03 f4 brvc .+0 diff --git a/llvm/test/MC/AVR/inst-brvs.s b/llvm/test/MC/AVR/inst-brvs.s new file mode 100644 index 00000000000000..d50a1a9ec5b62f --- /dev/null +++ b/llvm/test/MC/AVR/inst-brvs.s @@ -0,0 +1,24 @@ +; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; +; RUN: llvm-mc -filetype=obj -triple avr < %s \ +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s + +foo: + brvs .+18 + brvs .+32 + brvs bar + +bar: + +; CHECK: brvs (.Ltmp0+18)+2 ; encoding: [0bAAAAA011,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+18)+2, kind: fixup_7_pcrel +; CHECK: brvs (.Ltmp1+32)+2 ; encoding: [0bAAAAA011,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+32)+2, kind: fixup_7_pcrel +; CHECK: brvs bar ; encoding: [0bAAAAA011,0b111100AA] +; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel + +; INST-LABEL: : +; INST-NEXT: 4b f0 brvs .+18 +; INST-NEXT: 83 f0 brvs .+32 +; INST-NEXT: 03 f0 brvs .+0 diff --git a/llvm/test/MC/AVR/inst-family-cond-branch.s b/llvm/test/MC/AVR/inst-family-cond-branch.s deleted file mode 100644 index dc36425a884f3b..00000000000000 --- 
a/llvm/test/MC/AVR/inst-family-cond-branch.s +++ /dev/null @@ -1,321 +0,0 @@ -; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s -; RUN: llvm-mc -filetype=obj -triple avr < %s \ -; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s - - -foo: - ; BREQ - breq .-18 - breq .-12 - brbs 1, .-18 - brbs 1, baz - -; CHECK: breq .Ltmp0-18 ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp0-18, kind: fixup_7_pcrel -; CHECK: breq .Ltmp1-12 ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-12, kind: fixup_7_pcrel -; CHECK: brbs 1, .Ltmp2-18 ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp2-18, kind: fixup_7_pcrel -; CHECK: brbs 1, baz ; encoding: [0bAAAAA001,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: baz, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: breq .-18 -; INST: breq .-12 -; INST: breq .-18 -; INST: breq .+0 - - ; BRNE - brne .+10 - brne .+2 - brbc 1, .+10 - brbc 1, bar - -; CHECK: brne .Ltmp3+10 ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp3+10, kind: fixup_7_pcrel -; CHECK: brne .Ltmp4+2 ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp4+2, kind: fixup_7_pcrel -; CHECK: brbc 1, .Ltmp5+10 ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp5+10, kind: fixup_7_pcrel -; CHECK: brbc 1, bar ; encoding: [0bAAAAA001,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel - -; INST: brne .+10 -; INST: brne .+2 -; INST: brne .+10 -; INST: brne .+0 - -bar: - ; BRCS - brcs .+8 - brcs .+4 - brbs 0, .+8 - brbs 0, end - -; CHECK: brcs .Ltmp6+8 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp6+8, kind: fixup_7_pcrel -; CHECK: brcs .Ltmp7+4 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp7+4, kind: fixup_7_pcrel -; CHECK: brcs .Ltmp8+8 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp8+8, kind: fixup_7_pcrel -; CHECK: brcs end ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brlo .+8 -; INST: brlo .+4 -; INST: brlo .+8 -; INST: brlo .+0 - - ; BRCC - brcc .+66 - brcc .-22 - brbc 0, .+66 - brbc 0, baz - -; CHECK: brcc .Ltmp9+66 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp9+66, kind: fixup_7_pcrel -; CHECK: brcc .Ltmp10-22 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp10-22, kind: fixup_7_pcrel -; CHECK: brcc .Ltmp11+66 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp11+66, kind: fixup_7_pcrel -; CHECK: brcc baz ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: baz, kind: fixup_7_pcrel - -; INST: brsh .+66 -; INST: brsh .-22 -; INST: brsh .+66 -; INST: brsh .+0 - -; BRSH - brsh .+32 - brsh .+70 - brsh car - -; CHECK: brsh .Ltmp12+32 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp12+32, kind: fixup_7_pcrel -; CHECK: brsh .Ltmp13+70 ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp13+70, kind: fixup_7_pcrel -; CHECK: brsh car ; encoding: [0bAAAAA000,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brsh .+32 -; INST: brsh .+70 -; INST: brsh .+0 - -baz: - - ; BRLO - brlo .+12 - brlo .+28 - brlo car - -; CHECK: brlo .Ltmp14+12 ; encoding: [0bAAAAA000,0b111100AA] -; 
CHECK: ; fixup A - offset: 0, value: .Ltmp14+12, kind: fixup_7_pcrel -; CHECK: brlo .Ltmp15+28 ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp15+28, kind: fixup_7_pcrel -; CHECK: brlo car ; encoding: [0bAAAAA000,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brlo .+12 -; INST: brlo .+28 -; INST: brlo .+0 - - ; BRMI - brmi .+66 - brmi .+58 - brmi car - -; CHECK: brmi .Ltmp16+66 ; encoding: [0bAAAAA010,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp16+66, kind: fixup_7_pcrel -; CHECK: brmi .Ltmp17+58 ; encoding: [0bAAAAA010,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp17+58, kind: fixup_7_pcrel -; CHECK: brmi car ; encoding: [0bAAAAA010,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brmi .+66 -; INST: brmi .+58 -; INST: brmi .+0 - - ; BRPL - brpl .-12 - brpl .+18 - brpl car - -; CHECK: brpl .Ltmp18-12 ; encoding: [0bAAAAA010,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp18-12, kind: fixup_7_pcrel -; CHECK: brpl .Ltmp19+18 ; encoding: [0bAAAAA010,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp19+18, kind: fixup_7_pcrel -; CHECK: brpl car ; encoding: [0bAAAAA010,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brpl .-12 -; INST: brpl .+18 -; INST: brpl .+0 - -; BRGE - brge .+50 - brge .+42 - brge car - -; CHECK: brge .Ltmp20+50 ; encoding: [0bAAAAA100,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp20+50, kind: fixup_7_pcrel -; CHECK: brge .Ltmp21+42 ; encoding: [0bAAAAA100,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp21+42, kind: fixup_7_pcrel -; CHECK: brge car ; encoding: [0bAAAAA100,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: car, kind: fixup_7_pcrel - -; INST: brge .+50 -; INST: brge .+42 -; INST: brge .+0 - -car: - ; BRLT - brlt .+16 - brlt .+2 - brlt end - -; CHECK: brlt .Ltmp22+16 ; encoding: [0bAAAAA100,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp22+16, kind: fixup_7_pcrel -; CHECK: brlt .Ltmp23+2 ; encoding: [0bAAAAA100,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp23+2, kind: fixup_7_pcrel -; CHECK: brlt end ; encoding: [0bAAAAA100,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brlt .+16 -; INST: brlt .+2 -; INST: brlt .+0 - - ; BRHS - brhs .-66 - brhs .+14 - brhs just_another_label - -; CHECK: brhs .Ltmp24-66 ; encoding: [0bAAAAA101,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp24-66, kind: fixup_7_pcrel -; CHECK: brhs .Ltmp25+14 ; encoding: [0bAAAAA101,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp25+14, kind: fixup_7_pcrel -; CHECK: brhs just_another_label ; encoding: [0bAAAAA101,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: just_another_label, kind: fixup_7_pcrel - -; INST: brhs .-66 -; INST: brhs .+14 -; INST: brhs .+0 - - ; BRHC - brhc .+12 - brhc .+14 - brhc just_another_label - -; CHECK: brhc .Ltmp26+12 ; encoding: [0bAAAAA101,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp26+12, kind: fixup_7_pcrel -; CHECK: brhc .Ltmp27+14 ; encoding: [0bAAAAA101,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp27+14, kind: fixup_7_pcrel -; CHECK: brhc just_another_label ; encoding: [0bAAAAA101,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: just_another_label, kind: fixup_7_pcrel - -; INST: brhc .+12 -; INST: brhc .+14 -; INST: brhc .+0 - - ; BRTS - brts .+18 - brts .+22 - brts just_another_label - -; CHECK: brts 
.Ltmp28+18 ; encoding: [0bAAAAA110,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp28+18, kind: fixup_7_pcrel -; CHECK: brts .Ltmp29+22 ; encoding: [0bAAAAA110,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp29+22, kind: fixup_7_pcrel -; CHECK: brts just_another_label ; encoding: [0bAAAAA110,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: just_another_label, kind: fixup_7_pcrel - -; INST: brts .+18 -; INST: brts .+22 -; INST: brts .+0 - -just_another_label: - ; BRTC - brtc .+52 - brtc .+50 - brtc end - -; CHECK: brtc .Ltmp30+52 ; encoding: [0bAAAAA110,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp30+52, kind: fixup_7_pcrel -; CHECK: brtc .Ltmp31+50 ; encoding: [0bAAAAA110,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp31+50, kind: fixup_7_pcrel -; CHECK: brtc end ; encoding: [0bAAAAA110,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST-LABEL: : -; INST: brtc .+52 -; INST: brtc .+50 -; INST: brtc .+0 - - ; BRVS - brvs .+18 - brvs .+32 - brvs end - -; CHECK: brvs .Ltmp32+18 ; encoding: [0bAAAAA011,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp32+18, kind: fixup_7_pcrel -; CHECK: brvs .Ltmp33+32 ; encoding: [0bAAAAA011,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp33+32, kind: fixup_7_pcrel -; CHECK: brvs end ; encoding: [0bAAAAA011,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brvs .+18 -; INST: brvs .+32 -; INST: brvs .+0 - - ; BRVC - brvc .-28 - brvc .-62 - brvc end - -; CHECK: brvc .Ltmp34-28 ; encoding: [0bAAAAA011,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp34-28, kind: fixup_7_pcrel -; CHECK: brvc .Ltmp35-62 ; encoding: [0bAAAAA011,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp35-62, kind: fixup_7_pcrel -; CHECK: brvc end ; encoding: [0bAAAAA011,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brvc .-28 -; INST: brvc .-62 -; INST: brvc .+0 - - ; BRIE - brie .+20 - brie .+40 - brie end - -; CHECK: brie .Ltmp36+20 ; encoding: [0bAAAAA111,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp36+20, kind: fixup_7_pcrel -; CHECK: brie .Ltmp37+40 ; encoding: [0bAAAAA111,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp37+40, kind: fixup_7_pcrel -; CHECK: brie end ; encoding: [0bAAAAA111,0b111100AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brie .+20 -; INST: brie .+40 -; INST: brie .+0 - - ; BRID - brid .+42 - brid .+62 - brid end - -; CHECK: brid .Ltmp38+42 ; encoding: [0bAAAAA111,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp38+42, kind: fixup_7_pcrel -; CHECK: brid .Ltmp39+62 ; encoding: [0bAAAAA111,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp39+62, kind: fixup_7_pcrel -; CHECK: brid end ; encoding: [0bAAAAA111,0b111101AA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_7_pcrel - -; INST: brid .+42 -; INST: brid .+62 -; INST: brid .+0 - -end: diff --git a/llvm/test/MC/AVR/inst-rcall.s b/llvm/test/MC/AVR/inst-rcall.s index 006013aa6ea946..a4ec32d05b1a43 100644 --- a/llvm/test/MC/AVR/inst-rcall.s +++ b/llvm/test/MC/AVR/inst-rcall.s @@ -1,27 +1,28 @@ ; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; ; RUN: llvm-mc -filetype=obj -triple avr < %s \ -; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s - +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s foo: - rcall .+0 rcall .-8 rcall .+12 rcall .+46 .short 0xdfea -; CHECK: rcall .Ltmp0+0 ; encoding: [A,0b1101AAAA] -; 
CHECK: ; fixup A - offset: 0, value: .Ltmp0+0, kind: fixup_13_pcrel -; CHECK: rcall .Ltmp1-8 ; encoding: [A,0b1101AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-8, kind: fixup_13_pcrel -; CHECK: rcall .Ltmp2+12 ; encoding: [A,0b1101AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp2+12, kind: fixup_13_pcrel -; CHECK: rcall .Ltmp3+46 ; encoding: [A,0b1101AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp3+46, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp0+0)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+0)+2, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp1-8)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-8)+2, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp2+12)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+12)+2, kind: fixup_13_pcrel +; CHECK: rcall (.Ltmp3+46)+2 ; encoding: [A,0b1101AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp3+46)+2, kind: fixup_13_pcrel -; INST: 00 d0 rcall .+0 -; INST: fc df rcall .-8 -; INST: 06 d0 rcall .+12 -; INST: 17 d0 rcall .+46 -; INST: ea df rcall .-44 +; INST-LABEL: : +; INST-NEXT: 00 d0 rcall .+0 +; INST-NEXT: fc df rcall .-8 +; INST-NEXT: 06 d0 rcall .+12 +; INST-NEXT: 17 d0 rcall .+46 +; INST-NEXT: ea df rcall .-44 diff --git a/llvm/test/MC/AVR/inst-rjmp.s b/llvm/test/MC/AVR/inst-rjmp.s index 3dbac39e055ddf..cc843a58b55d2c 100644 --- a/llvm/test/MC/AVR/inst-rjmp.s +++ b/llvm/test/MC/AVR/inst-rjmp.s @@ -1,49 +1,56 @@ ; RUN: llvm-mc -triple avr -show-encoding < %s | FileCheck %s +; ; RUN: llvm-mc -filetype=obj -triple avr < %s \ -; RUN: | llvm-objdump -d - | FileCheck --check-prefix=INST %s - +; RUN: | llvm-objdump -d - \ +; RUN: | FileCheck --check-prefix=INST %s foo: - rjmp .+2 rjmp .-2 rjmp foo rjmp .+8 rjmp end rjmp .+0 + end: rjmp .-4 rjmp .-6 + x: rjmp x .short 0xc00f -; CHECK: rjmp .Ltmp0+2 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp0+2, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp1-2 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp1-2, kind: fixup_13_pcrel -; CHECK: rjmp foo ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: foo, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp2+8 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp2+8, kind: fixup_13_pcrel -; CHECK: rjmp end ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: end, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp3+0 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp3+0, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp4-4 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp4-4, kind: fixup_13_pcrel -; CHECK: rjmp .Ltmp5-6 ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: .Ltmp5-6, kind: fixup_13_pcrel -; CHECK: rjmp x ; encoding: [A,0b1100AAAA] -; CHECK: ; fixup A - offset: 0, value: x, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp0+2)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+2)+2, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp1-2)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-2)+2, kind: fixup_13_pcrel +; CHECK: rjmp foo ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: foo, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp2+8)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+8)+2, kind: fixup_13_pcrel +; CHECK: rjmp end ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: end, kind: 
fixup_13_pcrel +; CHECK: rjmp (.Ltmp3+0)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp3+0)+2, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp4-4)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp4-4)+2, kind: fixup_13_pcrel +; CHECK: rjmp (.Ltmp5-6)+2 ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp5-6)+2, kind: fixup_13_pcrel +; CHECK: rjmp x ; encoding: [A,0b1100AAAA] +; CHECK-NEXT: ; fixup A - offset: 0, value: x, kind: fixup_13_pcrel -; INST: 01 c0 rjmp .+2 -; INST: ff cf rjmp .-2 -; INST: 00 c0 rjmp .+0 -; INST: 04 c0 rjmp .+8 -; INST: 00 c0 rjmp .+0 -; INST: 00 c0 rjmp .+0 -; INST: fe cf rjmp .-4 -; INST: fd cf rjmp .-6 -; INST: 00 c0 rjmp .+0 -; INST: 0f c0 rjmp .+30 +; INST-LABEL: : +; INST-NEXT: 01 c0 rjmp .+2 +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: fd cf rjmp .-6 +; INST-NEXT: 04 c0 rjmp .+8 +; INST-NEXT: 01 c0 rjmp .+2 +; INST-NEXT: 00 c0 rjmp .+0 +; INST-EMPTY: +; INST-LABEL: : +; INST-NEXT: fe cf rjmp .-4 +; INST-NEXT: fd cf rjmp .-6 +; INST-EMPTY: +; INST-LABEL: : +; INST-NEXT: ff cf rjmp .-2 +; INST-NEXT: 0f c0 rjmp .+30 diff --git a/llvm/test/MC/LoongArch/Macros/macros-li.s b/llvm/test/MC/LoongArch/Macros/macros-li.s index 994aa439effa1b..8ac82a766f6043 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-li.s +++ b/llvm/test/MC/LoongArch/Macros/macros-li.s @@ -45,8 +45,7 @@ li.d $a0, 0x7ffff00000800 li.d $a0, 0x8000000000fff # CHECK: ori $a0, $zero, 4095 -# CHECK-NEXT: lu32i.d $a0, -524288 -# CHECK-NEXT: lu52i.d $a0, $a0, 0 +# CHECK-NEXT: bstrins.d $a0, $a0, 51, 51 li.d $a0, 0x8000080000800 # CHECK: lu12i.w $a0, -524288 diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 45335b348b7e8f..48aec4bc52a0c5 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -854,85 +854,85 @@ main: # CHECK: f16x8.replace_lane 1 # encoding: [0xfd,0xa2,0x02,0x01] f16x8.replace_lane 1 - # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02] + # CHECK: f16x8.add # encoding: [0xfd,0xbd,0x02] f16x8.add - # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02] + # CHECK: f16x8.sub # encoding: [0xfd,0xbe,0x02] f16x8.sub - # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02] + # CHECK: f16x8.mul # encoding: [0xfd,0xbf,0x02] f16x8.mul - # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02] + # CHECK: f16x8.div # encoding: [0xfd,0xc0,0x02] f16x8.div - # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02] + # CHECK: f16x8.min # encoding: [0xfd,0xc1,0x02] f16x8.min - # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02] + # CHECK: f16x8.max # encoding: [0xfd,0xc2,0x02] f16x8.max - # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02] + # CHECK: f16x8.pmin # encoding: [0xfd,0xc3,0x02] f16x8.pmin - # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02] + # CHECK: f16x8.pmax # encoding: [0xfd,0xc4,0x02] f16x8.pmax - # CHECK: f16x8.eq # encoding: [0xfd,0xc0,0x02] + # CHECK: f16x8.eq # encoding: [0xfd,0xb7,0x02] f16x8.eq - # CHECK: f16x8.ne # encoding: [0xfd,0xc1,0x02] + # CHECK: f16x8.ne # encoding: [0xfd,0xb8,0x02] f16x8.ne - # CHECK: f16x8.lt # encoding: [0xfd,0xc2,0x02] + # CHECK: f16x8.lt # encoding: [0xfd,0xb9,0x02] f16x8.lt - # CHECK: f16x8.gt # encoding: [0xfd,0xc3,0x02] + # CHECK: f16x8.gt # encoding: [0xfd,0xba,0x02] f16x8.gt - # CHECK: f16x8.le # encoding: [0xfd,0xc4,0x02] + # CHECK: f16x8.le # encoding: [0xfd,0xbb,0x02] f16x8.le - # CHECK: f16x8.ge # encoding: [0xfd,0xc5,0x02] + # CHECK: f16x8.ge # encoding: [0xfd,0xbc,0x02] f16x8.ge - # CHECK: f16x8.abs 
# encoding: [0xfd,0xb1,0x02] + # CHECK: f16x8.abs # encoding: [0xfd,0xb0,0x02] f16x8.abs - # CHECK: f16x8.neg # encoding: [0xfd,0xb2,0x02] + # CHECK: f16x8.neg # encoding: [0xfd,0xb1,0x02] f16x8.neg - # CHECK: f16x8.sqrt # encoding: [0xfd,0xb3,0x02] + # CHECK: f16x8.sqrt # encoding: [0xfd,0xb2,0x02] f16x8.sqrt - # CHECK: f16x8.ceil # encoding: [0xfd,0xbc,0x02] + # CHECK: f16x8.ceil # encoding: [0xfd,0xb3,0x02] f16x8.ceil - # CHECK: f16x8.floor # encoding: [0xfd,0xbd,0x02] + # CHECK: f16x8.floor # encoding: [0xfd,0xb4,0x02] f16x8.floor - # CHECK: f16x8.trunc # encoding: [0xfd,0xbe,0x02] + # CHECK: f16x8.trunc # encoding: [0xfd,0xb5,0x02] f16x8.trunc - # CHECK: f16x8.nearest # encoding: [0xfd,0xbf,0x02] + # CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02] f16x8.nearest - # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xc6,0x02] + # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02] f16x8.relaxed_madd - # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xc7,0x02] + # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02] f16x8.relaxed_nmadd - # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc8,0x02] + # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02] i16x8.trunc_sat_f16x8_s - # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc9,0x02] + # CHECK: i16x8.trunc_sat_f16x8_u # encoding: [0xfd,0xc6,0x02] i16x8.trunc_sat_f16x8_u - # CHECK: f16x8.convert_i16x8_s # encoding: [0xfd,0xca,0x02] + # CHECK: f16x8.convert_i16x8_s # encoding: [0xfd,0xc7,0x02] f16x8.convert_i16x8_s - # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xcb,0x02] + # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xc8,0x02] f16x8.convert_i16x8_u end_function diff --git a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml index f3cfa90d1cf901..1a2f341f03ef71 100644 --- a/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml +++ b/llvm/test/ObjectYAML/DXContainer/DomainMaskVectors.yaml @@ -75,7 +75,8 @@ Parts: LowerBound: 0 UpperBound: 0 Kind: CBuffer - Flags: 0 + Flags: + UsedByAtomic64: true SigInputElements: - Name: AAA_HSFoo Indices: [ 0 ] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml index 8bae742b573919..1e00e604f3e248 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-amplification.yaml @@ -30,13 +30,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -77,13 +79,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml index 74eb2b86ad01b2..c8bfd9acf68efc 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml +++ 
b/llvm/test/ObjectYAML/DXContainer/PSVv2-compute.yaml @@ -29,13 +29,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -75,13 +77,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml index 38f81bd93d67cf..021fb1b5fffb1f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-domain.yaml @@ -33,13 +33,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -84,13 +86,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml index 99fdbbb7c9edaf..74e32efbe2c659 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-geometry.yaml @@ -34,13 +34,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -85,13 +87,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml index de8af95dbcbd89..79d92e2f0c5e6f 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-hull.yaml @@ -34,13 +34,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - 
Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -86,13 +88,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml index 78fc077348f42a..27bf148126005b 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-mesh.yaml @@ -36,13 +36,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -89,13 +91,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml index ebe1e51faff3f8..1a1a74d7f3121d 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-pixel.yaml @@ -31,13 +31,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,13 +81,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml index 2bca2f211136b2..6b0ba5eb3d19f0 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv2-vertex.yaml @@ -30,13 +30,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + 
Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -77,13 +79,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml index 9e31d40ec7c1b4..6f7d151b266c9d 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-amplification.yaml @@ -31,13 +31,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,13 +81,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml index 530a8597cb6498..2de3d435af1de9 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-compute.yaml @@ -30,13 +30,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -77,13 +79,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml index a71ab67633eb6f..91afb2f11fc7c4 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-domain.yaml @@ -34,13 +34,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: 
[] @@ -86,13 +88,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml index db530253c6a745..f661e81fe869b9 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-geometry.yaml @@ -35,13 +35,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -87,13 +89,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml index 3e3ba493e98450..4140c3180e32ca 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-hull.yaml @@ -35,13 +35,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -88,13 +90,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml index 57bbcecfa1796b..03ce5b583315d0 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-mesh.yaml @@ -37,13 +37,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -91,13 +93,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 
0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml index c94c234142a34b..2434567b2a6f5c 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-pixel.yaml @@ -32,13 +32,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: false SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -81,13 +83,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml index 697fa870c2257c..b43f6aa6b71d4a 100644 --- a/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml +++ b/llvm/test/ObjectYAML/DXContainer/PSVv3-vertex.yaml @@ -31,13 +31,15 @@ Parts: LowerBound: 3 UpperBound: 4 Kind: TextureCube - Flags: 0 + Flags: + UsedByAtomic64: false - Type: Invalid Space: 32768 LowerBound: 8388608 UpperBound: 2147483648 Kind: Invalid - Flags: 0 + Flags: + UsedByAtomic64: true SigInputElements: [] SigOutputElements: [] SigPatchOrPrimElements: [] @@ -79,13 +81,15 @@ Parts: # CHECK-NEXT: LowerBound: 3 # CHECK-NEXT: UpperBound: 4 # CHECK-NEXT: Kind: TextureCube -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: false # CHECK-NEXT: - Type: Invalid # CHECK-NEXT: Space: 32768 # CHECK-NEXT: LowerBound: 8388608 # CHECK-NEXT: UpperBound: 2147483648 # CHECK-NEXT: Kind: Invalid -# CHECK-NEXT: Flags: 0 +# CHECK-NEXT: Flags: +# CHECK-NEXT: UsedByAtomic64: true # CHECK-NEXT: SigInputElements: [] # CHECK-NEXT: SigOutputElements: [] # CHECK-NEXT: SigPatchOrPrimElements: [] diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll index 781ba636c3ab3c..2a6780b60211cf 100644 --- a/llvm/test/Transforms/Attributor/nofpclass.ll +++ b/llvm/test/Transforms/Attributor/nofpclass.ll @@ -2685,11 +2685,291 @@ define @scalable_splat_zero() { ; See https://github.com/llvm/llvm-project/issues/78507 define double @call_abs(double noundef %__x) { +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; TUNIT-LABEL: define noundef nofpclass(ninf nzero nsub nnorm) double @call_abs +; TUNIT-SAME: (double noundef [[__X:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[ABS:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) double @llvm.fabs.f64(double noundef [[__X]]) #[[ATTR22]] +; TUNIT-NEXT: ret double [[ABS]] +; +; CGSCC: Function 
Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CGSCC-LABEL: define noundef nofpclass(ninf nzero nsub nnorm) double @call_abs +; CGSCC-SAME: (double noundef [[__X:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[ABS:%.*]] = tail call noundef nofpclass(ninf nzero nsub nnorm) double @llvm.fabs.f64(double noundef [[__X]]) #[[ATTR19]] +; CGSCC-NEXT: ret double [[ABS]] +; entry: %abs = tail call double @llvm.fabs.f64(double %__x) ret double %abs } +define float @bitcast_to_float_sign_0(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @bitcast_to_float_sign_0 +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ARG]], 1 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[SHR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shr = lshr i32 %arg, 1 + %cast = bitcast i32 %shr to float + ret float %cast +} + +define float @bitcast_to_float_nnan(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) float @bitcast_to_float_nnan +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ARG]], 2 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[SHR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shr = lshr i32 %arg, 2 + %cast = bitcast i32 %shr to float + ret float %cast +} + +define float @bitcast_to_float_sign_1(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @bitcast_to_float_sign_1 +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], -2147483648 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %or = or i32 %arg, -2147483648 + %cast = bitcast i32 %or to float + ret float %cast +} + +define float @bitcast_to_float_nan(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(inf zero sub norm) float @bitcast_to_float_nan +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], 2139095041 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %or = or i32 %arg, 2139095041 + %cast = bitcast i32 %or to float + ret float %cast +} + +define float @bitcast_to_float_zero(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf sub norm) float @bitcast_to_float_zero +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ARG]], 31 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[SHL]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shl = shl i32 %arg, 31 + %cast = bitcast i32 %shl to float + ret float %cast +} + +define float @bitcast_to_float_nzero(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(zero) float @bitcast_to_float_nzero +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], 134217728 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %or = or i32 %arg, 134217728 + %cast = bitcast i32 %or to float + ret float %cast +} + +define float 
@bitcast_to_float_inf(i32 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan zero sub norm) float @bitcast_to_float_inf +; CHECK-SAME: (i32 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = shl i32 [[ARG]], 31 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], 2139095040 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i32 [[OR]] to float +; CHECK-NEXT: ret float [[CAST]] +; + %shr = shl i32 %arg, 31 + %or = or i32 %shr, 2139095040 + %cast = bitcast i32 %or to float + ret float %cast +} + +define double @bitcast_to_double_sign_0(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) double @bitcast_to_double_sign_0 +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[ARG]], 1 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[SHR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %shr = lshr i64 %arg, 1 + %cast = bitcast i64 %shr to double + ret double %cast +} + +define double @bitcast_to_double_nnan(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) double @bitcast_to_double_nnan +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[ARG]], 2 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[SHR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %shr = lshr i64 %arg, 2 + %cast = bitcast i64 %shr to double + ret double %cast +} + +define double @bitcast_to_double_sign_1(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) double @bitcast_to_double_sign_1 +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i64 [[ARG]], -9223372036854775808 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[OR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %or = or i64 %arg, -9223372036854775808 + %cast = bitcast i64 %or to double + ret double %cast +} + +define double @bitcast_to_double_nan(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(inf zero sub norm) double @bitcast_to_double_nan +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i64 [[ARG]], -4503599627370495 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[OR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %or = or i64 %arg, -4503599627370495 + %cast = bitcast i64 %or to double + ret double %cast +} + + +define double @bitcast_to_double_zero(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf sub norm) double @bitcast_to_double_zero +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[ARG]], 63 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[SHL]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %shl = shl i64 %arg, 63 + %cast = bitcast i64 %shl to double + ret double %cast +} + +define double @bitcast_to_double_nzero(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(zero) double @bitcast_to_double_nzero +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or i64 [[ARG]], 1152921504606846976 +; CHECK-NEXT: [[CAST:%.*]] 
= bitcast i64 [[OR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %or = or i64 %arg, 1152921504606846976 + %cast = bitcast i64 %or to double + ret double %cast +} + +define double @bitcast_to_double_inf(i64 %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan zero sub norm) double @bitcast_to_double_inf +; CHECK-SAME: (i64 [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = shl i64 [[ARG]], 63 +; CHECK-NEXT: [[OR:%.*]] = or i64 [[SHR]], 9218868437227405312 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i64 [[OR]] to double +; CHECK-NEXT: ret double [[CAST]] +; + %shr = shl i64 %arg, 63 + %or = or i64 %shr, 9218868437227405312 + %cast = bitcast i64 %or to double + ret double %cast +} + + +define <2 x float> @bitcast_to_float_vect_sign_0(<2 x i32> %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) <2 x float> @bitcast_to_float_vect_sign_0 +; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> [[ARG]], <i32 1, i32 1> +; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[SHR]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[CAST]] +; + %shr = lshr <2 x i32> %arg, <i32 1, i32 1> + %cast = bitcast <2 x i32> %shr to <2 x float> + ret <2 x float> %cast +} + +define <2 x float> @bitcast_to_float_vect_nnan(<2 x i32> %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(nan inf nzero nsub nnorm) <2 x float> @bitcast_to_float_vect_nnan +; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> [[ARG]], <i32 2, i32 2> +; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[SHR]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[CAST]] +; + %shr = lshr <2 x i32> %arg, <i32 2, i32 2> + %cast = bitcast <2 x i32> %shr to <2 x float> + ret <2 x float> %cast +} + +define <2 x float> @bitcast_to_float_vect_sign_1(<2 x i32> %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) <2 x float> @bitcast_to_float_vect_sign_1 +; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[ARG]], <i32 -2147483648, i32 -2147483648> +; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[CAST]] +; + %or = or <2 x i32> %arg, <i32 -2147483648, i32 -2147483648> + %cast = bitcast <2 x i32> %or to <2 x float> + ret <2 x float> %cast +} + +define <2 x float> @bitcast_to_float_vect_nan(<2 x i32> %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define nofpclass(inf zero sub norm) <2 x float> @bitcast_to_float_vect_nan +; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[ARG]], <i32 2139095041, i32 2139095041> +; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[CAST]] +; + %or = or <2 x i32> %arg, <i32 2139095041, i32 2139095041> + %cast = bitcast <2 x i32> %or to <2 x float> + ret <2 x float> %cast +} + +define <2 x float> @bitcast_to_float_vect_conservative_1(<2 x i32> %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define <2 x float> @bitcast_to_float_vect_conservative_1 +; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[ARG]], +; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float> +; CHECK-NEXT: ret <2 x
float> [[CAST]] +; + %or = or <2 x i32> %arg, + %cast = bitcast <2 x i32> %or to <2 x float> + ret <2 x float> %cast +} + +define <2 x float> @bitcast_to_float_vect_conservative_2(<2 x i32> %arg) { +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; CHECK-LABEL: define <2 x float> @bitcast_to_float_vect_conservative_2 +; CHECK-SAME: (<2 x i32> [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[ARG]], +; CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i32> [[OR]] to <2 x float> +; CHECK-NEXT: ret <2 x float> [[CAST]] +; + %or = or <2 x i32> %arg, + %cast = bitcast <2 x i32> %or to <2 x float> + ret <2 x float> %cast +} + declare i64 @_Z13get_global_idj(i32 noundef) attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll b/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll index bd05cffcaa8b7f..187278d1c9035a 100644 --- a/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll +++ b/llvm/test/Transforms/Inline/X86/inline-target-cpu-i686.ll @@ -1,12 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=i686-unknown-unknown -S -passes=inline | FileCheck %s define i32 @func_target_cpu_nocona() #0 { +; CHECK-LABEL: @func_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; ret i32 0 } -; CHECK-LABEL: @target_cpu_prescott_call_target_cpu_nocona( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_prescott_call_target_cpu_nocona() #1 { +; CHECK-LABEL: @target_cpu_prescott_call_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_nocona() ret i32 %call } diff --git a/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll b/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll index b0a145d54cf593..e6693a637d820d 100644 --- a/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll +++ b/llvm/test/Transforms/Inline/X86/inline-target-cpu-x86_64.ll @@ -1,37 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown-unknown -S -passes=inline | FileCheck %s define i32 @func_target_cpu_base() #0 { +; CHECK-LABEL: @func_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; ret i32 0 } -; CHECK-LABEL: @target_cpu_k8_call_target_cpu_base( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_k8_call_target_cpu_base() #1 { +; CHECK-LABEL: @target_cpu_k8_call_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_base() ret i32 %call } -; CHECK-LABEL: @target_cpu_target_nehalem_call_target_cpu_base( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_target_nehalem_call_target_cpu_base() #2 { +; CHECK-LABEL: @target_cpu_target_nehalem_call_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_base() ret i32 %call } -; CHECK-LABEL: @target_cpu_target_goldmont_call_target_cpu_base( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_target_goldmont_call_target_cpu_base() #3 { +; CHECK-LABEL: @target_cpu_target_goldmont_call_target_cpu_base( +; CHECK-NEXT: ret i32 0 +; %call = call i32 @func_target_cpu_base() ret i32 %call } define i32 @func_target_cpu_nocona() #4 { +; CHECK-LABEL: @func_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; ret i32 0 } -; CHECK-LABEL: @target_cpu_target_base_call_target_cpu_nocona( -; CHECK-NEXT: ret i32 0 define i32 @target_cpu_target_base_call_target_cpu_nocona() #0 { +; CHECK-LABEL: @target_cpu_target_base_call_target_cpu_nocona( +; CHECK-NEXT: ret i32 0 +; 
%call = call i32 @func_target_cpu_nocona() ret i32 %call } diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll index 80d8e1b16ed28b..3c44da84813fdb 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll @@ -1814,1366 +1814,6 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> ret double %13 } -declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) - -define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_256( -; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>) - ret <8 x i32> %1 -} - -define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - ret <8 x i32> %1 -} - -define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - ret <8 x i32> %1 -} - -define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] =
shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - ret <8 x i32> %1 -} - -define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) - -define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256( -; CHECK-NEXT: ret <8 x float> [[A0:%.*]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>) - ret <8 x float> %1 -} - -define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP2]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - ret <8 x float> %1 -} - -define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - ret <8 x float> %1 -} - -define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT:
[[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - ret <8 x float> %1 -} - -define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) - -define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_256( -; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>) - ret <4 x i64> %1 -} - -define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP2]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - ret <4 x i64> %1 -} - -define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8
x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) - -define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_256( -; CHECK-NEXT: ret <4 x double> [[A0:%.*]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP2]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> 
%passthru - ret <4 x double> %3 -} - -define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - ret <4 x double> %1 -} - -define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> 
@llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) - -define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_512( -; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - ret <16 x i32> %1 -} - -define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; 
CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) - -define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512( -; CHECK-NEXT: ret <16 x float> [[A0:%.*]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - ret <16 x float> %1 -} - -define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 
x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) - -define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_512( -; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - ret <8 x i64> %1 -} - -define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> 
@shuffle_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) - -define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_512( -; CHECK-NEXT: ret <8 x double> [[A0:%.*]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - ret <8 x double> %1 -} - -define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) - -define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128( -; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP2]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> 
%passthru - ret <8 x i16> %3 -} - -define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - ret <8 x i16> %1 -} - -define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) - -define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_256( -; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, 
i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP2]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - ret <16 x i16> %1 -} - -define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = 
bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) - -define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512( -; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP2]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - ret <32 x i16> %1 -} - -define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} 
- -define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) - -define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128( -; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP2]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - ret <16 x i8> %1 -} - -define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, 
<16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) - -define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_256( -; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP2]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - ret <32 x i8> %1 -} - -define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> 
%a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) - -define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512( -; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP2]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - ret <64 x i8> %1 -} - -define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x 
i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll index 906e84b6074811..d89cf6b0bb9868 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll @@ -1814,1366 +1814,6 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> ret double %13 } -declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) - -define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_256( -; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> 
@zero_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - ret <8 x i32> %1 -} - -define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[TMP1]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - ret <8 x i32> %1 -} - -define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i32> [[TMP3]] -; - %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru - ret <8 x i32> %3 -} - -declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) - -define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256( -; CHECK-NEXT: ret <8 x float> [[A0:%.*]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x 
i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP2]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - ret <8 x float> %1 -} - -define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP1]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - ret <8 x float> %1 -} - -define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x float> [[TMP3]] -; - %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru - ret <8 x float> %3 -} - -declare <4 x i64> 
@llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) - -define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_256( -; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP2]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - ret <4 x i64> %1 -} - -define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { -; -; CHECK-LABEL: 
@undef_test_permvar_di_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[TMP1]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - ret <4 x i64> %1 -} - -define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x i64> [[TMP3]] -; - %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru - ret <4 x i64> %3 -} - -declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) - -define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_256( -; CHECK-NEXT: ret <4 x double> [[A0:%.*]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP2]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - ret <4 x double> %1 -} - -define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> 
@shuffle_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[TMP1]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - ret <4 x double> %1 -} - -define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <4 x double> [[TMP3]] -; - %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> - %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru - ret <4 x double> %3 -} - -declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) - -define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_si_512( -; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> 
[[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - ret <16 x i32> %1 -} - -define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i32> [[TMP1]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - ret <16 x i32> %1 -} - -define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_si_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru - ret <16 x i32> %3 -} - -declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) - -define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512( -; CHECK-NEXT: ret <16 x float> [[A0:%.*]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_sf_512_mask( -; 
CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - ret <16 x float> %1 -} - -define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: ret <16 x float> [[TMP1]] -; - %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - ret <16 x float> %1 -} - -define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_sf_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %1 = call <16 x float> 
@llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) - -define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_di_512( -; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - ret <8 x i64> %1 -} - -define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i64> [[TMP1]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - ret <8 x i64> %1 -} - -define <8 x i64> 
@undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_di_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru - ret <8 x i64> %3 -} - -declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) - -define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_df_512( -; CHECK-NEXT: ret <8 x double> [[A0:%.*]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - ret <8 x double> %1 -} - -define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; 
CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: ret <8 x double> [[TMP1]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - ret <8 x double> %1 -} - -define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_df_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) - -define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128( -; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP2]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - ret <8 x i16> %1 -} - -define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = 
call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i16> [[TMP1]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - ret <8 x i16> %1 -} - -define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <8 x i16> [[TMP3]] -; - %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> ) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru - ret <8 x i16> %3 -} - -declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) - -define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_256( -; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP2]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - ret <16 x i16> %1 -} - -define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x 
i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[TMP1]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - ret <16 x i16> %1 -} - -define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i16> [[TMP3]] -; - %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru - ret <16 x i16> %3 -} - -declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) - -define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512( -; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP2]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> 
@llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - ret <32 x i16> %1 -} - -define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i16> [[TMP1]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - ret <32 x i16> %1 -} - -define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_hi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i16> [[TMP3]] -; - %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru - ret <32 x i16> %3 -} - -declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) - -define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128( -; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> 
[[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP2]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - ret <16 x i8> %1 -} - -define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: ret <16 x i8> [[TMP1]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - ret <16 x i8> %1 -} - -define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_128_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <16 x i8> [[TMP3]] -; - %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> ) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru - ret <16 x i8> %3 -} - -declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) - -define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: 
@identity_test_permvar_qi_256( -; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP2]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - ret <32 x i8> %1 -} - -define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: ret <32 x i8> [[TMP1]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - ret <32 x i8> %1 -} - -define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_256_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <32 
x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <32 x i8> [[TMP3]] -; - %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> ) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru - ret <32 x i8> %3 -} - -declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) - -define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512( -; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @identity_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP2]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - ret <64 x i8> %1 -} - -define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @zero_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> [[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - -define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: ret <64 x i8> 
[[TMP1]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - ret <64 x i8> %1 -} - -define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { -; -; CHECK-LABEL: @undef_test_permvar_qi_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]] -; CHECK-NEXT: ret <64 x i8> [[TMP3]] -; - %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> ) - %2 = bitcast i64 %mask to <64 x i1> - %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru - ret <64 x i8> %3 -} - declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll new file mode 100644 index 00000000000000..6519e4f5348484 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/X86/x86-vperm.ll @@ -0,0 +1,1404 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s + +declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) + +define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { +; CHECK-LABEL: @identity_test_permvar_si_256( +; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: @identity_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { +; CHECK-LABEL: @zero_test_permvar_si_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) + ret <8 x i32> %1 +} + +define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: @zero_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { +; CHECK-LABEL: @shuffle_test_permvar_si_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x 
i32> %a0, <8 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { +; CHECK-LABEL: @undef_test_permvar_si_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + ret <8 x i32> %1 +} + +define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: @undef_test_permvar_si_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru + ret <8 x i32> %3 +} + +define <8 x i32> @demandedbit_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: @demandedbit_test_permvar_si_256_mask( +; CHECK-NEXT: [[M:%.*]] = or <8 x i32> [[A1:%.*]], +; CHECK-NEXT: [[S:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[M]]) +; CHECK-NEXT: ret <8 x i32> [[S]] +; + %m = or <8 x i32> %a1, + %s = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %m) + ret <8 x i32> %s +} + +declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) + +define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { +; CHECK-LABEL: @identity_test_permvar_sf_256( +; CHECK-NEXT: ret <8 x float> [[A0:%.*]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + ret <8 x float> %1 +} + +define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: @identity_test_permvar_sf_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] +; CHECK-NEXT: ret <8 x float> [[TMP2]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru + ret <8 x float> %3 +} + +define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { +; CHECK-LABEL: @zero_test_permvar_sf_256( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: ret <8 x float> [[TMP1]] +; + %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer) + ret <8 x float> %1 +} + +define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: 
@zero_test_permvar_sf_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru
+ ret <8 x float> %3
+}
+
+define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
+;
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> )
+ ret <8 x float> %1
+}
+
+define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru
+ ret <8 x float> %3
+}
+
+define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) {
+; CHECK-LABEL: @undef_test_permvar_sf_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x float> [[TMP1]]
+;
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> )
+ ret <8 x float> %1
+}
+
+define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_sf_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
+;
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru
+ ret <8 x float> %3
+}
+
+define <8 x float> @demandedbit_test_permvar_sf_256_mask(<8 x float> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_sf_256_mask(
+; CHECK-NEXT: [[M:%.*]] = or <8 x i32> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[M]])
+; CHECK-NEXT: ret <8 x float> [[S]]
+;
+ %m = or <8 x i32> %a1,
+ %s = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %m)
+ ret <8 x float> %s
+}
+
+declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
+
+define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @identity_test_permvar_di_256(
+; CHECK-NEXT: ret <4 x i64> [[A0:%.*]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> )
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_di_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP2]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @zero_test_permvar_di_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i64> [[TMP1]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer)
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_di_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP3]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_di_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32>
+; CHECK-NEXT: ret <4 x i64> [[TMP1]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> )
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_di_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP3]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @undef_test_permvar_di_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32>
+; CHECK-NEXT: ret <4 x i64> [[TMP1]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> )
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_di_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x i64> [[TMP3]]
+;
+ %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @demandedbits_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: @demandedbits_test_permvar_di_256_mask(
+; CHECK-NEXT: [[M:%.*]] = or <4 x i64> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[A0:%.*]], <4 x i64> [[M]])
+; CHECK-NEXT: ret <4 x i64> [[S]]
+;
+ %m = or <4 x i64> %a1,
+ %s = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %m)
+ ret <4 x i64> %s
+}
+
+declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
+
+define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @identity_test_permvar_df_256(
+; CHECK-NEXT: ret <4 x double> [[A0:%.*]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> )
+ ret <4 x double> %1
+}
+
+define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_df_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x double> [[TMP2]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+ ret <4 x double> %3
+}
+
+define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @zero_test_permvar_df_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x double> [[TMP1]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer)
+ ret <4 x double> %1
+}
+
+define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_df_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+ ret <4 x double> %3
+}
+
+define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_df_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32>
+; CHECK-NEXT: ret <4 x double> [[TMP1]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> )
+ ret <4 x double> %1
+}
+
+define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_df_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+ ret <4 x double> %3
+}
+
+define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @undef_test_permvar_df_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32>
+; CHECK-NEXT: ret <4 x double> [[TMP1]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> )
+ ret <4 x double> %1
+}
+
+define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_df_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+;
+ %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32>
+ %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+ ret <4 x double> %3
+}
+
+define <4 x double> @demandedbits_test_permvar_df_256_mask(<4 x double> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: @demandedbits_test_permvar_df_256_mask(
+; CHECK-NEXT: [[M:%.*]] = or <4 x i64> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[A0:%.*]], <4 x i64> [[M]])
+; CHECK-NEXT: ret <4 x double> [[S]]
+;
+ %m = or <4 x i64> %a1,
+ %s = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %m)
+ ret <4 x double> %s
+}
+
+declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
+
+define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @identity_test_permvar_si_512(
+; CHECK-NEXT: ret <16 x i32> [[A0:%.*]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> )
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_si_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP2]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+ ret <16 x i32> %3
+}
+
+define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @zero_test_permvar_si_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: ret <16 x i32> [[TMP1]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer)
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_si_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+ ret <16 x i32> %3
+}
+
+define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_si_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[TMP1]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> )
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_si_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+ ret <16 x i32> %3
+}
+
+define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @undef_test_permvar_si_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[TMP1]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> )
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_si_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+ ret <16 x i32> %3
+}
+
+define <16 x i32> @demandedbit_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_si_512_mask(
+; CHECK-NEXT: [[M:%.*]] = or <16 x i32> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[A0:%.*]], <16 x i32> [[M]])
+; CHECK-NEXT: ret <16 x i32> [[S]]
+;
+ %m = or <16 x i32> %a1,
+ %s = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> %m)
+ ret <16 x i32> %s
+}
+
+declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
+
+define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @identity_test_permvar_sf_512(
+; CHECK-NEXT: ret <16 x float> [[A0:%.*]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> )
+ ret <16 x float> %1
+}
+
+define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_sf_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @zero_test_permvar_sf_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: ret <16 x float> [[TMP1]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer)
+ ret <16 x float> %1
+}
+
+define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_sf_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x float> [[TMP1]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> )
+ ret <16 x float> %1
+}
+
+define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @undef_test_permvar_sf_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x float> [[TMP1]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> )
+ ret <16 x float> %1
+}
+
+define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_sf_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+ ret <16 x float> %3
+}
+
+define <16 x float> @demandedbit_test_permvar_sf_512_mask(<16 x float> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_sf_512_mask(
+; CHECK-NEXT: [[M:%.*]] = or <16 x i32> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[A0:%.*]], <16 x i32> [[M]])
+; CHECK-NEXT: ret <16 x float> [[S]]
+;
+ %m = or <16 x i32> %a1,
+ %s = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %m)
+ ret <16 x float> %s
+}
+
+declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
+
+define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @identity_test_permvar_di_512(
+; CHECK-NEXT: ret <8 x i64> [[A0:%.*]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> )
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_di_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP2]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @zero_test_permvar_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: ret <8 x i64> [[TMP1]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer)
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_di_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x i64> [[TMP1]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> )
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_di_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @undef_test_permvar_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x i64> [[TMP1]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> )
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_di_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @demandedbit_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_di_512_mask(
+; CHECK-NEXT: [[M:%.*]] = or <8 x i64> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[A0:%.*]], <8 x i64> [[M]])
+; CHECK-NEXT: ret <8 x i64> [[S]]
+;
+ %m = or <8 x i64> %a1,
+ %s = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %m)
+ ret <8 x i64> %s
+}
+
+declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
+
+define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @identity_test_permvar_df_512(
+; CHECK-NEXT: ret <8 x double> [[A0:%.*]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> )
+ ret <8 x double> %1
+}
+
+define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_df_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+ ret <8 x double> %3
+}
+
+define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @zero_test_permvar_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: ret <8 x double> [[TMP1]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer)
+ ret <8 x double> %1
+}
+
+define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_df_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+ ret <8 x double> %3
+}
+
+define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x double> [[TMP1]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> )
+ ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_df_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+ ret <8 x double> %3
+}
+
+define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @undef_test_permvar_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x double> [[TMP1]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> )
+ ret <8 x double> %1
+}
+
+define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_df_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+ ret <8 x double> %3
+}
+
+define <8 x double> @demandedbit_test_permvar_df_512_mask(<8 x double> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_df_512_mask(
+; CHECK-NEXT: [[M:%.*]] = or <8 x i64> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[A0:%.*]], <8 x i64> [[M]])
+; CHECK-NEXT: ret <8 x double> [[S]]
+;
+ %m = or <8 x i64> %a1,
+ %s = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %m)
+ ret <8 x double> %s
+}
+
+declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @identity_test_permvar_hi_128(
+; CHECK-NEXT: ret <8 x i16> [[A0:%.*]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> )
+ ret <8 x i16> %1
+}
+
+define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_hi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i16> [[TMP2]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @zero_test_permvar_hi_128(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: ret <8 x i16> [[TMP1]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer)
+ ret <8 x i16> %1
+}
+
+define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_hi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer)
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_128(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x i16> [[TMP1]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> )
+ ret <8 x i16> %1
+}
+
+define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @undef_test_permvar_hi_128(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: ret <8 x i16> [[TMP1]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> )
+ ret <8 x i16> %1
+}
+
+define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_hi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> )
+ %2 = bitcast i8 %mask to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @demandedbit_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_hi_128_mask(
+; CHECK-NEXT: [[M:%.*]] = or <8 x i16> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> [[A0:%.*]], <8 x i16> [[M]])
+; CHECK-NEXT: ret <8 x i16> [[S]]
+;
+ %m = or <8 x i16> %a1,
+ %s = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> %m)
+ ret <8 x i16> %s
+}
+
+declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>)
+
+define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @identity_test_permvar_hi_256(
+; CHECK-NEXT: ret <16 x i16> [[A0:%.*]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> )
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_hi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i16> [[TMP2]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+ ret <16 x i16> %3
+}
+
+define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @zero_test_permvar_hi_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: ret <16 x i16> [[TMP1]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer)
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_hi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i16> [[TMP3]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+ ret <16 x i16> %3
+}
+
+define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x i16> [[TMP1]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> )
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i16> [[TMP3]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+ ret <16 x i16> %3
+}
+
+define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @undef_test_permvar_hi_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x i16> [[TMP1]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> )
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_hi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i16> [[TMP3]]
+;
+ %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+ ret <16 x i16> %3
+}
+
+define <16 x i16> @demandedbit_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_hi_256_mask(
+; CHECK-NEXT: [[M:%.*]] = or <16 x i16> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> [[A0:%.*]], <16 x i16> [[M]])
+; CHECK-NEXT: ret <16 x i16> [[S]]
+;
+ %m = or <16 x i16> %a1,
+ %s = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> %m)
+ ret <16 x i16> %s
+}
+
+declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>)
+
+define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @identity_test_permvar_hi_512(
+; CHECK-NEXT: ret <32 x i16> [[A0:%.*]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> )
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @identity_test_permvar_hi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i16> [[TMP2]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> )
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @zero_test_permvar_hi_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: ret <32 x i16> [[TMP1]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer)
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @zero_test_permvar_hi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i16> [[TMP3]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32>
+; CHECK-NEXT: ret <32 x i16> [[TMP1]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> )
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i16> [[TMP3]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> )
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @undef_test_permvar_hi_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32>
+; CHECK-NEXT: ret <32 x i16> [[TMP1]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> )
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @undef_test_permvar_hi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i16> [[TMP3]]
+;
+ %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> )
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+ ret <32 x i16> %3
+}
+
+define <32 x i16> @demandedbit_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_hi_512_mask(
+; CHECK-NEXT: [[M:%.*]] = or <32 x i16> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[A0:%.*]], <32 x i16> [[M]])
+; CHECK-NEXT: ret <32 x i16> [[S]]
+;
+ %m = or <32 x i16> %a1,
+ %s = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> %m)
+ ret <32 x i16> %s
+}
+
+declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @identity_test_permvar_qi_128(
+; CHECK-NEXT: ret <16 x i8> [[A0:%.*]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> )
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_qi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i8> [[TMP2]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+ ret <16 x i8> %3
+}
+
+define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @zero_test_permvar_qi_128(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: ret <16 x i8> [[TMP1]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_qi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i8> [[TMP3]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer)
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+ ret <16 x i8> %3
+}
+
+define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_128(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x i8> [[TMP1]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> )
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i8> [[TMP3]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+ ret <16 x i8> %3
+}
+
+define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @undef_test_permvar_qi_128(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: ret <16 x i8> [[TMP1]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> )
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_qi_128_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <16 x i8> [[TMP3]]
+;
+ %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> )
+ %2 = bitcast i16 %mask to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+ ret <16 x i8> %3
+}
+
+define <16 x i8> @demandedbit_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_qi_128_mask(
+; CHECK-NEXT: [[M:%.*]] = or <16 x i8> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> [[A0:%.*]], <16 x i8> [[M]])
+; CHECK-NEXT: ret <16 x i8> [[S]]
+;
+ %m = or <16 x i8> %a1,
+ %s = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> %m)
+ ret <16 x i8> %s
+}
+
+declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>)
+
+define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @identity_test_permvar_qi_256(
+; CHECK-NEXT: ret <32 x i8> [[A0:%.*]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> )
+ ret <32 x i8> %1
+}
+
+define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @identity_test_permvar_qi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i8> [[TMP2]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> )
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+ ret <32 x i8> %3
+}
+
+define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @zero_test_permvar_qi_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: ret <32 x i8> [[TMP1]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer)
+ ret <32 x i8> %1
+}
+
+define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @zero_test_permvar_qi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i8> [[TMP3]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer)
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+ ret <32 x i8> %3
+}
+
+define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32>
+; CHECK-NEXT: ret <32 x i8> [[TMP1]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> )
+ ret <32 x i8> %1
+}
+
+define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i8> [[TMP3]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> )
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+ ret <32 x i8> %3
+}
+
+define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @undef_test_permvar_qi_256(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32>
+; CHECK-NEXT: ret <32 x i8> [[TMP1]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> )
+ ret <32 x i8> %1
+}
+
+define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @undef_test_permvar_qi_256_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <32 x i8> [[TMP3]]
+;
+ %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> )
+ %2 = bitcast i32 %mask to <32 x i1>
+ %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+ ret <32 x i8> %3
+}
+
+define <32 x i8> @demandedbit_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_qi_256_mask(
+; CHECK-NEXT: [[M:%.*]] = or <32 x i8> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> [[A0:%.*]], <32 x i8> [[M]])
+; CHECK-NEXT: ret <32 x i8> [[S]]
+;
+ %m = or <32 x i8> %a1,
+ %s = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> %m)
+ ret <32 x i8> %s
+}
+
+declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>)
+
+define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @identity_test_permvar_qi_512(
+; CHECK-NEXT: ret <64 x i8> [[A0:%.*]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> )
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @identity_test_permvar_qi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <64 x i8> [[TMP2]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> )
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @zero_test_permvar_qi_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: ret <64 x i8> [[TMP1]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer)
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @zero_test_permvar_qi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <64 x i8> [[TMP3]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer)
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32>
+; CHECK-NEXT: ret <64 x i8> [[TMP1]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> )
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <64 x i8> [[TMP3]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> )
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @undef_test_permvar_qi_512(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32>
+; CHECK-NEXT: ret <64 x i8> [[TMP1]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> )
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @undef_test_permvar_qi_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT: ret <64 x i8> [[TMP3]]
+;
+ %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> )
+ %2 = bitcast i64 %mask to <64 x i1>
+ %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+ ret <64 x i8> %3
+}
+
+define <64 x i8> @demandedbit_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK-LABEL: @demandedbit_test_permvar_qi_512_mask(
+; CHECK-NEXT: [[M:%.*]] = or <64 x i8> [[A1:%.*]],
+; CHECK-NEXT: [[S:%.*]] = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> [[A0:%.*]], <64 x i8> [[M]])
+; CHECK-NEXT: ret <64 x i8> [[S]]
+;
+ %m = or <64 x i8> %a1,
+ %s = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> %m)
+ ret <64 x i8> %s
+}
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
index a65358e1033cc6..eb6ad4458d932e 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
@@ -25,6 +25,30 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) {
 ret <2 x i64> %r
 }
 
+define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+ %t = or <2 x i64> %m,
+ %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1)
+ ret <2 x i64> %r
+}
+
+define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <2 x i64> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT: ret <2 x i64> [[R]]
+;
+ %t = or <2 x i64> %m,
+ %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1)
+ ret <2 x i64> %r
+}
+
 define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) {
 ; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]]) {
@@ -45,6 +69,18 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) {
 ret <4 x i64> %r
 }
 
+define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) {
+; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <4 x i64> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
+; CHECK-NEXT: ret <4 x i64> [[R]]
+;
+ %t = or <4 x i64> %m,
+ %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %t, <4 x i64> %x1)
+ ret <4 x i64> %r
+}
+
 define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) {
 ; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64(
 ; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) {
@@ -65,6 +101,18 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) {
 ret <8 x i64> %r
 }
 
+define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) {
+; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(
+; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <8 x i64> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
+; CHECK-NEXT: ret <8 x i64> [[R]]
+;
+ %t = or <8 x i64> %m,
+ %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %t, <8 x i64> %x1)
+ ret <8 x i64> %r
+}
+
 ;
 ; vXi32
 ;
@@ -89,6 +137,18 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) {
 ret <4 x i32> %r
 }
 
+define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <4 x i32> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %t = or <4 x i32> %m,
+ %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %t, <4 x i32> %x1)
+ ret <4 x i32> %r
+}
+
 define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) {
 ; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]]) {
@@ -109,6 +169,18 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) {
 ret <8 x i32> %r
 }
 
+define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <8 x i32> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %t = or <8 x i32> %m,
+ %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %t, <8 x i32> %x1)
+ ret <8 x i32> %r
+}
+
 define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) {
 ; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32(
 ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) {
@@ -129,6 +201,18 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) {
 ret <16 x i32> %r
 }
 
+define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <16 x i32> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %t = or <16 x i32> %m,
+ %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %t, <16 x i32> %x1)
+ ret <16 x i32> %r
+}
+
 ;
 ; vXi16
 ;
@@ -153,6 +237,18 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) {
 ret <8 x i16> %r
 }
 
+define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(
+; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <8 x i16> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %t = or <8 x i16> %m,
+ %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %t, <8 x i16> %x1)
+ ret <8 x i16> %r
+}
+
 define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) {
 ; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16(
 ; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]]) {
@@ -173,6 +269,18 @@ define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) {
 ret <16 x i16> %r
 }
 
+define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %m) {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(
+; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]], <16 x i16> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <16 x i16> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
+; CHECK-NEXT: ret <16 x i16> [[R]]
+;
+ %t = or <16 x i16> %m,
+ %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %t, <16 x i16> %x1)
+ ret <16 x i16> %r
+}
+
 define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) {
 ; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16(
 ; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) {
@@ -193,6 +301,18 @@ define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) {
 ret <32 x i16> %r
 }
 
+define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %m) {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(
+; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i16> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <32 x i16> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
+; CHECK-NEXT: ret <32 x i16> [[R]]
+;
+ %t = or <32 x i16> %m,
+ %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %t, <32 x i16> %x1)
+ ret <32 x i16> %r
+}
+
 ;
 ; vXi8
 ;
@@ -217,6 +337,18 @@ define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) {
 ret <16 x i8> %r
 }
 
+define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %m) {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(
+; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <16 x i8> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
+; CHECK-NEXT: ret <16 x i8> [[R]]
+;
+ %t = or <16 x i8> %m,
+ %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %t, <16 x i8> %x1)
+ ret <16 x i8> %r
+}
+
 define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) {
 ; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8(
 ; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]]) {
@@ -237,6 +369,18 @@ define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) {
 ret <32 x i8> %r
 }
 
+define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %m) {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(
+; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <32 x i8> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
+; CHECK-NEXT: ret <32 x i8> [[R]]
+;
+ %t = or <32 x i8> %m,
+ %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %t, <32 x i8> %x1)
+ ret <32 x i8> %r
+}
+
 define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) {
 ; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8(
 ; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) {
@@ -256,3 +400,15 @@ define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) {
 %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> , <64 x i8> %x0)
 ret <64 x i8> %r
 }
+
+define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %m) {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(
+; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[M:%.*]]) {
+; CHECK-NEXT: [[T:%.*]] = or <64 x i8> [[M]],
+; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
+; CHECK-NEXT: ret <64 x i8> [[R]]
+;
+ %t = or <64 x i8> %m,
+ %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %t, <64 x i8> %x1)
+ ret <64 x i8> %r
+}
diff --git a/llvm/test/Transforms/LICM/hoist-add-sub.ll b/llvm/test/Transforms/LICM/hoist-add-sub.ll
index 5393cdb1d29c43..d9b868eda579f9 100644
--- a/llvm/test/Transforms/LICM/hoist-add-sub.ll
+++ b/llvm/test/Transforms/LICM/hoist-add-sub.ll
@@ -51,6 +51,55 @@ out_of_bounds:
 ret i32 -1
 }
 
+define i32 @test_01_unsigned(ptr %p, ptr %x_p, ptr %length_p) {
+; CHECK-LABEL: define i32 @test_01_unsigned
+; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG1:![0-9]+]]
+; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2:![0-9]+]]
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nuw i32 [[X]], 4
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ugt i32 [[IV]], [[INVARIANT_OP]]
+; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]]
+; CHECK: backedge:
+; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4
+; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ]
+; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]]
+; CHECK: out_of_bounds:
+; CHECK-NEXT: ret i32 -1
+;
+entry:
+ %x = load i32, ptr %x_p, !range !2
+ %length = load i32, ptr %length_p, !range !1
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+ %arith = sub nuw i32 %x, %iv
+ %x_check = icmp ult i32 %arith, 4
+ br i1 %x_check, label %out_of_bounds, label %backedge
+
+backedge:
+ %el.ptr = getelementptr i32, ptr %p, i32 %iv
+ store i32 1, ptr %el.ptr
+ %iv.next = add nuw nsw i32 %iv, 4
+ %loop_cond = icmp ult i32 %iv.next, %length
+ br i1 %loop_cond, label %loop, label %exit
+
+exit:
+ ret i32 %iv.next
+
+out_of_bounds:
+ ret i32 -1
+}
+
 ; TODO: x - iv < 4 ==> iv > x - 4
 define i32 @test_01a(ptr %p, ptr %x_p, ptr %length_p) {
 ; CHECK-LABEL: define i32 @test_01a
@@ -114,6 +163,68 @@ failed:
 ret i32 -2
 }
 
+define i32 @test_01a_unsigned(ptr %p, ptr %x_p, ptr %length_p) {
+; CHECK-LABEL: define i32 @test_01a_unsigned
+; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4
+; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4
+; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp uge i32 [[X]], 0
+; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp uge i32 [[LENGTH]], 0
+; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]]
+; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]]
+; CHECK: loop.preheader:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[ARITH:%.*]] = sub nuw i32 [[X]], [[IV]]
+; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4
+; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]]
+; CHECK: backedge:
+; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4
+; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]]
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ]
+; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]]
+; CHECK: out_of_bounds:
+; CHECK-NEXT: ret i32 -1
+; CHECK: failed:
+; CHECK-NEXT: ret i32 -2
+;
+entry:
+ %x = load i32, ptr %x_p
+ %length = load i32, ptr %length_p
+ %precond_1 = icmp uge i32 %x, 0
+ %precond_2 = icmp uge i32 %length, 0
+ %precond = and i1 %precond_1, %precond_2
+ br i1 %precond, label %loop, label %failed
+
+loop:
+ %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+ %arith = sub 
nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 + +failed: + ret i32 -2 +} + ; Range info is missing for x, cannot prove no-overflow. Should not hoist. define i32 @test_01_neg(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_01_neg @@ -164,6 +275,54 @@ out_of_bounds: ret i32 -1 } +define i32 @test_01_neg_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_01_neg_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[ARITH:%.*]] = sub nuw i32 [[X]], [[IV]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p, !range !0 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} ; x + iv < 4 ==> iv < 4 - x define i32 @test_02(ptr %p, ptr %x_p, ptr %length_p) { @@ -215,6 +374,55 @@ out_of_bounds: ret i32 -1 } +define i32 @test_02_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_02_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nuw i32 4, [[X]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[IV]], [[INVARIANT_OP]] +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], 
[[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p, !range !3 + %length = load i32, ptr %length_p, !range !1 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = add nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + ; TODO: x + iv < 4 ==> iv < 4 - x define i32 @test_02a(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_02a @@ -278,12 +486,74 @@ failed: ret i32 -2 } +define i32 @test_02a_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_02a_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4 +; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp uge i32 [[X]], 0 +; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp uge i32 [[LENGTH]], 0 +; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]] +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARITH:%.*]] = add nuw i32 [[X]], [[IV]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; CHECK: failed: +; CHECK-NEXT: ret i32 -2 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p + %precond_1 = icmp uge i32 %x, 0 + %precond_2 = icmp uge i32 %length, 0 + %precond = and i1 %precond_1, %precond_2 + br i1 %precond, label %loop, label %failed + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = add nuw i32 %x, %iv + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 + +failed: + ret i32 -2 +} + ; iv - x < 4 ==> iv < 4 + x define i32 @test_03(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_03 ; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, 
!range [[RNG1:![0-9]+]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG2]] ; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG0]] ; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nsw i32 [[X]], 4 ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -328,6 +598,55 @@ out_of_bounds: ret i32 -1 } +define i32 @test_03_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_03_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add nuw i32 [[X]], 4 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[IV]], [[INVARIANT_OP]] +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p, !range !1 + %length = load i32, ptr %length_p, !range !1 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %iv, %x + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + ; TODO: iv - x < 4 ==> iv < 4 + x define i32 @test_03a(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_03a @@ -391,6 +710,68 @@ failed: ret i32 -2 } +define i32 @test_03a_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_03a_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4 +; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp ult i32 [[X]], 2147483640 +; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp uge i32 [[LENGTH]], 0 +; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]] +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARITH:%.*]] = sub nuw i32 [[IV]], [[X]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp 
ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; CHECK: failed: +; CHECK-NEXT: ret i32 -2 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p + %precond_1 = icmp ult i32 %x, 2147483640 + %precond_2 = icmp uge i32 %length, 0 + %precond = and i1 %precond_1, %precond_2 + br i1 %precond, label %loop, label %failed + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = sub nuw i32 %iv, %x + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 + +failed: + ret i32 -2 +} + ; iv + x < 4 ==> iv < 4 - x define i32 @test_04(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_04 @@ -441,6 +822,55 @@ out_of_bounds: ret i32 -1 } +define i32 @test_04_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_04_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4, !range [[RNG3]] +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4, !range [[RNG2]] +; CHECK-NEXT: [[INVARIANT_OP:%.*]] = sub nuw i32 4, [[X]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[IV]], [[INVARIANT_OP]] +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; +entry: + %x = load i32, ptr %x_p, !range !3 + %length = load i32, ptr %length_p, !range !1 + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = add nuw i32 %iv, %x + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 +} + ; TODO: iv + x < 4 ==> iv < 4 - x define i32 @test_04a(ptr %p, ptr %x_p, ptr %length_p) { ; CHECK-LABEL: define i32 @test_04a @@ -504,5 +934,69 @@ failed: ret i32 -2 } +define i32 @test_04a_unsigned(ptr %p, ptr %x_p, ptr %length_p) { +; CHECK-LABEL: define i32 @test_04a_unsigned +; CHECK-SAME: (ptr [[P:%.*]], ptr [[X_P:%.*]], ptr [[LENGTH_P:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P]], align 4 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_P]], align 4 
+; CHECK-NEXT: [[PRECOND_1:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: [[PRECOND_2:%.*]] = icmp sge i32 [[LENGTH]], 0 +; CHECK-NEXT: [[PRECOND:%.*]] = and i1 [[PRECOND_1]], [[PRECOND_2]] +; CHECK-NEXT: br i1 [[PRECOND]], label [[LOOP_PREHEADER:%.*]], label [[FAILED:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARITH:%.*]] = add nuw i32 [[IV]], [[X]] +; CHECK-NEXT: [[X_CHECK:%.*]] = icmp ult i32 [[ARITH]], 4 +; CHECK-NEXT: br i1 [[X_CHECK]], label [[OUT_OF_BOUNDS:%.*]], label [[BACKEDGE]] +; CHECK: backedge: +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4 +; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ult i32 [[IV_NEXT]], [[LENGTH]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_NEXT_LCSSA]] +; CHECK: out_of_bounds: +; CHECK-NEXT: ret i32 -1 +; CHECK: failed: +; CHECK-NEXT: ret i32 -2 +; +entry: + %x = load i32, ptr %x_p + %length = load i32, ptr %length_p + %precond_1 = icmp sge i32 %x, 0 + %precond_2 = icmp sge i32 %length, 0 + %precond = and i1 %precond_1, %precond_2 + br i1 %precond, label %loop, label %failed + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %backedge] + %arith = add nuw i32 %iv, %x + %x_check = icmp ult i32 %arith, 4 + br i1 %x_check, label %out_of_bounds, label %backedge + +backedge: + %el.ptr = getelementptr i32, ptr %p, i32 %iv + store i32 1, ptr %el.ptr + %iv.next = add nuw nsw i32 %iv, 4 + %loop_cond = icmp ult i32 %iv.next, %length + br i1 %loop_cond, label %loop, label %exit + +exit: + ret i32 %iv.next + +out_of_bounds: + ret i32 -1 + +failed: + ret i32 -2 +} + !0 = !{i32 0, i32 2147483648} !1 = !{i32 0, i32 2147483640} +!2 = !{i32 256, i32 32768} +!3 = !{i32 0, i32 2} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index f467f3cf262d2f..93034f4dbe56ec 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -215,16 +215,14 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) ; TFCOMMON-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer ; TFCOMMON-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) -; TFCOMMON-NEXT: [[TMP9:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], zeroinitializer, [[TMP8]] -; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], zeroinitializer +; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) ; 
TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFCOMMON-NEXT: [[TMP12:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 -; TFCOMMON-NEXT: br i1 [[TMP13]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TFCOMMON-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 +; TFCOMMON-NEXT: br i1 [[TMP11]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFCOMMON: for.cond.cleanup: ; TFCOMMON-NEXT: ret void ; @@ -259,27 +257,23 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP12]], zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP13]]) ; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP14]]) -; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP18]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP19]], zeroinitializer, [[TMP15]] -; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP20]], zeroinitializer, [[TMP16]] -; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP23]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP21]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP24]], i32 8, [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[TMP15]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP14]], [[TMP16]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP17]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP20]], i32 8, [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX_NEXT]], [[TMP26]] +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = call i64 
@llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = add i64 [[INDEX_NEXT]], [[TMP22]] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP27]], i64 1025) -; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement [[TMP28]], i32 0 -; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP23]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement [[TMP24]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; TFA_INTERLEAVE: for.cond.cleanup: ; TFA_INTERLEAVE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index f922873210b052..66d001498e457b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -1216,7 +1216,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP10]] ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 @@ -1226,41 +1226,39 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP10]] ; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP15]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, [[TMP14]], poison) -; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer -; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP18]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD1]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: 
[[TMP20]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP19]]) +; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], [[WIDE_MASKED_LOAD1]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP17]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0 -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK-ORDERED-TF: scalar.ph: ; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP23]], 0.000000e+00 +; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP21]], 0.000000e+00 ; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-ORDERED-TF: if.then: ; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]] ; CHECK-ORDERED-TF: for.inc: -; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP24]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] +; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP22]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 
[[IV_NEXT]], [[N]] ; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 9641dd7d21fd2a..852a967e764819 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1521,10 +1521,10 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP33]] to i32 ; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[IV1:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index f6a6d021f03c9f..6fa1e7fbbac602 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -467,16 +467,15 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP18]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[WIDE_MASKED_GATHER]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) -; 
CHECK-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -485,14 +484,14 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK: for.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP20]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SRC]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP22]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] +; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP21]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll index aafe849b7042ab..e3af831f83c970 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll @@ -7,19 +7,19 @@ define dso_local double @test(ptr %Arr) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x double> @__sind2_P8(<2 x double> [[TMP3]]) -; CHECK-NEXT: [[TMP5]] = fadd fast <2 x double> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @__sind2_P8(<2 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = fadd fast <2 x double> [[VEC_PHI]], [[TMP3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: 
[[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP5]]) -; CHECK-NEXT: ret double [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP4]]) +; CHECK-NEXT: ret double [[TMP6]] ; entry: br label %for.cond diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 1db718a0e42f9f..3e2f290a497db1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -152,6 +152,77 @@ exit: ret void } +; Test case for https://github.com/llvm/llvm-project/issues/106641. +define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { +; CHECK-LABEL: define void @truncate_to_i1_used_by_branch( +; CHECK-SAME: i8 [[X:%.*]], ptr [[DST:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = trunc <2 x i8> [[BROADCAST_SPLAT]] to <2 x i1> +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT3]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IV]], i32 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 [[TMP1]], i32 2) +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i1> , [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP2]], <2 x i1> zeroinitializer +; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> zeroinitializer, <2 x ptr> [[BROADCAST_SPLAT4]], i32 1, <2 x i1> [[TMP3]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[F_039:%.*]] = phi i8 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ 
[[ADD:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = or i8 23, [[X]] +; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i8 [[TMP4]] to i1 +; CHECK-NEXT: br i1 [[EXTRACT_T]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: store i8 0, ptr [[DST]], align 1 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %f.039 = phi i8 [ 0, %entry ], [ %add, %loop.latch ] + %0 = or i8 23, %x + %extract.t = trunc i8 %0 to i1 + br i1 %extract.t, label %then, label %loop.latch + +then: + store i8 0, ptr %dst, align 1 + br label %loop.latch + +loop.latch: + %add = add i8 %f.039, 1 + %conv = sext i8 %f.039 to i32 + %cmp = icmp slt i32 %conv, 1 + br i1 %cmp, label %loop.header, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } + ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -159,4 +230,6 @@ exit: ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 04289d43f40e2f..c051e2f18380bd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -414,6 +414,7 @@ for.end: define void @acos_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @acos_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_acosf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_acosf(<16 x float> [[TMP4:%.*]]) @@ -487,7 +488,10 @@ for.end: define void @asin_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f64_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @llvm.asin.v2f64(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.asin.v4f64(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.asin.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -510,6 +514,7 @@ for.end: define void @asin_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @asin_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_asinf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]]) @@ -588,6 +593,7 @@ define void @atan_f64_intrinsic(ptr nocapture %varray) { ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> 
@amd_vrd2_atan(<2 x double> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.atan.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -610,6 +616,7 @@ for.end: define void @atan_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @atan_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_atanf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]]) @@ -683,6 +690,9 @@ for.end: define void @cosh_f64_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @cosh_f64_intrinsic( ; CHECK-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_cosh(<2 x double> [[TMP4:%.*]]) +; CHECK-VF4: [[TMP5:%.*]] = call <4 x double> @llvm.cosh.v4f64(<4 x double> [[TMP4:%.*]]) +; CHECK-VF8: [[TMP5:%.*]] = call <8 x double> @llvm.cosh.v8f64(<8 x double> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x double> @llvm.cosh.v16f64(<16 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -705,8 +715,10 @@ for.end: define void @cosh_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @cosh_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_coshf(<8 x float> [[TMP4:%.*]]) +; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @llvm.cosh.v16f32(<16 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -754,6 +766,7 @@ for.end: define void @tanh_f32_intrinsic(ptr nocapture %varray) { ; CHECK-LABEL: @tanh_f32_intrinsic( +; CHECK-VF2: [[TMP5:%.*]] = call <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP4:%.*]]) ; CHECK-VF4: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]]) ; CHECK-VF8: [[TMP5:%.*]] = call <8 x float> @amd_vrs8_tanhf(<8 x float> [[TMP4:%.*]]) ; CHECK-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanhf(<16 x float> [[TMP4:%.*]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll index 7752af558f7d61..7cf4070f76d76e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -84,20 +84,20 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; SSE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8 ; SSE-NEXT: [[TMP6:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], ; SSE-NEXT: [[TMP7:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD2]], -; SSE-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]] -; SSE-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]] -; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP6]], <2 x double> [[TMP10]], <2 x double> [[VEC_PHI]] -; SSE-NEXT: [[PREDPHI3]] = select <2 x i1> [[TMP7]], <2 x double> [[TMP11]], <2 x double> [[VEC_PHI1]] +; SSE-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; SSE-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]] +; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP6]], <2 x double> [[TMP8]], <2 x double> 
[[VEC_PHI]] +; SSE-NEXT: [[PREDPHI3]] = select <2 x i1> [[TMP7]], <2 x double> [[TMP9]], <2 x double> [[VEC_PHI1]] ; SSE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; SSE-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; SSE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SSE-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; SSE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE: middle.block: ; SSE-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI3]], [[PREDPHI]] -; SSE-NEXT: [[TMP13:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]) +; SSE-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]) ; SSE-NEXT: br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]] ; SSE: scalar.ph: ; SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; SSE-NEXT: br label [[LOOP:%.*]] ; SSE: loop: ; SSE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] @@ -117,7 +117,7 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 ; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]] ; SSE: done: -; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]] ; ; AVX-LABEL: @sumIfVector( @@ -151,26 +151,26 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; AVX-NEXT: [[TMP13:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD4]], ; AVX-NEXT: [[TMP14:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD5]], ; AVX-NEXT: [[TMP15:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD6]], -; AVX-NEXT: [[TMP20:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] -; AVX-NEXT: [[TMP21:%.*]] = fadd fast <4 x double> [[VEC_PHI1]], [[WIDE_LOAD4]] -; AVX-NEXT: [[TMP22:%.*]] = fadd fast <4 x double> [[VEC_PHI2]], [[WIDE_LOAD5]] -; AVX-NEXT: [[TMP23:%.*]] = fadd fast <4 x double> [[VEC_PHI3]], [[WIDE_LOAD6]] -; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP20]], <4 x double> [[VEC_PHI]] -; AVX-NEXT: [[PREDPHI7]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP21]], <4 x double> [[VEC_PHI1]] -; AVX-NEXT: [[PREDPHI8]] = select <4 x i1> [[TMP14]], <4 x double> [[TMP22]], <4 x double> [[VEC_PHI2]] -; AVX-NEXT: [[PREDPHI9]] = select <4 x i1> [[TMP15]], <4 x double> [[TMP23]], <4 x double> [[VEC_PHI3]] +; AVX-NEXT: [[TMP16:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; AVX-NEXT: [[TMP17:%.*]] = fadd fast <4 x double> [[VEC_PHI1]], [[WIDE_LOAD4]] +; AVX-NEXT: [[TMP18:%.*]] = fadd fast <4 x double> [[VEC_PHI2]], [[WIDE_LOAD5]] +; AVX-NEXT: [[TMP19:%.*]] = fadd fast <4 x double> [[VEC_PHI3]], [[WIDE_LOAD6]] +; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP12]], <4 x double> [[TMP16]], <4 x double> [[VEC_PHI]] +; AVX-NEXT: [[PREDPHI7]] = select <4 x i1> [[TMP13]], <4 x double> [[TMP17]], <4 x double> [[VEC_PHI1]] +; AVX-NEXT: [[PREDPHI8]] = select <4 x i1> 
[[TMP14]], <4 x double> [[TMP18]], <4 x double> [[VEC_PHI2]] +; AVX-NEXT: [[PREDPHI9]] = select <4 x i1> [[TMP15]], <4 x double> [[TMP19]], <4 x double> [[VEC_PHI3]] ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; AVX-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; AVX-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; AVX-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX: middle.block: ; AVX-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[PREDPHI7]], [[PREDPHI]] ; AVX-NEXT: [[BIN_RDX10:%.*]] = fadd fast <4 x double> [[PREDPHI8]], [[BIN_RDX]] ; AVX-NEXT: [[BIN_RDX11:%.*]] = fadd fast <4 x double> [[PREDPHI9]], [[BIN_RDX10]] -; AVX-NEXT: [[TMP25:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX11]]) +; AVX-NEXT: [[TMP21:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX11]]) ; AVX-NEXT: br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]] ; AVX: scalar.ph: ; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP25]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; AVX-NEXT: br label [[LOOP:%.*]] ; AVX: loop: ; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] @@ -190,7 +190,7 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 ; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX: done: -; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index d19ca172a8c0a8..8b0c99b353c8b7 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -33,18 +33,15 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META3]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> , <4 x i32> -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> , <4 x 
i32> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[PREDPHI]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP5]], align 4, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -54,16 +51,16 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP12]], [[TMP13]] ; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]] ; CHECK: if.then: -; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP15]], 19 +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP12]], 19 ; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]] ; CHECK: if.else: -; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP16]], 4 +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP13]], 4 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5 ; CHECK-NEXT: br label [[IF_END14]] ; CHECK: if.end14: @@ -112,3 +109,122 @@ for.end: ret i32 undef } +; As above but with multiple variables set per block. 
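+; Roughly, this corresponds to the following C source (a reconstructed
+; sketch of the scalar IR below, not taken verbatim from any source file;
+; A, B, n as in @foo above):
+;
+;   for (int i = 0; i < n; i++) {
+;     int x = 9, y = 18;
+;     if (A[i] > B[i]) {
+;       if (A[i] > 19) { x = 3; y = 7; }
+;       else { x = B[i] < 4 ? 4 : 5; y = B[i] < 4 ? 6 : 11; }
+;     }
+;     A[i] = x; B[i] = y;
+;   }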
+define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { +; CHECK-LABEL: @multi_variable_if_nest( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[N]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]], +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP13]], <4 x i32> [[PREDPHI4]] +; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP5]], align 4, !alias.scope [[META9]], !noalias [[META12]] +; CHECK-NEXT: store <4 x i32> [[PREDPHI5]], ptr [[TMP6]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, 
[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP16]], 19 +; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP17]], 4 +; CHECK-NEXT: [[X_ELSE:%.*]] = select i1 [[CMP10]], i32 4, i32 5 +; CHECK-NEXT: [[Y_ELSE:%.*]] = select i1 [[CMP10]], i32 6, i32 11 +; CHECK-NEXT: br label [[IF_END14]] +; CHECK: if.end14: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[X_ELSE]], [[IF_ELSE]] ] +; CHECK-NEXT: [[Y_0:%.*]] = phi i32 [ 18, [[FOR_BODY]] ], [ 7, [[IF_THEN]] ], [ [[Y_ELSE]], [[IF_ELSE]] ] +; CHECK-NEXT: store i32 [[X_0]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i32 [[Y_0]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[N]], [[LFTR_WIDEIV]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 undef +; +entry: + %cmp26 = icmp sgt i32 %n, 0 + br i1 %cmp26, label %for.body, label %for.end + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %if.end14 ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %cmp3 = icmp sgt i32 %0, %1 + br i1 %cmp3, label %if.then, label %if.end14 + +if.then: + %cmp6 = icmp sgt i32 %0, 19 + br i1 %cmp6, label %if.end14, label %if.else + +if.else: + %cmp10 = icmp slt i32 %1, 4 + %x.else = select i1 %cmp10, i32 4, i32 5 + %y.else = select i1 %cmp10, i32 6, i32 11 + br label %if.end14 + +if.end14: + %x.0 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %x.else, %if.else ] ; <------------- A PHI with 3 entries that we can still vectorize. + %y.0 = phi i32 [ 18, %for.body ], [ 7, %if.then ], [ %y.else, %if.else ] ; <------------- A PHI with 3 entries that we can still vectorize. 
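+ ; If-conversion flattens both three-entry PHIs above into select chains:
+ ; in the vector body x.0 becomes PREDPHI3 and y.0 becomes PREDPHI5, and
+ ; both chains reuse the same masks (TMP10 and TMP14), so blending the
+ ; second variable adds selects but no extra mask computation.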
+ store i32 %x.0, ptr %arrayidx, align 4 + store i32 %y.0, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret i32 undef +} diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index e9761a60fd6ebe..0d5871e24c5247 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -678,9 +678,8 @@ for.end: ; preds = %for.inc, %entry ; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], ; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float> -; CHECK-DAG: %[[C12:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C2]], <4 x i1> zeroinitializer -; CHECK: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer -; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]] +; CHECK-DAG: %[[C22:.*]] = select <4 x i1> %[[C11]], <4 x i1> %[[C21]], <4 x i1> zeroinitializer +; CHECK: %[[S1:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[ADD]], <4 x float> %[[SUB]] ; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]] define float @fcmp_fadd_fsub(ptr nocapture readonly %a, i32 %n) nounwind readonly { entry: diff --git a/llvm/test/Transforms/LoopVectorize/phi-cost.ll b/llvm/test/Transforms/LoopVectorize/phi-cost.ll index e571b624ed1940..8d407c969b5278 100644 --- a/llvm/test/Transforms/LoopVectorize/phi-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/phi-cost.ll @@ -49,8 +49,8 @@ for.end: ; CHECK: define void @phi_three_incoming_values( ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> , <2 x i32> -; CHECK: [[PREDPHI7:%.*]] = select <2 x i1> {{.*}}, <2 x i32> {{.*}}, <2 x i32> [[PREDPHI]] +; CHECK: [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> {{.*}}, <2 x i32> +; CHECK: [[PREDPHI7:%.*]] = select <2 x i1> {{.*}}, <2 x i32> [[PREDPHI]], <2 x i32> ; CHECK: store <2 x i32> [[PREDPHI7]], ptr {{.*}} ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll index c50bcf8ae88f5c..2e111332ef6c48 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll @@ -587,9 +587,7 @@ define i64 @nested_cond_and(ptr noalias nocapture readonly %a, ptr noalias nocap ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: ; CHECK-NEXT: [[TMP46:%.*]] = phi <4 x i64> [ [[TMP41]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP45]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP47:%.*]] = xor <4 x i1> [[TMP25]], -; CHECK-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP47]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP48]], <4 x i64> [[TMP24]], <4 x i64> [[TMP46]] +; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP26]], <4 x i64> [[TMP46]], <4 x i64> [[TMP24]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[PREDPHI_V]], <4 x i64> ; CHECK-NEXT: [[PREDPHI15]] = and <4 x i64> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll 
b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll index 7fd762c7b735a0..40383c7e551bcf 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -65,7 +65,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index 8ee12cc2241c35..6407583061e601 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -111,17 +111,16 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP6]], -; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i16> [[WIDE_LOAD]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[WIDE_LOAD]] ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> , <2 x i16> [[PREDPHI]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP11]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP10]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -304,17 +303,16 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP15]], -; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP17]], <2 x i1> zeroinitializer -; CHECK-NEXT: 
[[PREDPHI:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP2]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[TMP14]] ; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> , <2 x i16> [[PREDPHI]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0 -; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP20]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 +; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP19]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll index eae38295ba08cf..da827a5b674d8b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -611,6 +611,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_asin_4x(ptr %a) { +; CHECK-LABEL: @int_asin_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_asin_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.asin.f32(float 
%vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.asin.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.asin.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.asin.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @acosf(float) readonly nounwind willreturn define <4 x float> @acos_4x(ptr %a) { ; CHECK-LABEL: @acos_4x( @@ -652,6 +690,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_acos_4x(ptr %a) { +; CHECK-LABEL: @int_acos_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_acos_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.acos.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.acos.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.acos.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.acos.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @atanf(float) readonly nounwind willreturn define <4 x float> @atan_4x(ptr %a) { ; CHECK-LABEL: @atan_4x( @@ -693,6 +769,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_atan_4x(ptr %a) { +; CHECK-LABEL: @int_atan_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = 
call fast <4 x float> @vatanf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_atan_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.atan.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.atan.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.atan.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.atan.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @sinhf(float) readonly nounwind willreturn define <4 x float> @sinh_4x(ptr %a) { ; CHECK-LABEL: @sinh_4x( @@ -734,6 +848,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_sinh_4x(ptr %a) { +; CHECK-LABEL: @int_sinh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_sinh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> 
poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.sinh.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.sinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.sinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.sinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @coshf(float) readonly nounwind willreturn define <4 x float> @cosh_4x(ptr %a) { ; CHECK-LABEL: @cosh_4x( @@ -775,6 +927,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_cosh_4x(ptr %a) { +; CHECK-LABEL: @int_cosh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_cosh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.cosh.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.cosh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.cosh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.cosh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @tanhf(float) readonly nounwind willreturn define <4 x float> @tanh_4x(ptr %a) { ; 
CHECK-LABEL: @tanh_4x( @@ -816,6 +1006,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_tanh_4x(ptr %a) { +; CHECK-LABEL: @int_tanh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_tanh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.tanh.f32(float %vecext) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.tanh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.tanh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.tanh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @asinhf(float) readonly nounwind willreturn define <4 x float> @asinh_4x(ptr %a) { ; CHECK-LABEL: @asinh_4x( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll index 5e2dd305f05576..62b8c0ce3291a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -611,6 +611,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_asin_4x(ptr %a) { +; CHECK-LABEL: @int_asin_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_asin_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail 
call fast float @llvm.asin.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.asin.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.asin.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.asin.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.asin.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @acosf(float) readonly nounwind willreturn define <4 x float> @acos_4x(ptr %a) { ; CHECK-LABEL: @acos_4x( @@ -652,6 +690,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_acos_4x(ptr %a) { +; CHECK-LABEL: @int_acos_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_acos_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.acos.f32(float %vecext) + %vecins = insertelement 
<4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.acos.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.acos.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.acos.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @atanf(float) readonly nounwind willreturn define <4 x float> @atan_4x(ptr %a) { ; CHECK-LABEL: @atan_4x( @@ -693,6 +769,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_atan_4x(ptr %a) { +; CHECK-LABEL: @int_atan_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_atan_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.atan.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.atan.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.atan.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.atan.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @sinhf(float) readonly nounwind willreturn define <4 x float> @sinh_4x(ptr %a) { ; CHECK-LABEL: @sinh_4x( @@ -734,6 +848,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_sinh_4x(ptr %a) { +; CHECK-LABEL: @int_sinh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> 
[[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_sinh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.sinh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.sinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.sinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.sinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @coshf(float) readonly nounwind willreturn define <4 x float> @cosh_4x(ptr %a) { ; CHECK-LABEL: @cosh_4x( @@ -775,6 +927,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_cosh_4x(ptr %a) { +; CHECK-LABEL: @int_cosh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_cosh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: 
[[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.cosh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.cosh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.cosh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.cosh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @tanhf(float) readonly nounwind willreturn define <4 x float> @tanh_4x(ptr %a) { ; CHECK-LABEL: @tanh_4x( @@ -816,6 +1006,44 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +define <4 x float> @int_tanh_4x(ptr %a) { +; CHECK-LABEL: @int_tanh_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_tanh_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]]) +; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.tanh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.tanh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.tanh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.tanh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @asinhf(float) readonly nounwind willreturn define <4 x float> @asinh_4x(ptr %a) { ; CHECK-LABEL: @asinh_4x( diff --git 
a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll index 059e4c38b519bd..6fbd05aaedfe5b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll @@ -597,6 +597,690 @@ entry: ret <4 x float> %vecins.3 } +declare float @cosf(float) readonly nounwind willreturn + +; We can not vectorized cos cosce RISCV has no such instruction. +define <4 x float> @cos_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @cos_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @cos_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @cosf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @cosf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @cosf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, 
i32 3 + %4 = tail call fast float @cosf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.cos.f32(float) + +; We can not vectorized cos cosce RISCV has no such instruction. +define <4 x float> @int_cos_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_cos_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_cos_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.cos.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.cos.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.cos.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.cos.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> 
%vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @acosf(float) readonly nounwind willreturn + +; We can not vectorized acos cosce RISCV has no such instruction. +define <4 x float> @acos_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @acos_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @acos_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @acosf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @acosf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @acosf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @acosf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @acosf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @acosf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.acos.f32(float) + +; We can not vectorized acos cosce RISCV has no such instruction. 
+define <4 x float> @int_acos_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_acos_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_acos_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.acos.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.acos.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.acos.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.acos.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @tanf(float) readonly nounwind willreturn + +; We can not vectorized tan since RISCV has no such instruction.
+define <4 x float> @tan_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @tan_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @tan_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @tanf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @tanf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @tanf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @tanf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.tan.f32(float) + +; We can not vectorized tan since RISCV has no such instruction.
+define <4 x float> @int_tan_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_tan_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_tan_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tan.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.tan.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.tan.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.tan.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.tan.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @atanf(float) readonly nounwind willreturn + +; We can not vectorized atan since RISCV has no such instruction.
+define <4 x float> @atan_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @atan_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @atan_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @atanf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @atanf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @atanf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @atanf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @atanf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @atanf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.atan.f32(float) + +; We can not vectorized atan since RISCV has no such instruction.
+define <4 x float> @int_atan_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_atan_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_atan_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.atan.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.atan.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.atan.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.atan.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @sinhf(float) readonly nounwind willreturn + +; We can not vectorized sinh since RISCV has no such instruction. 
+define <4 x float> @sinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @sinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @sinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinhf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinhf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @sinhf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @sinhf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @sinhf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @sinhf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.sinh.f32(float) + +; We can not vectorized sinh since RISCV has no such instruction. 
+define <4 x float> @int_sinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_sinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_sinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.sinh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.sinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.sinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.sinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @asinhf(float) readonly nounwind willreturn + +; We can not vectorized asinh since RISCV has no such instruction. 
+define <4 x float> @asinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @asinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @asinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @asinhf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @asinhf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @asinhf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @asinhf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.asinh.f32(float) + +; We can not vectorized asinh since RISCV has no such instruction. 
+define <4 x float> @int_asinh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_asinh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_asinh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.asinh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.asinh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.asinh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.asinh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.asinh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + declare float @coshf(float) readonly nounwind willreturn ; We can not vectorized cosh since RISCV has no such instruction. 
@@ -711,6 +1395,234 @@ entry: ret <4 x float> %vecins.3 } +declare float @acoshf(float) readonly nounwind willreturn + +; We can not vectorized acosh since RISCV has no such instruction. +define <4 x float> @acosh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @acosh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @acosh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @acoshf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @acoshf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @acoshf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @acoshf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.acosh.f32(float) + +; We can not vectorized acosh since RISCV has no such 
instruction. +define <4 x float> @int_acosh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_acosh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_acosh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.acosh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.acosh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.acosh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.acosh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.acosh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @tanhf(float) readonly nounwind willreturn + +; We can not vectorized tanh since RISCV has no such instruction. 
+define <4 x float> @tanh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @tanh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @tanh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @tanhf(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @tanhf(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @tanhf(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @tanhf(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @tanhf(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @tanhf(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + +declare float @llvm.tanh.f32(float) + +; We can not vectorized tanh since RISCV has no such instruction. 
+define <4 x float> @int_tanh_4x(ptr %a) { +; CHECK-LABEL: define <4 x float> @int_tanh_4x +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] +; +; DEFAULT-LABEL: define <4 x float> @int_tanh_4x +; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 16 +; DEFAULT-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]]) +; DEFAULT-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]]) +; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %1 = tail call fast float @llvm.tanh.f32(float %vecext) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %2 = tail call fast float @llvm.tanh.f32(float %vecext.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %3 = tail call fast float @llvm.tanh.f32(float %vecext.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %4 = tail call fast float @llvm.tanh.f32(float %vecext.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} + declare float @atanhf(float) readonly nounwind willreturn ; We can not vectorized atanh since RISCV has no such instruction. 
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll index 54dc33dbc0d00b..c9a3158acdda34 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduction-whole-regs-loads.ll @@ -4,15 +4,11 @@ define i64 @test(ptr %p) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP2]], <4 x i64> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v2i64(<8 x i64> [[TMP3]], <2 x i64> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) -; CHECK-NEXT: ret i64 [[TMP6]] +; CHECK-NEXT: [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i64> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]]) +; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %arrayidx.1 = getelementptr inbounds i64, ptr %p, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index af27572cfeaef8..4352b3d0c80d32 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -12,19 +12,19 @@ define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p ; CHECK-LABEL: @dot4f64( ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 -; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 -; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP14]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x 
double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]] ; CHECK-NEXT: ret double [[DOT0123]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -55,19 +55,19 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt ; CHECK-LABEL: @dot4f32( ; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 -; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 -; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP14]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]] ; CHECK-NEXT: ret float [[DOT0123]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -96,11 +96,11 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { ; CHECK-LABEL: @dot4f64_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP5]]) -; CHECK-NEXT: ret double [[TMP6]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; 
CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP3]]) +; CHECK-NEXT: ret double [[TMP4]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -128,11 +128,11 @@ define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot4f32_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: ret float [[TMP6]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: ret float [[TMP4]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 @@ -169,13 +169,13 @@ define double @dot3f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p ; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT012]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -203,13 +203,13 @@ define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt ; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = 
load <2 x float>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret float [[DOT012]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -237,13 +237,13 @@ define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 ; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT012]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -271,13 +271,13 @@ define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 ; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 ; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]] ; CHECK-NEXT: ret float [[DOT012]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -304,12 +304,12 @@ define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 define double @dot2f64(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f64( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 -; 
CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT01]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -326,12 +326,12 @@ define double @dot2f64(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %p define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f32( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[DOT01]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 @@ -348,12 +348,12 @@ define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f64_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret double [[DOT01]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 @@ -370,12 +370,12 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1 define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { ; CHECK-LABEL: @dot2f32_fast( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], 
[[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[DOT01]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-bv-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-bv-vectorized.ll new file mode 100644 index 00000000000000..c44ef376f81fab --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-bv-vectorized.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(ptr %p) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr [[GEP1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %conv548.2.i.13 = zext i32 0 to i64 + %and551.2.i.13 = and i64 0, %conv548.2.i.13 + %conv548.3.i.13 = zext i32 0 to i64 + %and551.3.i.13 = and i64 0, %conv548.3.i.13 + %0 = trunc i64 %and551.2.i.13 to i32 + %conv54.2.i.14 = and i32 %0, 0 + %conv548.2.i.14 = zext i32 %conv54.2.i.14 to i64 + %and551.2.i.14 = and i64 %and551.2.i.13, %conv548.2.i.14 + %1 = trunc i64 %and551.3.i.13 to i32 + %conv54.3.i.14 = and i32 %1, 0 + %conv548.3.i.14 = zext i32 %conv54.3.i.14 to i64 + %and551.3.i.14 = and i64 %and551.3.i.13, %conv548.3.i.14 + %and551.2.i.15 = and i64 %and551.2.i.14, 0 + %and551.3.i.15 = and i64 %and551.3.i.14, 0 + %and551.2.i.16 = and i64 %and551.2.i.15, 0 + %and551.3.i.16 = and i64 %and551.3.i.15, 0 + %and551.2.i.17 = and i64 %and551.2.i.16, 0 + %and551.3.i.17 = and i64 %and551.3.i.16, 0 + %and551.2.i.18 = and i64 %and551.2.i.17, 0 + %and551.3.i.18 = and i64 %and551.3.i.17, 0 + %and551.2.i.19 = and i64 %and551.2.i.18, 0 + %and551.3.i.19 = and i64 %and551.3.i.18, 0 + %and551.2.i.20 = and i64 %and551.2.i.19, 0 + %and551.3.i.20 = and i64 %and551.3.i.19, 0 + %and551.2.i.21 = and i64 %and551.2.i.20, 0 + %and551.3.i.21 = and i64 %and551.3.i.20, 0 + %gep1 = getelementptr inbounds i8, ptr %p, i64 16 + %gep2 = getelementptr inbounds i8, ptr %p, i64 24 + store i64 %and551.2.i.21, ptr %gep1, align 16 + store i64 %and551.3.i.21, ptr %gep2, align 8 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 5b33c6e889363e..89bc44dc1d530a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x 
ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 09d6c77557efaa..c1b501015e81e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: 
[[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 83457cc4966f7c..ebd35448ba72f7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -11,18 +11,18 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x double> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x double> poison) ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) -; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0 -; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll index 19d0bc9b330657..20c5bda328c100 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll @@ -6,10 +6,10 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @rdx_feeds_single_insert( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <8 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP2]]) -; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP1]]) +; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll index 40dcc79f79ffce..09a5ace101e645 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll @@ -8,19 +8,23 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-5' +; YAML-NEXT: - Cost: '-7' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '7' +; YAML-NEXT: - TreeSize: '5' define void @test(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP1]], <4 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32 +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index cfbbe14186b501..eacfbda5447c7b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,13 +5,23 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -57,15 +67,25 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 
x i32> [[TMP12]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -111,12 +131,22 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; @@ -163,12 +193,22 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x 
i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 30f328293cdaa3..c114c5dee78e99 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -6,18 +6,20 @@ target triple = "x86_64-unknown-linux-gnu" define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> , <4 x float> poison) -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> 
[[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: store <8 x float> [[TMP15]], ptr addrspace(1) [[TMP3]], align 4 ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll index 02c7e4a03325ed..1f3c0fb9e297c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll @@ -54,10 +54,10 @@ define double @hr_or_mul() { ; CHECK-LABEL: @hr_or_mul( ; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> , [[SHUFFLE]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP3]], [[CVT0]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> , [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] ; CHECK-NEXT: ret double [[OP_RDX]] ; %cvt0 = uitofp i16 3 to double diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll new file mode 100644 index 00000000000000..56281424c7114a --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -slp-vectorize-hor=false < %s | FileCheck %s + +define void @func(i32 %0) { +; CHECK-LABEL: define void @func( +; CHECK-SAME: i32 [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> , <32 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16) +; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> 
zeroinitializer, i64 14) +; CHECK-NEXT: [[TMP17:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP16]], <2 x i32> zeroinitializer, i64 28) +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i32> [[TMP8]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i32> [[TMP18]] to <32 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = icmp slt <32 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <32 x i1> [[TMP20]], i32 31 +; CHECK-NEXT: [[TMP22:%.*]] = and i1 false, [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i1> [[TMP20]], i32 30 +; CHECK-NEXT: [[TMP24:%.*]] = and i1 false, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <32 x i1> [[TMP20]], i32 29 +; CHECK-NEXT: [[TMP26:%.*]] = and i1 false, [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP20]], i32 28 +; CHECK-NEXT: [[TMP28:%.*]] = and i1 false, [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <32 x i1> [[TMP20]], i32 27 +; CHECK-NEXT: [[TMP30:%.*]] = and i1 false, [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <32 x i1> [[TMP20]], i32 26 +; CHECK-NEXT: [[TMP32:%.*]] = and i1 false, [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <32 x i1> [[TMP20]], i32 25 +; CHECK-NEXT: [[TMP34:%.*]] = and i1 false, [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <32 x i1> [[TMP20]], i32 24 +; CHECK-NEXT: [[TMP36:%.*]] = and i1 false, [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i1> [[TMP20]], i32 23 +; CHECK-NEXT: [[TMP38:%.*]] = and i1 false, [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <32 x i1> [[TMP20]], i32 22 +; CHECK-NEXT: [[TMP40:%.*]] = and i1 false, [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <32 x i1> [[TMP20]], i32 21 +; CHECK-NEXT: [[TMP42:%.*]] = and i1 false, [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i1> [[TMP20]], i32 20 +; CHECK-NEXT: [[TMP44:%.*]] = and i1 false, [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <32 x i1> [[TMP20]], i32 19 +; CHECK-NEXT: [[TMP46:%.*]] = and i1 false, [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <32 x i1> [[TMP20]], i32 18 +; CHECK-NEXT: [[TMP48:%.*]] = and i1 false, [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <32 x i1> [[TMP20]], i32 17 +; CHECK-NEXT: [[TMP50:%.*]] = and i1 false, [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <32 x i1> [[TMP20]], i32 16 +; CHECK-NEXT: [[TMP52:%.*]] = and i1 false, [[TMP51]] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <32 x i1> [[TMP20]], i32 15 +; CHECK-NEXT: [[TMP54:%.*]] = and i1 false, [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <32 x i1> [[TMP20]], i32 14 +; CHECK-NEXT: [[TMP56:%.*]] = and i1 false, [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <32 x i1> [[TMP20]], i32 13 +; CHECK-NEXT: [[TMP58:%.*]] = and i1 false, [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <32 x i1> [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP60:%.*]] = and i1 false, [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <32 x i1> [[TMP20]], i32 11 +; CHECK-NEXT: [[TMP62:%.*]] = and i1 false, [[TMP61]] +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <32 x i1> [[TMP20]], i32 10 +; CHECK-NEXT: [[TMP64:%.*]] = and i1 false, [[TMP63]] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <32 x i1> [[TMP20]], i32 9 +; CHECK-NEXT: [[TMP66:%.*]] = and i1 false, [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i1> [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP68:%.*]] = and i1 false, [[TMP67]] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <32 x i1> [[TMP20]], i32 7 +; 
CHECK-NEXT: [[TMP70:%.*]] = and i1 false, [[TMP69]] +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <32 x i1> [[TMP20]], i32 6 +; CHECK-NEXT: [[TMP72:%.*]] = and i1 false, [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i1> [[TMP20]], i32 5 +; CHECK-NEXT: [[TMP74:%.*]] = and i1 false, [[TMP73]] +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <32 x i1> [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP76:%.*]] = and i1 false, [[TMP75]] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i32> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP78:%.*]] = sext i32 [[TMP77]] to i64 +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr float, ptr addrspace(1) null, i64 [[TMP78]] +; CHECK-NEXT: ret void +; + %2 = shl i32 %0, 0 + %3 = sext i32 %2 to i64 + %4 = shl i32 0, 0 + %5 = sext i32 %4 to i64 + %6 = or i32 0, 0 + %7 = or i32 0, 0 + %8 = zext i32 %6 to i64 + %9 = zext i32 %7 to i64 + %10 = zext i32 0 to i64 + %11 = zext i32 0 to i64 + %12 = zext i32 0 to i64 + %13 = zext i32 0 to i64 + %14 = zext i32 0 to i64 + %15 = zext i32 0 to i64 + %16 = zext i32 0 to i64 + %17 = zext i32 0 to i64 + %18 = zext i32 0 to i64 + %19 = zext i32 0 to i64 + %20 = zext i32 0 to i64 + %21 = zext i32 0 to i64 + %22 = zext i32 0 to i64 + %23 = zext i32 0 to i64 + %24 = zext i32 0 to i64 + %25 = zext i32 0 to i64 + %26 = zext i32 0 to i64 + %27 = or i64 %3, 0 + %28 = or i64 %3, %8 + %29 = or i64 %3, %9 + %30 = or i64 %3, %10 + %31 = or i64 %3, %11 + %32 = or i64 %3, %12 + %33 = or i64 %3, %13 + %34 = or i64 %3, %14 + %35 = or i64 %3, %15 + %36 = or i64 %3, %16 + %37 = or i64 %3, %17 + %38 = or i64 %3, %18 + %39 = or i64 %3, %19 + %40 = or i64 %3, %20 + %41 = or i64 %3, %21 + %42 = or i64 %3, %22 + %43 = or i64 %3, %23 + %44 = or i64 %3, %24 + %45 = or i64 %3, %25 + %46 = or i64 %3, 0 + %47 = or i64 %3, 0 + %48 = or i64 %3, 0 + %49 = or i64 %3, 0 + %50 = or i64 %3, 0 + %51 = or i64 %3, 0 + %52 = or i64 %3, 0 + %53 = or i64 %3, 0 + %54 = or i64 %3, 0 + %55 = or i64 %3, 0 + %56 = or i64 %3, 0 + %57 = or i64 %3, 0 + %58 = or i64 %3, 0 + %59 = icmp slt i64 %28, 0 + %60 = icmp slt i64 %29, 0 + %61 = icmp slt i64 %30, 0 + %62 = icmp slt i64 %31, 0 + %63 = icmp slt i64 %32, 0 + %64 = icmp slt i64 %33, 0 + %65 = icmp slt i64 %34, 0 + %66 = icmp slt i64 %35, 0 + %67 = icmp slt i64 %36, 0 + %68 = icmp slt i64 %37, 0 + %69 = icmp slt i64 %38, 0 + %70 = icmp slt i64 %39, 0 + %71 = icmp slt i64 %40, 0 + %72 = icmp slt i64 %41, 0 + %73 = icmp slt i64 %42, 0 + %74 = icmp slt i64 %43, 0 + %75 = icmp slt i64 %44, 0 + %76 = icmp slt i64 %45, 0 + %77 = icmp slt i64 %46, 0 + %78 = icmp slt i64 %47, 0 + %79 = icmp slt i64 %48, 0 + %80 = icmp slt i64 %49, 0 + %81 = icmp slt i64 %50, 0 + %82 = icmp slt i64 %51, 0 + %83 = icmp slt i64 %52, 0 + %84 = icmp slt i64 %53, 0 + %85 = icmp slt i64 %54, 0 + %86 = icmp slt i64 %55, 0 + %87 = icmp slt i64 %56, 0 + %88 = icmp slt i64 %57, 0 + %89 = icmp slt i64 %58, 0 + %90 = and i1 false, %59 + %91 = and i1 false, %60 + %92 = and i1 false, %61 + %93 = and i1 false, %62 + %94 = and i1 false, %63 + %95 = and i1 false, %64 + %96 = and i1 false, %65 + %97 = and i1 false, %66 + %98 = and i1 false, %67 + %99 = and i1 false, %68 + %100 = and i1 false, %69 + %101 = and i1 false, %70 + %102 = and i1 false, %71 + %103 = and i1 false, %72 + %104 = and i1 false, %73 + %105 = and i1 false, %74 + %106 = and i1 false, %75 + %107 = and i1 false, %76 + %108 = icmp eq i32 %2, 0 + %109 = and i1 false, %77 + %110 = and i1 false, %78 + %111 = and i1 false, %79 + %112 = and i1 false, %80 + %113 = and i1 false, %81 + %114 = and i1 false, %82 + %115 
= and i1 false, %83 + %116 = and i1 false, %84 + %117 = and i1 false, %85 + %118 = and i1 false, %86 + %119 = or i64 %5, %26 + %120 = getelementptr float, ptr addrspace(1) null, i64 %119 + %121 = icmp slt i64 %119, 0 + ret void +} diff --git a/llvm/test/Verifier/rtsan-attrs.ll b/llvm/test/Verifier/rtsan-attrs.ll deleted file mode 100644 index 42ab85163642b1..00000000000000 --- a/llvm/test/Verifier/rtsan-attrs.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s - -; CHECK: Attributes 'sanitize_realtime and nosanitize_realtime' are incompatible! -; CHECK-NEXT: ptr @sanitize_nosanitize -define void @sanitize_nosanitize() #0 { - ret void -} - -attributes #0 = { sanitize_realtime nosanitize_realtime } diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll index 708b5a006be60e..d269f92763853c 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll @@ -1,24 +1,30 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels: ; - A does a direct call to HelperA ; - B is storing @HelperA ; - C does a direct call to HelperA ; -; The helper functions will get externalized, so C/A will end up -; in the same partition. - -; P0 is empty. -; CHECK0: declare - -; CHECK1: define amdgpu_kernel void @B(ptr %dst) - -; CHECK2: define hidden void @HelperA() -; CHECK2: define amdgpu_kernel void @A() -; CHECK2: define amdgpu_kernel void @C() +; The helper functions will get externalized, which will force A and C into P0 as +; external functions cannot be duplicated. 
+ +; CHECK0: define hidden void @HelperA() +; CHECK0: define amdgpu_kernel void @A() +; CHECK0: declare amdgpu_kernel void @B(ptr) +; CHECK0: define amdgpu_kernel void @C() + +; CHECK1: declare hidden void @HelperA() +; CHECK1: declare amdgpu_kernel void @A() +; CHECK1: declare amdgpu_kernel void @B(ptr) +; CHECK1: declare amdgpu_kernel void @C() + +; CHECK2: declare hidden void @HelperA() +; CHECK2: declare amdgpu_kernel void @A() +; CHECK2: define amdgpu_kernel void @B(ptr %dst) +; CHECK2: declare amdgpu_kernel void @C() define internal void @HelperA() { ret void diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll index 81f6c8f0fbb3a6..731cf4b374c95b 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll @@ -1,4 +1,4 @@ -; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll new file mode 100644 index 00000000000000..6a07ed51ba1beb --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll @@ -0,0 +1,20 @@ +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel +; REQUIRES: asserts + +; SHA256 hashes of the kernel names. + +; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c +; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59 +; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55 + +define amdgpu_kernel void @MyCustomKernel0() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel1() { + ret void +} + +define amdgpu_kernel void @MyCustomKernel2() { + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll new file mode 100644 index 00000000000000..836b5c05d0653d --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll @@ -0,0 +1,36 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -debug 2>&1 | FileCheck %s --implicit-check-not="[root]" +; REQUIRES: asserts + +; func_3 is never directly called, so it needs to be considered +; as a root to handle this module correctly.
+ +; CHECK: [root] kernel_1 +; CHECK-NEXT: [dependency] func_1 +; CHECK-NEXT: [dependency] func_2 +; CHECK-NEXT: [root] func_3 +; CHECK-NEXT: [dependency] func_2 + +define amdgpu_kernel void @kernel_1() { +entry: + call void @func_1() + ret void +} + +define linkonce_odr hidden void @func_1() { +entry: + %call = call i32 @func_2() + ret void +} + +define linkonce_odr hidden i32 @func_2() #0 { +entry: + ret i32 0 +} + +define void @func_3() { +entry: + %call = call i32 @func_2() + ret void +} + +attributes #0 = { noinline optnone } diff --git a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll index 755676061b2557..10b6cdfef4055f 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll @@ -1,13 +1,16 @@ ; RUN: rm -rf %t0 %t1 ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: not llvm-dis -o - %t1 +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; Empty module without any defs should result in a single output module that is -; an exact copy of the input. +; Check that all declarations are put into each partition. ; CHECK0: declare void @A ; CHECK0: declare void @B +; CHECK1: declare void @A +; CHECK1: declare void @B + declare void @A() + declare void @B() diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll index d7e84abd5f968d..c2746d1398924c 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll @@ -1,6 +1,6 @@ ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s ; 3 kernels: ; - A calls nothing @@ -13,12 +13,16 @@ ; Additionally, @PerryThePlatypus gets externalized as ; the alias counts as taking its address. 
-; CHECK0: define amdgpu_kernel void @A +; CHECK0-NOT: define +; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus +; CHECK0: define hidden void @PerryThePlatypus() +; CHECK0: define amdgpu_kernel void @B +; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define -; CHECK1: @Perry = internal alias ptr (), ptr @PerryThePlatypus -; CHECK1: define hidden void @PerryThePlatypus() -; CHECK1: define amdgpu_kernel void @B -; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define @Perry = internal alias ptr(), ptr @PerryThePlatypus diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll index c7e13304dc6dec..4635264aefb39a 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll @@ -1,21 +1,27 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels with each their own dependencies should go into 3 ; distinct partitions. The most expensive kernel should be ; seen first and go into the last partition. +; CHECK0-NOT: define ; CHECK0: define amdgpu_kernel void @C ; CHECK0: define internal void @HelperC ; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: define amdgpu_kernel void @A ; CHECK1: define internal void @HelperA +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: define amdgpu_kernel void @B ; CHECK2: define internal void @HelperB +; CHECK2-NOT: define + define amdgpu_kernel void @A() { call void @HelperA() diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll index 332344a776e82e..435e97a5813400 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll @@ -1,20 +1,29 @@ ; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s +; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s -; CHECK0: define internal void @PrivateHelper1() -; CHECK0: define amdgpu_kernel void @D +; Both overridable helpers should go in P0.
-; CHECK1: define internal void @PrivateHelper0() -; CHECK1: define amdgpu_kernel void @C +; CHECK0-NOT: define +; CHECK0: define available_externally void @OverridableHelper0() +; CHECK0: define internal void @OverridableHelper1() +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define -; CHECK2: define internal void @OverridableHelper1() -; CHECK2: define amdgpu_kernel void @B +; CHECK1-NOT: define -; CHECK3: define available_externally void @OverridableHelper0() -; CHECK3: define amdgpu_kernel void @A +; CHECK2-NOT: define +; CHECK2: define internal void @PrivateHelper1() +; CHECK2: define amdgpu_kernel void @D +; CHECK2-NOT: define + +; CHECK3-NOT: define +; CHECK3: define internal void @PrivateHelper0() +; CHECK3: define amdgpu_kernel void @C +; CHECK3-NOT: define define available_externally void @OverridableHelper0() { ret void diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll index 5be945bda48bf4..2d870039112cbf 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll @@ -1,7 +1,7 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; We have 4 kernels: ; - Each kernel has an internal helper @@ -15,19 +15,25 @@ ; indirect call. HelperC/D should also end up in P0 as they ; are dependencies of HelperB. 
+; CHECK0-NOT: define +; CHECK0: define hidden void @HelperA +; CHECK0: define hidden void @HelperB +; CHECK0: define hidden void @CallCandidate +; CHECK0: define internal void @HelperC ; CHECK0: define internal void @HelperD -; CHECK0: define amdgpu_kernel void @D +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define -; CHECK1: define internal void @HelperC -; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define +; CHECK1: define internal void @HelperD +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define -; CHECK2: define hidden void @HelperA -; CHECK2: define hidden void @HelperB -; CHECK2: define hidden void @CallCandidate +; CHECK2-NOT: define ; CHECK2: define internal void @HelperC -; CHECK2: define internal void @HelperD -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define amdgpu_kernel void @B +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate] diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll index 9205a5d1930e52..dc2c5c3c07bee6 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll @@ -1,15 +1,21 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s - -; CHECK0: define amdgpu_kernel void @D - -; CHECK1: define amdgpu_kernel void @C - -; CHECK2: define void @ExternalHelper -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define amdgpu_kernel void @B +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s + +; CHECK0-NOT: define +; CHECK0: define void @ExternalHelper +; CHECK0: define amdgpu_kernel void @A +; CHECK0: define amdgpu_kernel void @B +; CHECK0-NOT: define + +; CHECK1-NOT: define +; CHECK1: define amdgpu_kernel void @D +; CHECK1-NOT: define + +; CHECK2-NOT: define +; CHECK2: define amdgpu_kernel void @C +; CHECK2-NOT: define define void @ExternalHelper() { ret void diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll index a184d92aea9b9f..0fc76934afc548 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll @@ -1,20 +1,26 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels use private/internal global variables. ; The GVs should be copied in each partition as needed. 
+; CHECK0-NOT: define ; CHECK0: @bar = internal constant ptr ; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: @foo = private constant ptr ; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: @foo = private constant ptr ; CHECK2: @bar = internal constant ptr ; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define @foo = private constant ptr poison @bar = internal constant ptr poison diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll index be84a0b5916f0d..7564662e7c7c0c 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll @@ -1,22 +1,28 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; 3 kernels use private/internal global variables. ; The GVs should be copied in each partition as needed. +; CHECK0-NOT: define ; CHECK0: @foo = hidden constant ptr poison ; CHECK0: @bar = hidden constant ptr poison ; CHECK0: define amdgpu_kernel void @C +; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: @foo = external hidden constant ptr{{$}} ; CHECK1: @bar = external hidden constant ptr{{$}} ; CHECK1: define amdgpu_kernel void @A +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: @foo = external hidden constant ptr{{$}} ; CHECK2: @bar = external hidden constant ptr{{$}} ; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define @foo = private constant ptr poison @bar = internal constant ptr poison diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll index 807fb2e5f33cea..459c5a7f1a2db3 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll @@ -1,12 +1,12 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5 -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=1.2 -amdgpu-module-splitting-large-function-merge-overlap=0.5 +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s -; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0 -; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 
--implicit-check-not=define %s -; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 --implicit-check-not=define %s +; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0 +; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s +; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s +; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s ; 2 kernels (A/B) are large and share all their dependencies. ; They should go in the same partition, the remaining kernel should @@ -15,12 +15,14 @@ ; Also check w/o large kernels processing to verify they are indeed handled ; differently. -; P0 is empty -; CHECK0: declare +; CHECK0-NOT: define +; CHECK1-NOT: define ; CHECK1: define internal void @HelperC() ; CHECK1: define amdgpu_kernel void @C +; CHECK1-NOT: define +; CHECK2-NOT: define ; CHECK2: define internal void @large2() ; CHECK2: define internal void @large1() ; CHECK2: define internal void @large0() @@ -28,9 +30,12 @@ ; CHECK2: define internal void @HelperB() ; CHECK2: define amdgpu_kernel void @A ; CHECK2: define amdgpu_kernel void @B +; CHECK2-NOT: define +; NOLARGEKERNELS-CHECK0-NOT: define ; NOLARGEKERNELS-CHECK0: define internal void @HelperC() ; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C +; NOLARGEKERNELS-CHECK0-NOT: define ; NOLARGEKERNELS-CHECK1: define internal void @large2() ; NOLARGEKERNELS-CHECK1: define internal void @large1() @@ -44,7 +49,6 @@ ; NOLARGEKERNELS-CHECK2: define internal void @HelperA() ; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A - define internal void @large2() { store volatile i32 42, ptr null call void @large2() diff --git a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll index 1314a78b42f3b0..167930ce0e8063 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll @@ -1,7 +1,7 @@ ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=DEFINE %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=DEFINE %s +; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=DEFINE %s ; We have 4 function: ; - Each function has an internal helper @@ -11,19 +11,19 @@ ; @CallCandidate doesn't have to be in A/B's partition, unlike ; in the corresponding tests for kernels where it has to. 
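;
; Illustration (drawn from the @addrthief table below, not new content): any
; helper whose address escapes, e.g.
;   @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate]
; may be reached through an indirect call from any function, so the splitter
; has to reason about it conservatively when assigning it to a partition.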
+; CHECK0: define hidden void @HelperA +; CHECK0: define hidden void @HelperB ; CHECK0: define internal void @HelperC ; CHECK0: define internal void @HelperD -; CHECK0: define internal void @C -; CHECK0: define internal void @D +; CHECK0: define void @A +; CHECK0: define void @B -; CHECK1: define hidden void @HelperA -; CHECK1: define hidden void @CallCandidate() -; CHECK1: define internal void @A +; CHECK1: define internal void @HelperD +; CHECK1: define void @D -; CHECK2: define hidden void @HelperB +; CHECK2: define hidden void @CallCandidate ; CHECK2: define internal void @HelperC -; CHECK2: define internal void @HelperD -; CHECK2: define internal void @B +; CHECK2: define void @C @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate] @@ -51,22 +51,22 @@ define internal void @HelperD() { ret void } -define internal void @A(ptr %call) { +define void @A(ptr %call) { call void @HelperA(ptr %call) ret void } -define internal void @B(ptr %call) { +define void @B(ptr %call) { call void @HelperB(ptr %call) ret void } -define internal void @C() { +define void @C() { call void @HelperC() ret void } -define internal void @D() { +define void @D() { call void @HelperD() ret void } diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll deleted file mode 100644 index 01f2f3627f9905..00000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2 -; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s - -; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2 -; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s - -; Test the specifics of the search algorithm. -; This test will change depending on new heuristics we add or remove. 
- -; -------------------------------------------- - -; SPLIT3-CHECK0: define internal void @HelperA() -; SPLIT3-CHECK0: define internal void @HelperB() -; SPLIT3-CHECK0: define internal void @HelperC() -; SPLIT3-CHECK0: define amdgpu_kernel void @AB() -; SPLIT3-CHECK0: define amdgpu_kernel void @BC() - -; SPLIT3-CHECK1: define amdgpu_kernel void @A() -; SPLIT3-CHECK1: define internal void @HelperA() -; SPLIT3-CHECK1: define amdgpu_kernel void @C() -; SPLIT3-CHECK1: define internal void @HelperC() - -; SPLIT3-CHECK2: define internal void @HelperA() -; SPLIT3-CHECK2: define amdgpu_kernel void @B() -; SPLIT3-CHECK2: define internal void @HelperB() -; SPLIT3-CHECK2: define internal void @HelperC() -; SPLIT3-CHECK2: define amdgpu_kernel void @ABC() - -; -------------------------------------------- - -; SPLIT5-CHECK0: define amdgpu_kernel void @A() -; SPLIT5-CHECK0: define internal void @HelperA() -; SPLIT5-CHECK0: define amdgpu_kernel void @B() -; SPLIT5-CHECK0: define internal void @HelperB() - -; SPLIT5-CHECK1: define internal void @HelperB() -; SPLIT5-CHECK1: define internal void @HelperC() -; SPLIT5-CHECK1: define amdgpu_kernel void @BC - -; SPLIT5-CHECK2: define internal void @HelperA() -; SPLIT5-CHECK2: define internal void @HelperB() -; SPLIT5-CHECK2: define amdgpu_kernel void @AB() - -; SPLIT5-CHECK3: define amdgpu_kernel void @C() -; SPLIT5-CHECK3: define internal void @HelperC() - -; SPLIT5-CHECK4: define internal void @HelperA() -; SPLIT5-CHECK4: define internal void @HelperB() -; SPLIT5-CHECK4: define internal void @HelperC() -; SPLIT5-CHECK4: define amdgpu_kernel void @ABC() - -define amdgpu_kernel void @A() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperA() - ret void -} - -define internal void @HelperA() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @B() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperB() - ret void -} - -define internal void @HelperB() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @C() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperC() - ret void -} - -define internal void @HelperC() { - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @AB() { - store volatile i32 42, ptr null - call void @HelperA() - call void @HelperB() - ret void -} - -define amdgpu_kernel void @BC() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - call void @HelperB() - call void @HelperC() - ret void -} - -define amdgpu_kernel void @ABC() { - call void @HelperA() - call void @HelperB() - call void @HelperC() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll deleted file mode 100644 index eae57a19883106..00000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa 
-amdgpu-module-splitting-max-depth=8 -; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s - -; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=8 -; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s - -; Test the specifics of the search algorithm. -; This test will change depending on new heuristics we add or remove. - -; -------------------------------------------- - -; SPLIT3-CHECK0: define internal void @HelperA() -; SPLIT3-CHECK0: define internal void @HelperB() -; SPLIT3-CHECK0: define internal void @HelperC() -; SPLIT3-CHECK0: define amdgpu_kernel void @AB() -; SPLIT3-CHECK0: define amdgpu_kernel void @BC() - -; SPLIT3-CHECK1: define amdgpu_kernel void @A() -; SPLIT3-CHECK1: define internal void @HelperA() -; SPLIT3-CHECK1: define amdgpu_kernel void @C() -; SPLIT3-CHECK1: define internal void @HelperC() - -; SPLIT3-CHECK2: define internal void @HelperA() -; SPLIT3-CHECK2: define amdgpu_kernel void @B() -; SPLIT3-CHECK2: define internal void @HelperB() -; SPLIT3-CHECK2: define internal void @HelperC() -; SPLIT3-CHECK2: define amdgpu_kernel void @ABC() - -; -------------------------------------------- - -; SPLIT5-CHECK0: define amdgpu_kernel void @A() -; SPLIT5-CHECK0: define internal void @HelperA() -; SPLIT5-CHECK0: define amdgpu_kernel void @B() -; SPLIT5-CHECK0: define internal void @HelperB() - -; SPLIT5-CHECK1: define internal void @HelperB() -; SPLIT5-CHECK1: define internal void @HelperC() -; SPLIT5-CHECK1: define amdgpu_kernel void @BC - -; SPLIT5-CHECK2: define internal void @HelperA() -; SPLIT5-CHECK2: define internal void @HelperB() -; SPLIT5-CHECK2: define amdgpu_kernel void @AB() - -; SPLIT5-CHECK3: define amdgpu_kernel void @C() -; SPLIT5-CHECK3: define internal void @HelperC() - -; SPLIT5-CHECK4: define internal void @HelperA() -; SPLIT5-CHECK4: define internal void @HelperB() -; SPLIT5-CHECK4: define internal void @HelperC() -; SPLIT5-CHECK4: define amdgpu_kernel void @ABC() - -define amdgpu_kernel void @A() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperA() - ret void -} - -define internal void @HelperA() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @B() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - call void @HelperB() - ret void -} - -define internal void @HelperB() { - store volatile i32 42, ptr null - store volatile i32 42, ptr null - store volatile i32 42, ptr null - ret void -} - -define amdgpu_kernel void @C() { - store volatile i64 42, ptr null - store volatile i64 42, ptr null - store volatile i64 42, ptr null - 
store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  call void @HelperC()
-  ret void
-}
-
-define internal void @HelperC() {
-  store volatile i32 42, ptr null
-  ret void
-}
-
-define amdgpu_kernel void @AB() {
-  store volatile i32 42, ptr null
-  call void @HelperA()
-  call void @HelperB()
-  ret void
-}
-
-define amdgpu_kernel void @BC() {
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  call void @HelperB()
-  call void @HelperC()
-  ret void
-}
-
-define amdgpu_kernel void @ABC() {
-  call void @HelperA()
-  call void @HelperB()
-  call void @HelperC()
-  ret void
-}
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
index 728f2ed3e62aca..364a7d63d486e1 100644
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -835,7 +835,7 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
   if (Crashed)
     (*Repro)->generate();
 
-  if (!AllOK)
+  if (!AllOK || Crashed)
     return EXIT_FAILURE;
 
   if (NeedsTempFiles) {
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index 5615a4493d20a1..5ce14d3f6b9cef 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -1569,14 +1569,12 @@ TEST(BasicBlockDbgInfoTest, CloneTrailingRecordsToEmptyBlock) {
   // The trailing records should've been absorbed into NewBB.
   EXPECT_FALSE(BB.getTrailingDbgRecords());
   EXPECT_TRUE(NewBB->getTrailingDbgRecords());
-  if (NewBB->getTrailingDbgRecords()) {
-    EXPECT_EQ(
-        llvm::range_size(NewBB->getTrailingDbgRecords()->getDbgRecordRange()),
-        1u);
+  if (DbgMarker *Trailing = NewBB->getTrailingDbgRecords()) {
+    EXPECT_EQ(llvm::range_size(Trailing->getDbgRecordRange()), 1u);
+    // Drop the trailing records now, to prevent a cleanup assertion.
+    Trailing->eraseFromParent();
+    NewBB->deleteTrailingDbgRecords();
   }
-
-  // Drop the trailing records now, to prevent a cleanup assertion.
-  NewBB->deleteTrailingDbgRecords();
 }
 
 } // End anonymous namespace.
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index c543846eb2686e..01fe21eb5cfa43 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -130,6 +130,161 @@ define void @foo(i32 %v0) {
   EXPECT_NE(FortyThree, FortyTwo);
 }
 
+TEST_F(SandboxIRTest, ConstantFP) {
+  parseIR(C, R"IR(
+define void @foo(float %v0, double %v1) {
+  %fadd0 = fadd float %v0, 42.0
+  %fadd1 = fadd double %v1, 43.0
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto &BB = *F.begin();
+  auto It = BB.begin();
+  auto *FAdd0 = cast<sandboxir::BinaryOperator>(&*It++);
+  auto *FAdd1 = cast<sandboxir::BinaryOperator>(&*It++);
+  auto *FortyTwo = cast<sandboxir::ConstantFP>(FAdd0->getOperand(1));
+  [[maybe_unused]] auto *FortyThree =
+      cast<sandboxir::ConstantFP>(FAdd1->getOperand(1));
+
+  auto *FloatTy = sandboxir::Type::getFloatTy(Ctx);
+  auto *DoubleTy = sandboxir::Type::getDoubleTy(Ctx);
+  auto *LLVMFloatTy = Type::getFloatTy(C);
+  auto *LLVMDoubleTy = Type::getDoubleTy(C);
+  // Check that creating an identical constant gives us the same object.
+  auto *NewFortyTwo = sandboxir::ConstantFP::get(FloatTy, 42.0);
+  EXPECT_EQ(NewFortyTwo, FortyTwo);
+  // Check get(Type, double).
+  auto *FortyFour =
+      cast<sandboxir::ConstantFP>(sandboxir::ConstantFP::get(FloatTy, 44.0));
+  auto *LLVMFortyFour =
+      cast<llvm::ConstantFP>(llvm::ConstantFP::get(LLVMFloatTy, 44.0));
+  EXPECT_NE(FortyFour, FortyTwo);
+  EXPECT_EQ(FortyFour, Ctx.getValue(LLVMFortyFour));
+  // Check get(Type, APFloat).
+  auto *FortyFive = cast<sandboxir::ConstantFP>(
+      sandboxir::ConstantFP::get(DoubleTy, APFloat(45.0)));
+  auto *LLVMFortyFive = cast<llvm::ConstantFP>(
+      llvm::ConstantFP::get(LLVMDoubleTy, APFloat(45.0)));
+  EXPECT_EQ(FortyFive, Ctx.getValue(LLVMFortyFive));
+  // Check get(Type, StringRef).
+  auto *FortySix = sandboxir::ConstantFP::get(FloatTy, "46.0");
+  EXPECT_EQ(FortySix, Ctx.getValue(llvm::ConstantFP::get(LLVMFloatTy, "46.0")));
+  // Check get(APFloat).
+  auto *FortySeven = sandboxir::ConstantFP::get(APFloat(47.0), Ctx);
+  EXPECT_EQ(FortySeven, Ctx.getValue(llvm::ConstantFP::get(C, APFloat(47.0))));
+  // Check getNaN().
+  {
+    auto *NaN = sandboxir::ConstantFP::getNaN(FloatTy);
+    EXPECT_EQ(NaN, Ctx.getValue(llvm::ConstantFP::getNaN(LLVMFloatTy)));
+  }
+  {
+    auto *NaN = sandboxir::ConstantFP::getNaN(FloatTy, /*Negative=*/true);
+    EXPECT_EQ(NaN, Ctx.getValue(llvm::ConstantFP::getNaN(LLVMFloatTy,
+                                                         /*Negative=*/true)));
+  }
+  {
+    auto *NaN = sandboxir::ConstantFP::getNaN(FloatTy, /*Negative=*/true,
+                                              /*Payload=*/1);
+    EXPECT_EQ(NaN, Ctx.getValue(llvm::ConstantFP::getNaN(
+                       LLVMFloatTy, /*Negative=*/true, /*Payload=*/1)));
+  }
+  // Check getQNaN().
+  {
+    auto *QNaN = sandboxir::ConstantFP::getQNaN(FloatTy);
+    EXPECT_EQ(QNaN, Ctx.getValue(llvm::ConstantFP::getQNaN(LLVMFloatTy)));
+  }
+  {
+    auto *QNaN = sandboxir::ConstantFP::getQNaN(FloatTy, /*Negative=*/true);
+    EXPECT_EQ(QNaN, Ctx.getValue(llvm::ConstantFP::getQNaN(LLVMFloatTy,
+                                                           /*Negative=*/true)));
+  }
+  {
+    APInt Payload(1, 1);
+    auto *QNaN =
+        sandboxir::ConstantFP::getQNaN(FloatTy, /*Negative=*/true, &Payload);
+    EXPECT_EQ(QNaN, Ctx.getValue(llvm::ConstantFP::getQNaN(
+                        LLVMFloatTy, /*Negative=*/true, &Payload)));
+  }
+  // Check getSNaN().
+  {
+    auto *SNaN = sandboxir::ConstantFP::getSNaN(FloatTy);
+    EXPECT_EQ(SNaN, Ctx.getValue(llvm::ConstantFP::getSNaN(LLVMFloatTy)));
+  }
+  {
+    auto *SNaN = sandboxir::ConstantFP::getSNaN(FloatTy, /*Negative=*/true);
+    EXPECT_EQ(SNaN, Ctx.getValue(llvm::ConstantFP::getSNaN(LLVMFloatTy,
+                                                           /*Negative=*/true)));
+  }
+  {
+    APInt Payload(1, 1);
+    auto *SNaN =
+        sandboxir::ConstantFP::getSNaN(FloatTy, /*Negative=*/true, &Payload);
+    EXPECT_EQ(SNaN, Ctx.getValue(llvm::ConstantFP::getSNaN(
+                        LLVMFloatTy, /*Negative=*/true, &Payload)));
+  }
+
+  // Check getZero().
+  {
+    auto *Zero = sandboxir::ConstantFP::getZero(FloatTy);
+    EXPECT_EQ(Zero, Ctx.getValue(llvm::ConstantFP::getZero(LLVMFloatTy)));
+  }
+  {
+    auto *Zero = sandboxir::ConstantFP::getZero(FloatTy, /*Negative=*/true);
+    EXPECT_EQ(Zero, Ctx.getValue(llvm::ConstantFP::getZero(LLVMFloatTy,
+                                                           /*Negative=*/true)));
+  }
+
+  // Check getNegativeZero().
+  auto *NegZero = cast<sandboxir::ConstantFP>(
+      sandboxir::ConstantFP::getNegativeZero(FloatTy));
+  EXPECT_EQ(NegZero,
+            Ctx.getValue(llvm::ConstantFP::getNegativeZero(LLVMFloatTy)));
+
+  // Check getInfinity().
+  {
+    auto *Inf = sandboxir::ConstantFP::getInfinity(FloatTy);
+    EXPECT_EQ(Inf, Ctx.getValue(llvm::ConstantFP::getInfinity(LLVMFloatTy)));
+  }
+  {
+    auto *Inf = sandboxir::ConstantFP::getInfinity(FloatTy, /*Negative=*/true);
+    EXPECT_EQ(Inf, Ctx.getValue(llvm::ConstantFP::getInfinity(
+                       LLVMFloatTy, /*Negative=*/true)));
+  }
+
+  // Check isValueValidForType().
+  APFloat V(1.1);
+  EXPECT_EQ(sandboxir::ConstantFP::isValueValidForType(FloatTy, V),
+            llvm::ConstantFP::isValueValidForType(LLVMFloatTy, V));
+  // Check getValueAPF().
+  EXPECT_EQ(FortyFour->getValueAPF(), LLVMFortyFour->getValueAPF());
+  // Check getValue().
+  EXPECT_EQ(FortyFour->getValue(), LLVMFortyFour->getValue());
+  // Check isZero().
+  EXPECT_EQ(FortyFour->isZero(), LLVMFortyFour->isZero());
+  EXPECT_TRUE(sandboxir::ConstantFP::getZero(FloatTy));
+  EXPECT_TRUE(sandboxir::ConstantFP::getZero(FloatTy, /*Negative=*/true));
+  // Check isNegative().
+  EXPECT_TRUE(cast<sandboxir::ConstantFP>(
+                  sandboxir::ConstantFP::getZero(FloatTy, /*Negative=*/true))
+                  ->isNegative());
+  // Check isInfinity().
+  EXPECT_TRUE(
+      cast<sandboxir::ConstantFP>(sandboxir::ConstantFP::getInfinity(FloatTy))
+          ->isInfinity());
+  // Check isNaN().
+  EXPECT_TRUE(
+      cast<sandboxir::ConstantFP>(sandboxir::ConstantFP::getNaN(FloatTy))
+          ->isNaN());
+  // Check isExactlyValue(APFloat).
+  EXPECT_TRUE(NegZero->isExactlyValue(NegZero->getValueAPF()));
+  // Check isExactlyValue(double).
+  EXPECT_TRUE(NegZero->isExactlyValue(-0.0));
+}
+
 TEST_F(SandboxIRTest, Use) {
   parseIR(C, R"IR(
 define i32 @foo(i32 %v0, i32 %v1) {
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index c5e4ad4219c91d..9b9be69ee38448 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -1,11 +1,8 @@
-from __future__ import print_function
-
 import argparse
 import bisect
 import collections
 import copy
 import glob
-import itertools
 import os
 import re
 import subprocess
@@ -517,12 +514,13 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False):
                 sep="",
                 file=sys.stderr,
             )
-        # Python 2.7 doesn't have subprocess.DEVNULL:
-        with open(os.devnull, "w") as devnull:
-            pp = subprocess.Popen(
-                preprocess_cmd, shell=True, stdin=devnull, stdout=subprocess.PIPE
-            )
-            ir_file = pp.stdout
+        pp = subprocess.Popen(
+            preprocess_cmd,
+            shell=True,
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.PIPE,
+        )
+        ir_file = pp.stdout
 
     if isinstance(cmd_args, list):
         args = [applySubstitutions(a, substitutions) for a in cmd_args]
diff --git a/llvm/utils/git/pre-push.py b/llvm/utils/git/pre-push.py
index d7ae3767d2923d..dfa009dd1a6f62 100755
--- a/llvm/utils/git/pre-push.py
+++ b/llvm/utils/git/pre-push.py
@@ -27,7 +27,6 @@
 """
 
 import argparse
-import os
 import shutil
 import subprocess
 import sys
@@ -70,14 +69,6 @@ def ask_confirm(prompt):
         return query.lower() == "y"
 
 
-def get_dev_null():
-    """Lazily create a /dev/null fd for use in shell()"""
-    global dev_null_fd
-    if dev_null_fd is None:
-        dev_null_fd = open(os.devnull, "w")
-    return dev_null_fd
-
-
 def shell(
     cmd,
     strip=True,
@@ -95,10 +86,8 @@ def shell(
         cwd_msg = " in %s" % cwd
     log_verbose("Running%s: %s" % (cwd_msg, " ".join(quoted_cmd)))
 
-    err_pipe = subprocess.PIPE
-    if ignore_errors:
-        # Silence errors if requested.
-        err_pipe = get_dev_null()
+    # Silence errors if requested.
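+    # (Illustrative aside, not part of this change: subprocess.DEVNULL is the
+    # Python 3.3+ replacement for a manually opened os.devnull handle, e.g.
+    #   subprocess.check_call(["git", "fetch"], stderr=subprocess.DEVNULL)
+    # needs no accompanying close() and is the idiom the one-liner below,
+    # and the similar cleanups elsewhere in this patch, rely on.)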
+ err_pipe = subprocess.DEVNULL if ignore_errors else subprocess.PIPE start = time.time() p = subprocess.Popen( diff --git a/llvm/utils/gn/gn.py b/llvm/utils/gn/gn.py index 290c6941bceea2..6b7919b7faeb92 100755 --- a/llvm/utils/gn/gn.py +++ b/llvm/utils/gn/gn.py @@ -42,7 +42,7 @@ def main(): if ( subprocess.call( "gn --version", - stdout=open(os.devnull, "w"), + stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, shell=True, ) diff --git a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn index 7ff3faf63bedc9..f176d8b94b5322 100644 --- a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn @@ -7,5 +7,6 @@ static_library("SandboxIR") { sources = [ "SandboxIR.cpp", "Tracker.cpp", + "Type.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn index 243a92f2e62587..aa594df8c164a1 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/BPF/BUILD.gn @@ -71,6 +71,7 @@ static_library("LLVMBPFCodeGen") { "BPFISelLowering.cpp", "BPFInstrInfo.cpp", "BPFMCInstLower.cpp", + "BPFMIChecking.cpp", "BPFMIPeephole.cpp", "BPFMISimplifyPatchable.cpp", "BPFPreserveDIType.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn index 2d246eccb872ea..02ef303a6946f3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/SandboxIR/BUILD.gn @@ -9,5 +9,6 @@ unittest("SandboxIRTests") { sources = [ "SandboxIRTest.cpp", "TrackerTest.cpp", + "TypesTest.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index 3a660a87d8af63..47b03b42d096d2 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -64,6 +64,7 @@ unittest("SupportTests") { "MemoryBufferRefTest.cpp", "MemoryBufferTest.cpp", "MemoryTest.cpp", + "ModRefTest.cpp", "NativeFormatTests.cpp", "OptimizedStructLayoutTest.cpp", "ParallelTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn index f5b162dd102320..ad44635f107a16 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn @@ -10,6 +10,7 @@ unittest("IPOTests") { sources = [ "AttributorTest.cpp", "FunctionSpecializationTest.cpp", + "ImportIDTableTests.cpp", "LowerTypeTests.cpp", "WholeProgramDevirt.cpp", ] diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 4d48b3de7a57ed..709dd922b8fa2f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -19,6 +19,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td" +def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>; def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>; def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>; @@ -531,8 +532,10 @@ def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">; def ProxyAsync : I32EnumAttrCase<"async", 1, "async">; def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">; 
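// Aside (sketch, not in the original patch): an I32EnumAttrCase<"SYM", N, "str">
// binds the C++ enumerator ProxyKind::SYM (value N) to the assembly spelling
// "str"; the TENSORMAP/GENERIC cases added below follow this scheme, so they
// print as `tensormap` and `generic` in NVVM op syntax.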
def ProxyAsyncShared : I32EnumAttrCase<"async_shared", 3, "async.shared">;
+def ProxyTensorMap : I32EnumAttrCase<"TENSORMAP", 4, "tensormap">;
+def ProxyGeneric : I32EnumAttrCase<"GENERIC", 5, "generic">;
 def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
-                [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared]> {
+                [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared, ProxyTensorMap, ProxyGeneric]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::NVVM";
 }
@@ -565,6 +568,80 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
   let hasVerifier = 1;
 }
 
+// Attrs describing the scope of the Memory Operation
+def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
+def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
+def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
+def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;
+
+def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
+  [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::NVVM";
+}
+def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
+  Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size,
+             DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
+             DefaultValuedAttr<ProxyKindAttr, "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with acquire semantics";
+  let description = [{
+    `fence.proxy.acquire` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy
+
+    The address operand `addr` and the operand `size` together specify the
+    memory range `[addr, addr+size)` on which the ordering guarantees on the
+    memory accesses across the proxies is to be provided. The only supported
+    value for the `size` operand is 128 and must be an immediate. Generic Addressing
+    is used unconditionally, and the address specified by the operand `addr` must
+    fall within the `.global` state space. Otherwise, the behavior is undefined
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(
+        builder,
+        getUnidirectionalFenceProxyID($fromProxy, $toProxy, $scope, false),
+        {$addr, $size});
+  }];
+
+  let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
+  Arguments<(ins MemScopeKindAttr:$scope,
+             DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
+             DefaultValuedAttr<ProxyKindAttr, "ProxyKind::TENSORMAP">:$toProxy)> {
+  let summary = "Uni-directional proxy fence operation with release semantics";
+  let description = [{
+    `fence.proxy.release` is a uni-directional fence used to establish ordering
+    between a prior memory access performed via the generic proxy and a
+    subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
+    operation can form a release sequence that synchronizes with an acquire
+    sequence that contains the fence.proxy.acquire proxy fence operation
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)?
(`to_proxy` `=` $toProxy^)? attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, getUnidirectionalFenceProxyID(
+                                     $fromProxy, $toProxy, $scope, true));
+  }];
+
+  let hasVerifier = 1;
+}
+
 def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
 def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
 def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h
index e4fddfcb7608e6..67825eb1704bbe 100644
--- a/mlir/include/mlir/IR/Block.h
+++ b/mlir/include/mlir/IR/Block.h
@@ -27,8 +27,8 @@ template <typename ValueRangeT> class ValueTypeRange;
 
 /// `Block` represents an ordered list of `Operation`s.
-class Block : public IRObjectWithUseList<BlockOperand>,
-              public llvm::ilist_node_with_parent<Block, Region> {
+class alignas(8) Block : public IRObjectWithUseList<BlockOperand>,
+                         public llvm::ilist_node_with_parent<Block, Region> {
 public:
   explicit Block() = default;
   ~Block();
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 5f680e8eca7559..60113bdef16a23 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -1124,17 +1124,6 @@ struct ConversionConfig {
   // already been modified) and iterators into past IR state cannot be
   // represented at the moment.
   RewriterBase::Listener *listener = nullptr;
-
-  /// If set to "true", the dialect conversion attempts to build source/target/
-  /// argument materializations through the type converter API in lieu of
-  /// builtin.unrealized_conversion_cast ops. The conversion process fails if
-  /// at least one materialization could not be built.
-  ///
-  /// If set to "false", the dialect conversion does not does not build any
-  /// custom materializations and instead inserts
-  /// builtin.unrealized_conversion_cast ops to ensure that the resulting IR
-  /// is valid.
- bool buildMaterializations = true; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 4d1896551101ed..2c7c3e9d535f7d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1004,6 +1004,10 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues( } } LogicalResult NVVM::FenceProxyOp::verify() { + if (getKind() == NVVM::ProxyKind::TENSORMAP) + return emitOpError() << "tensormap proxy is not a supported proxy kind"; + if (getKind() == NVVM::ProxyKind::GENERIC) + return emitOpError() << "generic proxy not a supported proxy kind"; if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) { return emitOpError() << "async_shared fence requires space attribute"; } @@ -1013,6 +1017,30 @@ LogicalResult NVVM::FenceProxyOp::verify() { return success(); } +LogicalResult NVVM::FenceProxyAcquireOp::verify() { + if (getFromProxy() != NVVM::ProxyKind::GENERIC) + return emitOpError("uni-directional proxies only support generic for " + "from_proxy attribute"); + + if (getToProxy() != NVVM::ProxyKind::TENSORMAP) + return emitOpError("uni-directional proxies only support tensormap " + "for to_proxy attribute"); + + return success(); +} + +LogicalResult NVVM::FenceProxyReleaseOp::verify() { + if (getFromProxy() != NVVM::ProxyKind::GENERIC) + return emitOpError("uni-directional proxies only support generic for " + "from_proxy attribute"); + + if (getToProxy() != NVVM::ProxyKind::TENSORMAP) + return emitOpError("uni-directional proxies only support tensormap " + "for to_proxy attribute"); + + return success(); +} + LogicalResult NVVM::SetMaxRegisterOp::verify() { if (getRegCount() % 8) return emitOpError("new register size must be multiple of 8"); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 5f51d4b25ccc25..bd18bd66bb7cdb 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -1028,6 +1028,10 @@ LogicalResult tosa::TileOp::verify() { return emitOpError("expect 'multiples' array to have length ") << outputType.getRank() << " but got " << multiples.size() << "."; + if (llvm::any_of(multiples, [](int64_t v) { return v <= 0 && v != -1; })) + return emitOpError( + "expect element of 'multiples' to be positive integer or -1."); + return success(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index a09c24dda82afc..9cc66207660f64 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -120,6 +120,41 @@ static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout, } } +static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy, + NVVM::ProxyKind toProxy, + NVVM::MemScopeKind scope, + bool isRelease) { + if (fromProxy == NVVM::ProxyKind::GENERIC && + toProxy == NVVM::ProxyKind::TENSORMAP) { + switch (scope) { + case NVVM::MemScopeKind::CTA: { + if (isRelease) + return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_cta; + return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta; + } + case NVVM::MemScopeKind::CLUSTER: { + if (isRelease) + return llvm::Intrinsic:: + nvvm_fence_proxy_tensormap_generic_release_cluster; + return llvm::Intrinsic:: + nvvm_fence_proxy_tensormap_generic_acquire_cluster; + 
}
+    case NVVM::MemScopeKind::GPU: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_gpu;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu;
+    }
+    case NVVM::MemScopeKind::SYS: {
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_sys;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys;
+    }
+    }
+    llvm_unreachable("Unknown scope for uni-directional fence.proxy operation");
+  }
+  llvm_unreachable("Unsupported proxy kinds");
+}
+
 namespace {
 /// Implementation of the dialect interface that converts operations belonging
 /// to the NVVM dialect to LLVM IR.
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index cc9c9495e5155c..b23fb97959ed67 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -702,12 +702,14 @@ class UnresolvedMaterializationRewrite : public OperationRewrite {
     return rewrite->getKind() == Kind::UnresolvedMaterialization;
   }
 
-  void rollback() override;
-
   UnrealizedConversionCastOp getOperation() const {
     return cast<UnrealizedConversionCastOp>(op);
   }
 
+  void rollback() override;
+
+  void cleanup(RewriterBase &rewriter) override;
+
   /// Return the type converter of this materialization (which may be null).
   const TypeConverter *getConverter() const {
     return converterAndKind.getPointer();
@@ -764,7 +766,7 @@ namespace detail {
 struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   explicit ConversionPatternRewriterImpl(MLIRContext *ctx,
                                          const ConversionConfig &config)
-      : context(ctx), eraseRewriter(ctx), config(config) {}
+      : context(ctx), config(config) {}
 
   //===--------------------------------------------------------------------===//
   // State Management
@@ -832,7 +834,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   //===--------------------------------------------------------------------===//
   // Materializations
   //===--------------------------------------------------------------------===//
-
   /// Build an unresolved materialization operation given an output type and set
   /// of input operands.
   Value buildUnresolvedMaterialization(MaterializationKind kind,
@@ -881,7 +882,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
 
   /// Erase the given op (unless it was already erased).
   void eraseOp(Operation *op) override {
-    if (wasErased(op))
+    if (erased.contains(op))
       return;
     op->dropAllUses();
     RewriterBase::eraseOp(op);
@@ -889,24 +890,17 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
 
   /// Erase the given block (unless it was already erased).
   void eraseBlock(Block *block) override {
-    if (wasErased(block))
+    if (erased.contains(block))
      return;
     assert(block->empty() && "expected empty block");
     block->dropAllDefinedValueUses();
     RewriterBase::eraseBlock(block);
   }
 
-  bool wasErased(void *ptr) const { return erased.contains(ptr); }
-
-  bool wasErased(OperationRewrite *rewrite) const {
-    return wasErased(rewrite->getOperation());
-  }
-
   void notifyOperationErased(Operation *op) override { erased.insert(op); }
 
   void notifyBlockErased(Block *block) override { erased.insert(block); }
 
-private:
   /// Pointers to all erased operations and blocks.
   DenseSet<void *> erased;
 };
@@ -918,11 +912,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// MLIR context.
MLIRContext *context;
 
-  /// A rewriter that keeps track of ops/block that were already erased and
-  /// skips duplicate op/block erasures. This rewriter is used during the
-  /// "cleanup" phase.
-  SingleEraseRewriter eraseRewriter;
-
   // Mapping between replaced values that differ in type. This happens when
   // replacing a value with one of a different type.
   ConversionValueMapping mapping;
@@ -1069,6 +1058,10 @@ void UnresolvedMaterializationRewrite::rollback() {
   op->erase();
 }
 
+void UnresolvedMaterializationRewrite::cleanup(RewriterBase &rewriter) {
+  rewriter.eraseOp(op);
+}
+
 void ConversionPatternRewriterImpl::applyRewrites() {
   // Commit all rewrites.
   IRRewriter rewriter(context, config.listener);
@@ -1076,6 +1069,7 @@ void ConversionPatternRewriterImpl::applyRewrites() {
     rewrite->commit(rewriter);
 
   // Clean up all rewrites.
+  SingleEraseRewriter eraseRewriter(context);
   for (auto &rewrite : rewrites)
     rewrite->cleanup(eraseRewriter);
 }
@@ -2359,6 +2353,12 @@ struct OperationConverter {
                       ConversionPatternRewriterImpl &rewriterImpl,
                       DenseMap<Value, SmallVector<Value>> &inverseMapping);
 
+  /// Legalize any unresolved type materializations.
+  LogicalResult legalizeUnresolvedMaterializations(
+      ConversionPatternRewriter &rewriter,
+      ConversionPatternRewriterImpl &rewriterImpl,
+      DenseMap<Value, SmallVector<Value>> &inverseMapping);
+
   /// Legalize an operation result that was marked as "erased".
   LogicalResult
   legalizeErasedResult(Operation *op, OpResult result,
@@ -2405,56 +2405,6 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter,
   return success();
 }
 
-static LogicalResult
-legalizeUnresolvedMaterialization(RewriterBase &rewriter,
-                                  UnresolvedMaterializationRewrite *rewrite) {
-  UnrealizedConversionCastOp op = rewrite->getOperation();
-  assert(!op.use_empty() &&
-         "expected that dead materializations have already been DCE'd");
-  Operation::operand_range inputOperands = op.getOperands();
-  Type outputType = op.getResultTypes()[0];
-
-  // Try to materialize the conversion.
-  if (const TypeConverter *converter = rewrite->getConverter()) {
-    rewriter.setInsertionPoint(op);
-    Value newMaterialization;
-    switch (rewrite->getMaterializationKind()) {
-    case MaterializationKind::Argument:
-      // Try to materialize an argument conversion.
-      newMaterialization = converter->materializeArgumentConversion(
-          rewriter, op->getLoc(), outputType, inputOperands);
-      if (newMaterialization)
-        break;
-      // If an argument materialization failed, fallback to trying a target
-      // materialization.
-      [[fallthrough]];
-    case MaterializationKind::Target:
-      newMaterialization = converter->materializeTargetConversion(
-          rewriter, op->getLoc(), outputType, inputOperands);
-      break;
-    case MaterializationKind::Source:
-      newMaterialization = converter->materializeSourceConversion(
-          rewriter, op->getLoc(), outputType, inputOperands);
-      break;
-    }
-    if (newMaterialization) {
-      assert(newMaterialization.getType() == outputType &&
-             "materialization callback produced value of incorrect type");
-      rewriter.replaceOp(op, newMaterialization);
-      return success();
-    }
-  }
-
-  InFlightDiagnostic diag = op->emitError()
-                            << "failed to legalize unresolved materialization "
-                               "from ("
-                            << inputOperands.getTypes() << ") to " << outputType
-                            << " that remained live after conversion";
-  diag.attachNote(op->getUsers().begin()->getLoc())
-      << "see existing live user here: " << *op->getUsers().begin();
-  return failure();
-}
-
 LogicalResult OperationConverter::convertOperations(ArrayRef<Operation *> ops) {
   if (ops.empty())
     return success();
@@ -2496,37 +2446,6 @@ LogicalResult OperationConverter::convertOperations(ArrayRef<Operation *> ops) {
   } else {
     rewriterImpl.applyRewrites();
   }
-
-  // Gather all unresolved materializations.
-  SmallVector<UnrealizedConversionCastOp> allCastOps;
-  DenseMap<Operation *, UnresolvedMaterializationRewrite *> rewriteMap;
-  for (std::unique_ptr<IRRewrite> &rewrite : rewriterImpl.rewrites) {
-    auto *mat = dyn_cast<UnresolvedMaterializationRewrite>(rewrite.get());
-    if (!mat)
-      continue;
-    if (rewriterImpl.eraseRewriter.wasErased(mat))
-      continue;
-    allCastOps.push_back(mat->getOperation());
-    rewriteMap[mat->getOperation()] = mat;
-  }
-
-  // Reconcile all UnrealizedConversionCastOps that were inserted by the
-  // dialect conversion frameworks. (Not the one that were inserted by
-  // patterns.)
-  SmallVector<UnrealizedConversionCastOp> remainingCastOps;
-  reconcileUnrealizedCasts(allCastOps, &remainingCastOps);
-
-  // Try to legalize all unresolved materializations.
-  if (config.buildMaterializations) {
-    IRRewriter rewriter(rewriterImpl.context, config.listener);
-    for (UnrealizedConversionCastOp castOp : remainingCastOps) {
-      auto it = rewriteMap.find(castOp.getOperation());
-      assert(it != rewriteMap.end() && "inconsistent state");
-      if (failed(legalizeUnresolvedMaterialization(rewriter, it->second)))
-        return failure();
-    }
-  }
-
   return success();
 }
 
@@ -2540,6 +2459,9 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) {
   if (failed(legalizeConvertedOpResultTypes(rewriter, rewriterImpl,
                                             inverseMapping)))
     return failure();
+  if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl,
+                                                inverseMapping)))
+    return failure();
   return success();
 }
 
@@ -2655,6 +2577,279 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes(
   return success();
 }
 
+/// Replace the results of a materialization operation with the given values.
+static void
+replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl,
+                       ResultRange matResults, ValueRange values,
+                       DenseMap<Value, SmallVector<Value>> &inverseMapping) {
+  matResults.replaceAllUsesWith(values);
+
+  // For each of the materialization results, update the inverse mappings to
+  // point to the replacement values.
+  for (auto [matResult, newValue] : llvm::zip(matResults, values)) {
+    auto inverseMapIt = inverseMapping.find(matResult);
+    if (inverseMapIt == inverseMapping.end())
+      continue;
+
+    // Update the reverse mapping, or remove the mapping if we couldn't update
+    // it. Not being able to update signals that the mapping would have become
+    // circular (i.e. %foo -> newValue -> %foo), which may occur as values are
+    // propagated through temporary materializations.
We simply drop the
+    // mapping, and let the post-conversion replacement logic handle updating
+    // uses.
+    for (Value inverseMapVal : inverseMapIt->second)
+      if (!rewriterImpl.mapping.tryMap(inverseMapVal, newValue))
+        rewriterImpl.mapping.erase(inverseMapVal);
+  }
+}
+
+/// Compute all of the unresolved materializations that will persist beyond the
+/// conversion process, and require inserting a proper user materialization for.
+static void computeNecessaryMaterializations(
+    DenseMap<Operation *, UnresolvedMaterializationRewrite *>
+        &materializationOps,
+    ConversionPatternRewriter &rewriter,
+    ConversionPatternRewriterImpl &rewriterImpl,
+    DenseMap<Value, SmallVector<Value>> &inverseMapping,
+    SetVector<UnresolvedMaterializationRewrite *> &necessaryMaterializations) {
+  // Helper function to check if the given value or a not yet materialized
+  // replacement of the given value is live.
+  // Note: `inverseMapping` maps from replaced values to original values.
+  auto isLive = [&](Value value) {
+    auto findFn = [&](Operation *user) {
+      auto matIt = materializationOps.find(user);
+      if (matIt != materializationOps.end())
+        return !necessaryMaterializations.count(matIt->second);
+      return rewriterImpl.isOpIgnored(user);
+    };
+    // A worklist is needed because a value may have gone through a chain of
+    // replacements and each of the replaced values may have live users.
+    SmallVector<Value> worklist;
+    worklist.push_back(value);
+    while (!worklist.empty()) {
+      Value next = worklist.pop_back_val();
+      if (llvm::find_if_not(next.getUsers(), findFn) != next.user_end())
+        return true;
+      // This value may be replacing another value that has a live user.
+      llvm::append_range(worklist, inverseMapping.lookup(next));
+    }
+    return false;
+  };
+
+  llvm::unique_function<Value(Value, Value, Type)> lookupRemappedValue =
+      [&](Value invalidRoot, Value value, Type type) {
+        // Check to see if the input operation was remapped to a variant of the
+        // output.
+        Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type);
+        if (remappedValue.getType() == type && remappedValue != invalidRoot)
+          return remappedValue;
+
+        // Check to see if the input is a materialization operation that
+        // provides an inverse conversion. We just check blindly for
+        // UnrealizedConversionCastOp here, but it has no effect on correctness.
+        auto inputCastOp = value.getDefiningOp<UnrealizedConversionCastOp>();
+        if (inputCastOp && inputCastOp->getNumOperands() == 1)
+          return lookupRemappedValue(invalidRoot, inputCastOp->getOperand(0),
+                                     type);
+
+        return Value();
+      };
+
+  SetVector<UnresolvedMaterializationRewrite *> worklist;
+  for (auto &rewrite : rewriterImpl.rewrites) {
+    auto *mat = dyn_cast<UnresolvedMaterializationRewrite>(rewrite.get());
+    if (!mat)
+      continue;
+    materializationOps.try_emplace(mat->getOperation(), mat);
+    worklist.insert(mat);
+  }
+  while (!worklist.empty()) {
+    UnresolvedMaterializationRewrite *mat = worklist.pop_back_val();
+    UnrealizedConversionCastOp op = mat->getOperation();
+
+    // We currently only handle target materializations here.
+    assert(op->getNumResults() == 1 && "unexpected materialization type");
+    OpResult opResult = op->getOpResult(0);
+    Type outputType = opResult.getType();
+    Operation::operand_range inputOperands = op.getOperands();
+
+    // Try to forward propagate operands for user conversion casts that result
+    // in the input types of the current cast.
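+    // For intuition (sketch, not in the original code): this folds pairs like
+    //   %b = builtin.unrealized_conversion_cast %a : !A to !B
+    //   %c = builtin.unrealized_conversion_cast %b : !B to !A
+    // by wiring %a straight into %c's uses, so neither cast needs a real
+    // materialization from the type converter.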
+    for (Operation *user : llvm::make_early_inc_range(opResult.getUsers())) {
+      auto castOp = dyn_cast<UnrealizedConversionCastOp>(user);
+      if (!castOp)
+        continue;
+      if (castOp->getResultTypes() == inputOperands.getTypes()) {
+        replaceMaterialization(rewriterImpl, user->getResults(), inputOperands,
+                               inverseMapping);
+        necessaryMaterializations.remove(materializationOps.lookup(user));
+      }
+    }
+
+    // Try to avoid materializing a resolved materialization if possible.
+    // Handle the case of a 1-1 materialization.
+    if (inputOperands.size() == 1) {
+      // Check to see if the input operation was remapped to a variant of the
+      // output.
+      Value remappedValue =
+          lookupRemappedValue(opResult, inputOperands[0], outputType);
+      if (remappedValue && remappedValue != opResult) {
+        replaceMaterialization(rewriterImpl, opResult, remappedValue,
+                               inverseMapping);
+        necessaryMaterializations.remove(mat);
+        continue;
+      }
+    } else {
+      // TODO: Avoid materializing other types of conversions here.
+    }
+
+    // If the materialization does not have any live users, we don't need to
+    // generate a user materialization for it.
+    bool isMaterializationLive = isLive(opResult);
+    if (!isMaterializationLive)
+      continue;
+    if (!necessaryMaterializations.insert(mat))
+      continue;
+
+    // Reprocess input materializations to see if they have an updated status.
+    for (Value input : inputOperands) {
+      if (auto parentOp = input.getDefiningOp<UnrealizedConversionCastOp>()) {
+        if (auto *mat = materializationOps.lookup(parentOp))
+          worklist.insert(mat);
+      }
+    }
+  }
+}
+
+/// Legalize the given unresolved materialization. Returns success if the
+/// materialization was legalized, failure otherwise.
+static LogicalResult legalizeUnresolvedMaterialization(
+    UnresolvedMaterializationRewrite &mat,
+    DenseMap<Operation *, UnresolvedMaterializationRewrite *>
+        &materializationOps,
+    ConversionPatternRewriter &rewriter,
+    ConversionPatternRewriterImpl &rewriterImpl,
+    DenseMap<Value, SmallVector<Value>> &inverseMapping) {
+  auto findLiveUser = [&](auto &&users) {
+    auto liveUserIt = llvm::find_if_not(
+        users, [&](Operation *user) { return rewriterImpl.isOpIgnored(user); });
+    return liveUserIt == users.end() ? nullptr : *liveUserIt;
+  };
+
+  llvm::unique_function<Value(Value, Type)> lookupRemappedValue =
+      [&](Value value, Type type) {
+        // Check to see if the input operation was remapped to a variant of the
+        // output.
+        Value remappedValue = rewriterImpl.mapping.lookupOrDefault(value, type);
+        if (remappedValue.getType() == type)
+          return remappedValue;
+        return Value();
+      };
+
+  UnrealizedConversionCastOp op = mat.getOperation();
+  if (!rewriterImpl.ignoredOps.insert(op))
+    return success();
+
+  // We currently only handle target materializations here.
+  OpResult opResult = op->getOpResult(0);
+  Operation::operand_range inputOperands = op.getOperands();
+  Type outputType = opResult.getType();
+
+  // If any input to this materialization is another materialization, resolve
+  // the input first.
+  for (Value value : op->getOperands()) {
+    auto valueCast = value.getDefiningOp<UnrealizedConversionCastOp>();
+    if (!valueCast)
+      continue;
+
+    auto matIt = materializationOps.find(valueCast);
+    if (matIt != materializationOps.end())
+      if (failed(legalizeUnresolvedMaterialization(
+              *matIt->second, materializationOps, rewriter, rewriterImpl,
+              inverseMapping)))
+        return failure();
+  }
+
+  // Perform a last ditch attempt to avoid materializing a resolved
+  // materialization if possible.
+  // Handle the case of a 1-1 materialization.
+  if (inputOperands.size() == 1) {
+    // Check to see if the input operation was remapped to a variant of the
+    // output.
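+    // (Sketch of the idea, not original text: if the single input was already
+    // replaced by some %new whose type equals this cast's result type, the
+    // cast is redundant and its result can simply forward to %new.)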
+    Value remappedValue = lookupRemappedValue(inputOperands[0], outputType);
+    if (remappedValue && remappedValue != opResult) {
+      replaceMaterialization(rewriterImpl, opResult, remappedValue,
+                             inverseMapping);
+      return success();
+    }
+  } else {
+    // TODO: Avoid materializing other types of conversions here.
+  }
+
+  // Try to materialize the conversion.
+  if (const TypeConverter *converter = mat.getConverter()) {
+    rewriter.setInsertionPoint(op);
+    Value newMaterialization;
+    switch (mat.getMaterializationKind()) {
+    case MaterializationKind::Argument:
+      // Try to materialize an argument conversion.
+      newMaterialization = converter->materializeArgumentConversion(
+          rewriter, op->getLoc(), outputType, inputOperands);
+      if (newMaterialization)
+        break;
+      // If an argument materialization failed, fallback to trying a target
+      // materialization.
+      [[fallthrough]];
+    case MaterializationKind::Target:
+      newMaterialization = converter->materializeTargetConversion(
+          rewriter, op->getLoc(), outputType, inputOperands);
+      break;
+    case MaterializationKind::Source:
+      newMaterialization = converter->materializeSourceConversion(
+          rewriter, op->getLoc(), outputType, inputOperands);
+      break;
+    }
+    if (newMaterialization) {
+      assert(newMaterialization.getType() == outputType &&
+             "materialization callback produced value of incorrect type");
+      replaceMaterialization(rewriterImpl, opResult, newMaterialization,
+                             inverseMapping);
+      return success();
+    }
+  }
+
+  InFlightDiagnostic diag = op->emitError()
+                            << "failed to legalize unresolved materialization "
+                               "from ("
+                            << inputOperands.getTypes() << ") to " << outputType
+                            << " that remained live after conversion";
+  if (Operation *liveUser = findLiveUser(op->getUsers())) {
+    diag.attachNote(liveUser->getLoc())
+        << "see existing live user here: " << *liveUser;
+  }
+  return failure();
+}
+
+LogicalResult OperationConverter::legalizeUnresolvedMaterializations(
+    ConversionPatternRewriter &rewriter,
+    ConversionPatternRewriterImpl &rewriterImpl,
+    DenseMap<Value, SmallVector<Value>> &inverseMapping) {
+  // As an initial step, compute all of the inserted materializations that we
+  // expect to persist beyond the conversion process.
+  DenseMap<Operation *, UnresolvedMaterializationRewrite *> materializationOps;
+  SetVector<UnresolvedMaterializationRewrite *> necessaryMaterializations;
+  computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl,
+                                   inverseMapping, necessaryMaterializations);
+
+  // Once computed, legalize any necessary materializations.
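+  // (Aside, inferred from legalizeUnresolvedMaterialization above: the call
+  // recurses into materializations feeding each cast first, so a chain such as
+  //   %1 = cast %0 ; %2 = cast %1
+  // is legalized source-to-sink regardless of this loop's visit order.)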
+ for (auto *mat : necessaryMaterializations) { + if (failed(legalizeUnresolvedMaterialization( + *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) + return failure(); + } + return success(); +} + LogicalResult OperationConverter::legalizeErasedResult( Operation *op, OpResult result, ConversionPatternRewriterImpl &rewriterImpl) { diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 75362378daaaaa..156a8a468d5b42 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1286,6 +1286,7 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK-DAG: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[arg0]] : !nvgpu.warpgroup.descriptor> to i64 // CHECK-DAG: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[arg1]] : !nvgpu.warpgroup.descriptor> to i64 +// CHECK-DAG: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[arg2]] : memref<128x128xf32, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[S3:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 // CHECK: %[[S4:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S5:.+]] = llvm.extractvalue %[[S4]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> @@ -1298,8 +1299,8 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: %[[S136:.+]] = llvm.insertvalue %[[S134]], %[[S135]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S138:.+]] = llvm.extractvalue %{{.*}}[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %[[S0]], %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
 // CHECK: nvvm.wgmma.mma_async
 // CHECK: nvvm.wgmma.mma_async
 // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async
diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir
index ab18ce05e355d3..a192434c5accf8 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir
@@ -80,7 +80,6 @@ func.func @no_layout_to_dyn_layout_cast(%m: memref<?xf32>) -> memref<?xf32, strided<[?], offset: ?>>
   // expected-error @+1 {{failed to legalize unresolved materialization from ('memref<?xf32>') to 'memref<?xf32, strided<[?], offset: ?>>' that remained live after conversion}}
   %1 = bufferization.to_memref %0 : memref<?xf32, strided<[?], offset: ?>>
-  // expected-note @below{{see existing live user here}}
   return %1 : memref<?xf32, strided<[?], offset: ?>>
 }
 
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index 54d299a3efcecd..8b42e952bb1b7c 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -538,6 +538,24 @@ func.func @test_tile_invalid_multiples() {
 
 // -----
 
+func.func @test_tile_invalid_multiples_value() {
+  %0 = tensor.empty() : tensor<4x31xf32>
+  // expected-error@+1 {{'tosa.tile' op expect element of 'multiples' to be positive integer or -1.}}
+  %1 = tosa.tile %0 {multiples = array<i64: 2, -2>} : (tensor<4x31xf32>) -> tensor<4x31xf32>
+  return
+}
+
+// -----
+
+func.func @test_tile_io_rank_mismatch() {
+  %0 = tensor.empty() : tensor<4x31xf32>
+  // expected-error@+1 {{'tosa.tile' op expect same input and output tensor rank.}}
+  %1 = tosa.tile %0 {multiples = array<i64: 2, 2>} : (tensor<4x31xf32>) -> tensor<4x31x31xf32>
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @test_invalid_constant_permutation
 func.func @test_invalid_constant_permutation() {
   // expected-error@+3 {{permutation must be within input bounds}}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir
index cc05940fb6d02c..0ee016627440f7 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/outerproduct-f32.mlir
@@ -22,7 +22,7 @@ func.func @test_outerproduct_no_accumulator_4x4xf32() {
   %c0 = arith.constant 0 : index
 
-  %vector_i32 = llvm.intr.experimental.stepvector : vector<[4]xi32>
+  %vector_i32 = llvm.intr.stepvector : vector<[4]xi32>
   %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32>
   %tile = vector.outerproduct %vector, %vector : vector<[4]xf32>, vector<[4]xf32>
 
@@ -47,7 +47,7 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() {
   %f10 = arith.constant 10.0 : f32
   %acc = vector.splat %f10 : vector<[4]x[4]xf32>
 
-  %vector_i32 = llvm.intr.experimental.stepvector : vector<[4]xi32>
+  %vector_i32 = llvm.intr.stepvector : vector<[4]xi32>
   %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32>
   %tile = vector.outerproduct %vector, %vector, %acc : vector<[4]xf32>, vector<[4]xf32>
 
@@ -71,7 +71,7 @@ func.func @test_masked_outerproduct_no_accumulator_4x4xf32() {
   %c0 = arith.constant 0 : index
   %ones = arith.constant dense<1> : vector<[4]xi32>
 
-  %step_vector = llvm.intr.experimental.stepvector : vector<[4]xi32>
+  %step_vector = llvm.intr.stepvector : vector<[4]xi32>
   %vector_i32 = arith.addi %step_vector, %ones : vector<[4]xi32>
   %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32>
 
@@ -104,7 +104,7 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() {
   %f10 = arith.constant 10.0 : f32
   %acc = vector.splat %f10 : vector<[4]x[4]xf32>
 
-  %step_vector = llvm.intr.experimental.stepvector : vector<[4]xi32>
+  %step_vector = llvm.intr.stepvector : vector<[4]xi32>
   %vector_i32 = arith.addi %step_vector, %ones : vector<[4]xi32>
   %vector = arith.sitofp %vector_i32 : vector<[4]xi32> to vector<[4]xf32>
 
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
new file mode 100644
index 00000000000000..0e563808da970b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+  // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support generic for from_proxy attribute}}
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<tensormap>
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+  // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support generic for from_proxy attribute}}
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<tensormap>
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+  // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support tensormap for to_proxy attribute}}
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+  llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+  // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support tensormap for to_proxy attribute}}
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+  llvm.return
+}
\ No newline at end of file
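The four negative tests above pin down the verifier contract for the uni-directional fence proxies: ``from_proxy`` must be ``generic`` and ``to_proxy`` must be ``tensormap``. A minimal sketch of the shared check implied by these diagnostics follows; the helper name and the accessor/enum spellings are assumptions for illustration, not code from this patch.

.. code-block:: c++

  // Sketch of the uni-directional proxy check implied by the diagnostics
  // above. The helper name and enum spellings are illustrative assumptions.
  static LogicalResult verifyUniDirectionalProxy(Operation *op,
                                                 NVVM::ProxyKind fromProxy,
                                                 NVVM::ProxyKind toProxy) {
    // Only the generic -> tensormap transition is modeled.
    if (fromProxy != NVVM::ProxyKind::GENERIC)
      return op->emitOpError("uni-directional proxies only support generic "
                             "for from_proxy attribute");
    if (toProxy != NVVM::ProxyKind::TENSORMAP)
      return op->emitOpError("uni-directional proxies only support tensormap "
                             "for to_proxy attribute");
    return success();
  }

Both ``nvvm.fence.proxy.acquire`` and ``nvvm.fence.proxy.release`` would funnel through a check of this shape, which is why the same two messages appear for both ops.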
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index a8ae4d97888c90..6e2787d121ae64 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -574,3 +574,40 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant})
 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
   llvm.return
 }
+
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+  %c128 = llvm.mlir.constant(128) : i32
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+  llvm.return
+}
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+  %c128 = llvm.mlir.constant(128) : i32
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+  llvm.return
+}
\ No newline at end of file
diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir
index f130adff42f8cd..cf2c9f6a8ec441 100644
--- a/mlir/test/Transforms/test-legalize-type-conversion.mlir
+++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir
@@ -4,7 +4,6 @@ func.func @test_invalid_arg_materialization(
   // expected-error@below {{failed to legalize unresolved materialization from () to 'i16' that remained live after conversion}}
   %arg0: i16) {
-  // expected-note@below{{see existing live user here}}
   "foo.return"(%arg0) : (i16) -> ()
 }
 
@@ -23,7 +22,6 @@ func.func @test_valid_arg_materialization(%arg0: i64) {
 func.func @test_invalid_result_materialization() {
   // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}}
   %result = "test.type_producer"() : () -> f16
-  // expected-note@below{{see existing live user here}}
   "foo.return"(%result) : (f16) -> ()
 }
 
@@ -32,7 +30,6 @@ func.func @test_invalid_result_materialization() {
   // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}}
   %result = "test.type_producer"() : () -> f16
-  // expected-note@below{{see existing live user here}}
   "foo.return"(%result) : (f16) -> ()
 }
 
@@ -52,7 +49,6 @@ func.func @test_transitive_use_materialization() {
 func.func @test_transitive_use_invalid_materialization() {
   // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f16' that remained live after conversion}}
   %result = "test.another_type_producer"() : () -> f16
-  // expected-note@below{{see existing live user here}}
   "foo.return"(%result) : (f16) -> ()
 }
 
@@ -103,9 +99,9 @@ func.func @test_block_argument_not_converted() {
 func.func @test_signature_conversion_no_converter() {
   "test.signature_conversion_no_converter"() ({
     // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f32' that remained live after conversion}}
+    // expected-note@below {{see existing live user here}}
   ^bb0(%arg0: f32):
     "test.type_consumer"(%arg0) : (f32) -> ()
-    // expected-note@below{{see existing live user here}}
     "test.return"(%arg0) : (f32) -> ()
   }) : () -> ()
   return
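For orientation, the "unresolved materialization" failures exercised above come from the pending ``builtin.unrealized_conversion_cast`` ops that the conversion driver inserts and later tries to resolve through the ``TypeConverter`` callbacks dispatched in ``legalizeUnresolvedMaterialization`` earlier in this patch. A minimal sketch of registering such callbacks, assuming an illustrative f16-to-f32 conversion (the helper function and the chosen types are not part of this patch; only the registration API is real):

.. code-block:: c++

  #include "mlir/Dialect/Arith/IR/Arith.h"
  #include "mlir/Transforms/DialectConversion.h"

  using namespace mlir;

  // Illustrative converter: maps f16 to f32 and knows how to bridge values.
  TypeConverter buildF16ToF32Converter() {
    TypeConverter converter;
    // Later-registered conversions are tried first, so register the
    // identity fallback before the f16 -> f32 rule.
    converter.addConversion([](Type t) { return t; });
    converter.addConversion([](Float16Type t) -> Type {
      return Float32Type::get(t.getContext());
    });
    // Target materialization: bridges old f16 values to new f32 uses. If no
    // registered callback produces a value of the requested type, the
    // pending cast stays live and the driver emits "failed to legalize
    // unresolved materialization", exactly what the tests above expect.
    converter.addTargetMaterialization([](OpBuilder &b, Type resultType,
                                          ValueRange inputs,
                                          Location loc) -> Value {
      if (inputs.size() != 1 || !isa<Float32Type>(resultType))
        return Value();
      return b.create<arith::ExtFOp>(loc, resultType, inputs[0]);
    });
    return converter;
  }

A converter built this way would let the pending casts fold away in the passing cases, while the failing tests above deliberately provide no usable materialization and so trip the diagnostic.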
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 98d0ddd9a2be11..f0d4f35ba3e229 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -18,12 +18,15 @@
 # name: The name of this test suite.
 config.name = "MLIR"
 
+# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites.
+# See https://github.com/llvm/llvm-project/issues/106636 for more details.
+#
 # We prefer the lit internal shell which provides a better user experience on failures
 # unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 env var.
 use_lit_shell = True
 lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL")
 if lit_shell_env:
-    use_lit_shell = not lit.util.pythonize_bool(lit_shell_env)
+    use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 
 config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell)
diff --git a/polly/test/UnitIsl/lit.cfg b/polly/test/UnitIsl/lit.cfg
index 0944d543572d86..4b68f1460c3d83 100644
--- a/polly/test/UnitIsl/lit.cfg
+++ b/polly/test/UnitIsl/lit.cfg
@@ -17,12 +17,15 @@ config.name = 'Polly - isl unit tests'
 # For now we require '&&' between commands, until they get globally killed and
 # the test runner updated.
 #
+# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites.
+# See https://github.com/llvm/llvm-project/issues/106636 for more details.
+#
 # We prefer the lit internal shell which provides a better user experience on failures
 # unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 env var.
 use_lit_shell = True
 lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL")
 if lit_shell_env:
-    use_lit_shell = not lit.util.pythonize_bool(lit_shell_env)
+    use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 
 config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell)
diff --git a/polly/test/lit.cfg b/polly/test/lit.cfg
index 156c1f97f5d3ae..075ebdacbdc946 100644
--- a/polly/test/lit.cfg
+++ b/polly/test/lit.cfg
@@ -20,12 +20,15 @@ config.name = 'Polly'
 # For now we require '&&' between commands, until they get globally killed and
 # the test runner updated.
 #
+# TODO: Consolidate the logic for turning on the internal shell by default for all LLVM test suites.
+# See https://github.com/llvm/llvm-project/issues/106636 for more details.
+#
 # We prefer the lit internal shell which provides a better user experience on failures
 # unless the user explicitly disables it with LIT_USE_INTERNAL_SHELL=0 env var.
 use_lit_shell = True
 lit_shell_env = os.environ.get("LIT_USE_INTERNAL_SHELL")
 if lit_shell_env:
-    use_lit_shell = not lit.util.pythonize_bool(lit_shell_env)
+    use_lit_shell = lit.util.pythonize_bool(lit_shell_env)
 
 config.test_format = lit.formats.ShTest(execute_external=not use_lit_shell)
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 1bf6cdbb447a4c..b2dcc696b0ad06 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -4160,6 +4160,7 @@ cc_library(
         ":Option",
         ":Support",
         ":WindowsManifest",
+        ":config",
     ],
 )
 
@@ -4601,6 +4602,7 @@ cc_binary(
         ":Target",
         ":TargetParser",
         ":TransformUtils",
+        ":config",
     ],
 )