From c4b205c93a6403316bcf94a27a7a44b5e8861bcd Mon Sep 17 00:00:00 2001
From: william <we3223@gmail.com>
Date: Sun, 13 Jul 2025 21:53:59 +0800
Subject: [PATCH 1/2] X86: Remove LowerToHorizontalOp and update test cases

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 118 ----
 .../PhaseOrdering}/X86/haddsub-2.ll           | 195 +++++-
 .../PhaseOrdering}/X86/haddsub-shuf.ll        | 285 ++++++++-
 .../PhaseOrdering}/X86/haddsub-undef.ll       | 407 +++++++++++-
 .../PhaseOrdering}/X86/haddsub.ll             | 590 +++++++++++++++++-
 .../PhaseOrdering}/X86/phaddsub-undef.ll      |  78 ++-
 6 files changed, 1502 insertions(+), 171 deletions(-)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub-2.ll (81%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub-shuf.ll (73%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub-undef.ll (57%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/haddsub.ll (64%)
 rename llvm/test/{CodeGen => Transforms/PhaseOrdering}/X86/phaddsub-undef.ll (53%)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f8f29b9f2cdc7..677ecf8801e2d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8569,122 +8569,6 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
   return DAG.getNode(HOpcode, DL, VT, V0, V1);
 }
 
-/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
-static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
-                                   const X86Subtarget &Subtarget,
-                                   SelectionDAG &DAG) {
-  // We need at least 2 non-undef elements to make this worthwhile by default.
-  unsigned NumNonUndefs =
-      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
-  if (NumNonUndefs < 2)
-    return SDValue();
-
-  // There are 4 sets of horizontal math operations distinguished by type:
-  // int/FP at 128-bit/256-bit. Each type was introduced with a different
-  // subtarget feature. Try to match those "native" patterns first.
-  MVT VT = BV->getSimpleValueType(0);
-  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
-      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
-      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
-      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
-    unsigned HOpcode;
-    SDValue V0, V1;
-    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
-      return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
-  }
-
-  // Try harder to match 256-bit ops by using extract/concat.
-  if (!Subtarget.hasAVX() || !VT.is256BitVector())
-    return SDValue();
-
-  // Count the number of UNDEF operands in the build_vector in input.
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned Half = NumElts / 2;
-  unsigned NumUndefsLO = 0;
-  unsigned NumUndefsHI = 0;
-  for (unsigned i = 0, e = Half; i != e; ++i)
-    if (BV->getOperand(i)->isUndef())
-      NumUndefsLO++;
-
-  for (unsigned i = Half, e = NumElts; i != e; ++i)
-    if (BV->getOperand(i)->isUndef())
-      NumUndefsHI++;
-
-  SDValue InVec0, InVec1;
-  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
-    SDValue InVec2, InVec3;
-    unsigned X86Opcode;
-    bool CanFold = true;
-
-    if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
-        isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
-                              InVec3) &&
-        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      X86Opcode = X86ISD::HADD;
-    else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
-                                   InVec1) &&
-             isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
-                                   InVec3) &&
-             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
-             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
-      X86Opcode = X86ISD::HSUB;
-    else
-      CanFold = false;
-
-    if (CanFold) {
-      // Do not try to expand this build_vector into a pair of horizontal
-      // add/sub if we can emit a pair of scalar add/sub.
-      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
-        return SDValue();
-
-      // Convert this build_vector into a pair of horizontal binops followed by
-      // a concat vector. We must adjust the outputs from the partial horizontal
-      // matching calls above to account for undefined vector halves.
-      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
-      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
-      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
-      bool isUndefLO = NumUndefsLO == Half;
-      bool isUndefHI = NumUndefsHI == Half;
-      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
-                                   isUndefHI);
-    }
-  }
-
-  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
-      VT == MVT::v16i16) {
-    unsigned X86Opcode;
-    if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
-                              InVec1))
-      X86Opcode = X86ISD::HADD;
-    else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
-                                   InVec1))
-      X86Opcode = X86ISD::HSUB;
-    else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
-                                   InVec1))
-      X86Opcode = X86ISD::FHADD;
-    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
-                                   InVec1))
-      X86Opcode = X86ISD::FHSUB;
-    else
-      return SDValue();
-
-    // Don't try to expand this build_vector into a pair of horizontal add/sub
-    // if we can simply emit a pair of scalar add/sub.
-    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
-      return SDValue();
-
-    // Convert this build_vector into two horizontal add/sub followed by
-    // a concat vector.
-    bool isUndefLO = NumUndefsLO == Half;
-    bool isUndefHI = NumUndefsHI == Half;
-    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
-                                 isUndefLO, isUndefHI);
-  }
-
-  return SDValue();
-}
-
 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG);
 
@@ -9270,8 +9154,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
     return AddSub;
-  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
-    return HorizontalOp;
   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
     return Broadcast;
   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-2.ll
similarity index 81%
rename from llvm/test/CodeGen/X86/haddsub-2.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub-2.ll
index bca446fa8fb56..4eb5bdba9edb6 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-2.ll
@@ -1,38 +1,39 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
-; SSE-LABEL: hadd_ps_test1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    haddps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_ps_test1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
-; AVX-LABEL: hadd_ps_test1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+
   %vecext = extractelement <4 x float> %A, i32 0
   %vecext1 = extractelement <4 x float> %A, i32 1
   %add = fadd float %vecext, %vecext1
   %vecinit = insertelement <4 x float> undef, float %add, i32 0
+
   %vecext2 = extractelement <4 x float> %A, i32 2
   %vecext3 = extractelement <4 x float> %A, i32 3
   %add4 = fadd float %vecext2, %vecext3
   %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
+
   %vecext6 = extractelement <4 x float> %B, i32 0
   %vecext7 = extractelement <4 x float> %B, i32 1
   %add8 = fadd float %vecext6, %vecext7
   %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
+
   %vecext10 = extractelement <4 x float> %B, i32 2
   %vecext11 = extractelement <4 x float> %B, i32 3
   %add12 = fadd float %vecext10, %vecext11
   %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
+
   ret <4 x float> %vecinit13
 }
 
+
 define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: hadd_ps_test2:
 ; SSE:       # %bb.0:
@@ -43,6 +44,13 @@ define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_ps_test2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 2
   %vecext1 = extractelement <4 x float> %A, i32 3
   %add = fadd float %vecext, %vecext1
@@ -72,6 +80,13 @@ define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_ps_test1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 0
   %vecext1 = extractelement <4 x float> %A, i32 1
   %sub = fsub float %vecext, %vecext1
@@ -101,6 +116,13 @@ define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_ps_test2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 2
   %vecext1 = extractelement <4 x float> %A, i32 3
   %sub = fsub float %vecext, %vecext1
@@ -159,6 +181,13 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phadd_d_test1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 0
   %vecext1 = extractelement <4 x i32> %A, i32 1
   %add = add i32 %vecext, %vecext1
@@ -217,6 +246,13 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phadd_d_test2(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 2
   %vecext1 = extractelement <4 x i32> %A, i32 3
   %add = add i32 %vecext, %vecext1
@@ -275,6 +311,13 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phsub_d_test1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 0
   %vecext1 = extractelement <4 x i32> %A, i32 1
   %sub = sub i32 %vecext, %vecext1
@@ -333,6 +376,13 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @phsub_d_test2(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 2
   %vecext1 = extractelement <4 x i32> %A, i32 3
   %sub = sub i32 %vecext, %vecext1
@@ -362,6 +412,13 @@ define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_pd_test1(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %A, i32 0
   %vecext1 = extractelement <2 x double> %A, i32 1
   %add = fadd double %vecext, %vecext1
@@ -383,6 +440,13 @@ define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_pd_test2(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %A, i32 1
   %vecext1 = extractelement <2 x double> %A, i32 0
   %add = fadd double %vecext, %vecext1
@@ -404,6 +468,13 @@ define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsub_pd_test1(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %A, i32 0
   %vecext1 = extractelement <2 x double> %A, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -425,6 +496,13 @@ define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsub_pd_test2(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %B, i32 0
   %vecext1 = extractelement <2 x double> %B, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -456,6 +534,13 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
 ; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_vhadd_pd_test(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %A, i32 0
   %vecext1 = extractelement <4 x double> %A, i32 1
   %add = fadd double %vecext, %vecext1
@@ -495,6 +580,13 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
 ; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_vhsub_pd_test(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %A, i32 0
   %vecext1 = extractelement <4 x double> %A, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -590,6 +682,13 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
 ; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @avx2_vphadd_d_test(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
   %vecext = extractelement <8 x i32> %A, i32 0
   %vecext1 = extractelement <8 x i32> %A, i32 1
   %add = add i32 %vecext, %vecext1
@@ -745,6 +844,13 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @avx2_vphadd_w_test(
+; CHECK-SAME: <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
   %vecext = extractelement <16 x i16> %a, i32 0
   %vecext1 = extractelement <16 x i16> %a, i32 1
   %add = add i16 %vecext, %vecext1
@@ -863,6 +969,13 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
 ; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
 ; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @not_a_hsub_1(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+;
   %vecext = extractelement <4 x i32> %A, i32 0
   %vecext1 = extractelement <4 x i32> %A, i32 1
   %sub = sub i32 %vecext, %vecext1
@@ -920,6 +1033,13 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @not_a_hsub_2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
   %vecext = extractelement <4 x float> %A, i32 2
   %vecext1 = extractelement <4 x float> %A, i32 3
   %sub = fsub float %vecext, %vecext1
@@ -960,6 +1080,13 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
 ; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @not_a_hsub_3(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[VECINIT2:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[VECINIT2]]
+;
   %vecext = extractelement <2 x double> %B, i32 0
   %vecext1 = extractelement <2 x double> %B, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -985,6 +1112,13 @@ define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @avx_vhadd_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -1031,6 +1165,13 @@ define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @avx_vhsub_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %sub = fsub float %vecext, %vecext1
@@ -1077,6 +1218,13 @@ define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_hadd_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
   %add = fadd double %vecext, %vecext1
@@ -1107,6 +1255,13 @@ define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @avx_hsub_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
   %vecext = extractelement <4 x double> %a, i32 0
   %vecext1 = extractelement <4 x double> %a, i32 1
   %sub = fsub double %vecext, %vecext1
@@ -1202,6 +1357,13 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @avx2_hadd_d(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -1355,6 +1517,13 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @avx2_hadd_w(
+; CHECK-SAME: <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
   %vecext = extractelement <16 x i16> %a, i32 0
   %vecext1 = extractelement <16 x i16> %a, i32 1
   %add = add i16 %vecext, %vecext1
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-shuf.ll
similarity index 73%
rename from llvm/test/CodeGen/X86/haddsub-shuf.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub-shuf.ll
index 364ad953a11d4..f425550c1c6df 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-shuf.ll
@@ -1,15 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3           | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSSE3,SSSE3_SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSSE3,SSSE3_FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST
-
-; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
-; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <4 x float> @hadd_v4f32(<4 x float> %a) {
 ; SSE-LABEL: hadd_v4f32:
@@ -21,6 +11,13 @@ define <4 x float> @hadd_v4f32(<4 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_v4f32(
+; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUF]]
+;
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fadd <2 x float> %a02, %a13
@@ -65,6 +62,13 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) {
 ; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hadd_v8f32a(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = fadd <4 x float> %a0, %a1
@@ -83,6 +87,13 @@ define <8 x float> @hadd_v8f32b(<8 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hadd_v8f32b(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fadd <8 x float> %a0, %a1
@@ -100,6 +111,13 @@ define <4 x float> @hsub_v4f32(<4 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_v4f32(
+; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUF]]
+;
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fsub <2 x float> %a02, %a13
@@ -144,6 +162,13 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) {
 ; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hsub_v8f32a(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = fsub <4 x float> %a0, %a1
@@ -162,6 +187,13 @@ define <8 x float> @hsub_v8f32b(<8 x float> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hsub_v8f32b(
+; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x float> [[SHUF]]
+;
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fsub <8 x float> %a0, %a1
@@ -206,6 +238,13 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[SHUF]]
+;
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fadd <2 x double> %a0, %a1
@@ -250,6 +289,13 @@ define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hadd_v2f64_scalar_splat(
+; CHECK-SAME: <2 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[SHUF]]
+;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
   %hop = fadd double %a0, %a1
@@ -281,6 +327,13 @@ define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_v4f64_scalar_splat(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
   %hop0 = fadd double %a0, %a1
@@ -335,6 +388,13 @@ define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
 ; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    vbroadcastsd %xmm0, %ymm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_v4f64_scalar_broadcast(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
   %hop0 = fadd double %a0, %a1
@@ -370,6 +430,13 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_v4f64(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fadd <4 x double> %a0, %a1
@@ -414,6 +481,12 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX2_FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsub_v2f64(
+; CHECK-SAME: <2 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <2 x double> [[TMP1]], [[A]]
+; CHECK-NEXT:    ret <2 x double> [[SHUF]]
+;
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fsub <2 x double> %a0, %a1
@@ -444,6 +517,13 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hsub_v4f64(
+; CHECK-SAME: <4 x double> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[SHUF]]
+;
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fsub <4 x double> %a0, %a1
@@ -468,6 +548,13 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hadd_v4i32(
+; CHECK-SAME: <4 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = add <4 x i32> %a02, %a13
@@ -524,6 +611,13 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
 ; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hadd_v8i32a(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = add <4 x i32> %a0, %a1
@@ -560,6 +654,13 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hadd_v8i32b(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = add <8 x i32> %a0, %a1
@@ -584,6 +685,13 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hsub_v4i32(
+; CHECK-SAME: <4 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 1, i32 poison>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = sub <4 x i32> %a02, %a13
@@ -640,6 +748,13 @@ define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
 ; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hsub_v8i32a(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 2, i32 poison, i32 poison, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 3, i32 poison, i32 poison, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %hop = sub <4 x i32> %a0, %a1
@@ -676,6 +791,13 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hsub_v8i32b(
+; CHECK-SAME: <8 x i32> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 0, i32 2, i32 0, i32 2, i32 4, i32 6, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 3, i32 1, i32 3, i32 5, i32 7, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = sub <8 x i32> %a0, %a1
@@ -705,6 +827,13 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i16> @hadd_v8i16(
+; CHECK-SAME: <8 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[SHUF]]
+;
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <8 x i16> %a0246, %a1357
@@ -768,6 +897,13 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
 ; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hadd_v16i16a(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   %hop = add <8 x i16> %a0, %a1
@@ -820,6 +956,13 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hadd_v16i16b(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <16 x i16> %a0, %a1
@@ -845,6 +988,13 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i16> @hsub_v8i16(
+; CHECK-SAME: <8 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 4, i32 poison, i32 poison, i32 2, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 5, i32 poison, i32 poison, i32 3, i32 poison, i32 7>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <8 x i16> [[SHUF]]
+;
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <8 x i16> %a0246, %a1357
@@ -908,6 +1058,13 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
 ; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hsub_v16i16a(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   %hop = sub <8 x i16> %a0, %a1
@@ -960,6 +1117,13 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hsub_v16i16b(
+; CHECK-SAME: <16 x i16> [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[SHUF:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <16 x i16> [[SHUF]]
+;
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <16 x i16> %a0, %a1
@@ -985,6 +1149,12 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
 ; AVX2-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @broadcast_haddps_v4f32(
+; CHECK-SAME: <4 x float> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> [[A0]], <4 x float> [[A0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
   %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %2
@@ -1002,6 +1172,13 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[VECINIT13:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
   %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
   %t2 = fadd <2 x float> %t0, %t1
@@ -1022,6 +1199,13 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[VECINIT13:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
   %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
   %t2 = fadd <4 x float> %t0, %t1
@@ -1051,6 +1235,13 @@ define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hadd_4f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = fadd <4 x float> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x float> [[HADD]]
+;
   %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1074,6 +1265,13 @@ define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsub_4f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HSUB0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HSUB1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HSUB:%.*]] = fadd <4 x float> [[HSUB0]], [[HSUB1]]
+; CHECK-NEXT:    ret <4 x float> [[HSUB]]
+;
   %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1113,6 +1311,13 @@ define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hadd_4i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = add <4 x i32> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x i32> [[HADD]]
+;
   %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1152,6 +1357,13 @@ define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x i32> @hsub_4i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HSUB0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 2, i32 2, i32 6, i32 6>
+; CHECK-NEXT:    [[HSUB1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> poison, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+; CHECK-NEXT:    [[HSUB:%.*]] = add <4 x i32> [[HSUB0]], [[HSUB1]]
+; CHECK-NEXT:    ret <4 x i32> [[HSUB]]
+;
   %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
   %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1185,6 +1397,13 @@ define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1)
 ; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hadd_4f64_v4f64_shuffle(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = fadd <4 x double> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x double> [[HADD]]
+;
   %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1213,6 +1432,13 @@ define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1)
 ; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @hsub_4f64_v4f64_shuffle(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <4 x double> [[A0]], <4 x double> [[A1]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[HADD:%.*]] = fsub <4 x double> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <4 x double> [[HADD]]
+;
   %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1241,6 +1467,13 @@ define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
 ; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hadd_8f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HADD:%.*]] = fadd <8 x float> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <8 x float> [[HADD]]
+;
   %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1269,6 +1502,13 @@ define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
 ; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @hsub_8f32_v8f32_shuffle(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HSUB0:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HSUB1:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HSUB:%.*]] = fadd <8 x float> [[HSUB0]], [[HSUB1]]
+; CHECK-NEXT:    ret <8 x float> [[HSUB]]
+;
   %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1312,6 +1552,13 @@ define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hadd_8i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HADD:%.*]] = add <8 x i32> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <8 x i32> [[HADD]]
+;
   %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1356,6 +1603,13 @@ define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @hsub_8i32_v8i32_shuffle(
+; CHECK-SAME: <8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[HADD:%.*]] = sub <8 x i32> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <8 x i32> [[HADD]]
+;
   %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -1413,6 +1667,13 @@ define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
+; CHECK-LABEL: define <16 x i16> @hadd_16i16_16i16_shuffle(
+; CHECK-SAME: <16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[HADD0:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> [[A1]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[HADD1:%.*]] = shufflevector <16 x i16> [[A0]], <16 x i16> [[A1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT:    [[HADD:%.*]] = add <16 x i16> [[HADD0]], [[HADD1]]
+; CHECK-NEXT:    ret <16 x i16> [[HADD]]
+;
   %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %hadd0 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-undef.ll
similarity index 57%
rename from llvm/test/CodeGen/X86/haddsub-undef.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub-undef.ll
index 94fa81742ba71..678b0a10717ac 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub-undef.ll
@@ -1,12 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE,SSE-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE,SSE-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST
-
-; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: test1_undef:
@@ -18,6 +11,19 @@ define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test1_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x float> [[TMP1]], float undef, i64 2
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 2, i32 poison>
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[VECINIT13:%.*]] = shufflevector <4 x float> [[VECINIT5]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -43,6 +49,19 @@ define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test2_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x float> [[TMP1]], float undef, i64 1
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT9:%.*]] = shufflevector <4 x float> [[VECINIT3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[VECINIT13:%.*]] = shufflevector <4 x float> [[VECINIT9]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT13]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -68,6 +87,19 @@ define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test3_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x float> [[TMP1]], float undef, i64 3
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 poison, i32 3>
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[B]], [[SHIFT2]]
+; CHECK-NEXT:    [[VECINIT9:%.*]] = shufflevector <4 x float> [[VECINIT5]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT9]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -105,6 +137,13 @@ define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test4_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -135,6 +174,13 @@ define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @test5_undef(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <2 x double> [[TMP1]], double undef, i64 1
+; CHECK-NEXT:    ret <2 x double> [[VECINIT1]]
+;
   %vecext = extractelement <2 x double> %a, i32 0
   %vecext1 = extractelement <2 x double> %a, i32 1
   %add = fadd double %vecext, %vecext1
@@ -152,6 +198,16 @@ define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test6_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float poison, float undef, float undef>, <4 x i32> <i32 0, i32 poison, i32 6, i32 7>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -173,6 +229,16 @@ define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test7_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float undef, float undef, float poison, float poison>, <4 x i32> <i32 4, i32 5, i32 0, i32 poison>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %b, i32 0
   %vecext1 = extractelement <4 x float> %b, i32 1
   %add = fadd float %vecext, %vecext1
@@ -218,6 +284,16 @@ define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test8_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 0, i32 5, i32 poison, i32 7>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -239,6 +315,16 @@ define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @test9_undef(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <4 x float> [[VECINIT]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[VECINIT5]]
+;
   %vecext = extractelement <4 x float> %a, i32 0
   %vecext1 = extractelement <4 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -260,6 +346,16 @@ define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test10_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float poison, float undef, float undef, float poison, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 poison, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x float> [[VECINIT]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[VECINIT5]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -292,6 +388,16 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test11_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float poison, float undef, float undef, float undef, float undef, float undef, float poison, float undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x float> [[VECINIT]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 12, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[VECINIT5]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -313,6 +419,16 @@ define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test12_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float poison, float poison, float undef, float undef, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 poison, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x float> [[VECINIT]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 10, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[VECINIT5]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add = fadd float %vecext, %vecext1
@@ -335,6 +451,14 @@ define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @test13_undef(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP4]]
+;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %a, i32 1
   %add1 = fadd float %vecext, %vecext1
@@ -389,6 +513,14 @@ define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
 ; AVX512-SLOW-NEXT:    vaddss %xmm0, %xmm2, %xmm0
 ; AVX512-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512-SLOW-NEXT:    retq
+; CHECK-LABEL: define <16 x float> @test13_v16f32_undef(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    ret <16 x float> [[TMP4]]
+;
   %vecext = extractelement <16 x float> %a, i32 0
   %vecext1 = extractelement <16 x float> %a, i32 1
   %add1 = fadd float %vecext, %vecext1
@@ -429,6 +561,12 @@ define <2 x double> @add_pd_003(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @add_pd_003(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[X]], [[L]]
+; CHECK-NEXT:    ret <2 x double> [[ADD]]
+;
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
@@ -459,6 +597,12 @@ define <2 x double> @add_pd_003_2(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @add_pd_003_2(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x double> [[X]], [[L]]
+; CHECK-NEXT:    ret <2 x double> [[ADD]]
+;
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
@@ -481,6 +625,12 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @add_pd_010(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <2 x double> [[TMP1]], [[X]]
+; CHECK-NEXT:    ret <2 x double> [[SHUFFLE2]]
+;
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -497,6 +647,13 @@ define <4 x float> @add_ps_007(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_007(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[L]], [[R]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -530,6 +687,13 @@ define <4 x float> @add_ps_030(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_030(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 3, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE2]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -547,6 +711,13 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_007_2(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[L]], [[R]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
@@ -575,6 +746,12 @@ define <4 x float> @add_ps_008(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_008(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[X]], [[L]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   ret <4 x float> %add
@@ -593,6 +770,13 @@ define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3]
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_016(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP0]], <4 x i32> <i32 2, i32 0, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP0]], <4 x i32> <i32 3, i32 1, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 0, i32 6>
   %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 1, i32 7>
   %5 = fadd <2 x float> %3, %4
@@ -630,6 +814,13 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_017(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE2]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -660,6 +851,12 @@ define <4 x float> @add_ps_018(<4 x float> %x) {
 ; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @add_ps_018(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = fadd <4 x float> [[TMP1]], [[X]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE2]]
+;
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
@@ -704,6 +901,13 @@ define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) {
 ; AVX512-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @add_pd_011(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x double> [[TMP5]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 undef, i32 5, i32 undef>
   %5 = fadd <4 x double> %3, %4
@@ -722,6 +926,18 @@ define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_inputs_v4f32_output_0101(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R0:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 0, i32 5, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R0]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
   %b0 = extractelement <8 x float> %b, i32 0
@@ -744,6 +960,17 @@ define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_input0_v4f32_output_0123(
+; CHECK-SAME: <8 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[R0:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
   %b2 = extractelement <4 x float> %b, i32 2
@@ -766,6 +993,17 @@ define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_input1_v4f32_output_2301(
+; CHECK-SAME: <4 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> <float undef, float poison, float poison, float undef>, <4 x i32> <i32 4, i32 2, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
   %b0 = extractelement <8 x float> %b, i32 0
@@ -788,6 +1026,18 @@ define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v8f32_inputs_v4f32_output_2323(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> <float undef, float poison, float undef, float poison>, <4 x i32> <i32 4, i32 2, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a2 = extractelement <8 x float> %a, i32 2
   %a3 = extractelement <8 x float> %a, i32 3
   %b2 = extractelement <8 x float> %b, i32 2
@@ -822,6 +1072,18 @@ define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float
 ; AVX512-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @v16f32_inputs_v4f32_output_0123(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R0:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[R0]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a0 = extractelement <16 x float> %a, i32 0
   %a1 = extractelement <16 x float> %a, i32 1
   %b2 = extractelement <16 x float> %b, i32 2
@@ -853,6 +1115,18 @@ define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @v16f32_inputs_v8f32_output_4567(
+; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> <float undef, float undef, float undef, float undef, float poison, float undef, float undef, float poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a4 = extractelement <16 x float> %a, i32 4
   %a5 = extractelement <16 x float> %a, i32 5
   %b6 = extractelement <16 x float> %b, i32 6
@@ -874,6 +1148,16 @@ define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @PR40243(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[R4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> <float undef, float undef, float undef, float undef, float poison, float undef, float undef, float poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a4 = extractelement <8 x float> %a, i32 4
   %a5 = extractelement <8 x float> %a, i32 5
   %add4 = fadd float %a4, %a5
@@ -921,6 +1205,13 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
 ; AVX512-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR44694(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x double> [[TMP5]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %5 = fadd <4 x double> %3, %4
@@ -952,6 +1243,13 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR45747_1(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
+;
   %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
   %t1 = fadd <4 x float> %t0, %a
   %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
@@ -985,6 +1283,13 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR45747_2(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
+;
   %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
   %t1 = fadd <4 x float> %t0, %b
   %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
@@ -1001,6 +1306,13 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_u123(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 poison, i32 2, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
   %4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
   %5 = fadd <2 x float> %3, %4
@@ -1040,6 +1352,13 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_0u23(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 1, i32 poison, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %4 = fadd <4 x float> %3, %0
   %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1061,6 +1380,13 @@ define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_01u3(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 2, i32 poison, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 1, i32 3, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x float> %3, %4
@@ -1081,6 +1407,13 @@ define <4 x float> @PR34724_add_v4f32_012u(<4 x float> %0, <4 x float> %1) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @PR34724_add_v4f32_012u(
+; CHECK-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x float> %3, %4
@@ -1129,6 +1462,20 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX-FAST-NEXT:    vhaddpd %ymm0, %ymm1, %ymm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_u123(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 2, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 3, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double undef, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[SHIFT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
   %5 = fadd <2 x double> %3, %4
@@ -1176,6 +1523,20 @@ define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_0u23(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double undef, i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[SHIFT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
   %5 = fadd <2 x double> %3, %4
@@ -1230,6 +1591,20 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
 ; AVX512-FAST-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX512-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
 ; AVX512-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_01u3(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double undef, i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 2, i32 poison>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[SHIFT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x double> %3, %4
@@ -1276,6 +1651,20 @@ define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @PR34724_add_v4f64_012u(
+; CHECK-SAME: <4 x double> [[TMP0:%.*]], <4 x double> [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double undef, i64 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 poison, i32 3>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x double> [[TMP1]], [[SHIFT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
   %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
   %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
   %5 = fadd <2 x double> %3, %4
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/haddsub.ll
similarity index 64%
rename from llvm/test/CodeGen/X86/haddsub.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/haddsub.ll
index a0778195b5c73..91289087689ef 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/haddsub.ll
@@ -1,12 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3               | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops     | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f            | FileCheck %s --check-prefixes=AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
 ; SSE3-LABEL: haddpd1:
@@ -18,6 +11,13 @@ define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @haddpd1(
+; CHECK-SAME: <2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
   %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
   %r = fadd <2 x double> %a, %b
@@ -34,6 +34,13 @@ define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @haddpd2(
+; CHECK-SAME: <2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
   %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
   %r = fadd <2 x double> %a, %b
@@ -63,6 +70,13 @@ define <2 x double> @haddpd3(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @haddpd3(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fadd <2 x double> %a, %b
@@ -79,6 +93,13 @@ define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps1(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %r = fadd <4 x float> %a, %b
@@ -95,6 +116,13 @@ define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps2(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[X]], <4 x i32> <i32 4, i32 7, i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
   %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
   %r = fadd <4 x float> %a, %b
@@ -111,6 +139,13 @@ define <4 x float> @haddps3(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps3(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float undef, float poison, float undef, float poison>, <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fadd <4 x float> %a, %b
@@ -127,6 +162,13 @@ define <4 x float> @haddps4(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps4(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -143,6 +185,13 @@ define <4 x float> @haddps5(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps5(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -171,6 +220,13 @@ define <4 x float> @haddps6(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps6(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -187,6 +243,13 @@ define <4 x float> @haddps7(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @haddps7(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -203,6 +266,13 @@ define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsubpd1(
+; CHECK-SAME: <2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> [[Y]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fsub <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
   %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
   %r = fsub <2 x double> %a, %b
@@ -232,6 +302,13 @@ define <2 x double> @hsubpd2(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <2 x double> @hsubpd2(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fsub <2 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fsub <2 x double> %a, %b
@@ -248,6 +325,13 @@ define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps1(
+; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
   %r = fsub <4 x float> %a, %b
@@ -264,6 +348,13 @@ define <4 x float> @hsubps2(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps2(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float undef, float poison, float undef, float poison>, <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> <float poison, float undef, float poison, float undef>, <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fsub <4 x float> %a, %b
@@ -280,6 +371,13 @@ define <4 x float> @hsubps3(<4 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps3(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -308,6 +406,13 @@ define <4 x float> @hsubps4(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define <4 x float> @hsubps4(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -325,6 +430,13 @@ define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhaddps1(
+; CHECK-SAME: <8 x float> [[X:%.*]], <8 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
   %r = fadd <8 x float> %a, %b
@@ -342,6 +454,13 @@ define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhaddps2(
+; CHECK-SAME: <8 x float> [[X:%.*]], <8 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[Y]], <8 x float> [[X]], <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
   %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
   %r = fadd <8 x float> %a, %b
@@ -359,6 +478,13 @@ define <8 x float> @vhaddps3(<8 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhaddps3(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float undef, float poison, float undef, float poison, float poison, float poison, float undef, float poison>, <8 x i32> <i32 poison, i32 2, i32 8, i32 10, i32 4, i32 6, i32 poison, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float poison, float undef, float poison, float poison, float poison, float undef, float poison, float undef>, <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fadd <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fadd <8 x float> %a, %b
@@ -376,6 +502,13 @@ define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhsubps1(
+; CHECK-SAME: <8 x float> [[X:%.*]], <8 x float> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> [[Y]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
   %r = fsub <8 x float> %a, %b
@@ -393,6 +526,13 @@ define <8 x float> @vhsubps3(<8 x float> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x float> @vhsubps3(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float undef, float poison, float undef, float poison, float poison, float poison, float undef, float poison>, <8 x i32> <i32 poison, i32 2, i32 8, i32 10, i32 4, i32 6, i32 poison, i32 14>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <8 x float> [[X]], <8 x float> <float poison, float undef, float poison, float poison, float poison, float undef, float poison, float undef>, <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT:    [[R:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fsub <8 x float> %a, %b
@@ -410,6 +550,13 @@ define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @vhaddpd1(
+; CHECK-SAME: <4 x double> [[X:%.*]], <4 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fadd <4 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
   %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %r = fadd <4 x double> %a, %b
@@ -427,6 +574,13 @@ define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <4 x double> @vhsubpd1(
+; CHECK-SAME: <4 x double> [[X:%.*]], <4 x double> [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <4 x double> [[X]], <4 x double> [[Y]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = fsub <4 x double> [[A]], [[B]]
+; CHECK-NEXT:    ret <4 x double> [[R]]
+;
   %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %r = fsub <4 x double> %a, %b
@@ -443,6 +597,13 @@ define <2 x float> @haddps_v2f32(<4 x float> %v0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <2 x float> @haddps_v2f32(
+; CHECK-SAME: <4 x float> [[V0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[V0]], <4 x float> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[V0]], <4 x float> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret <2 x float> [[TMP6]]
+;
   %v0.0 = extractelement <4 x float> %v0, i32 0
   %v0.1 = extractelement <4 x float> %v0, i32 1
   %v0.2 = extractelement <4 x float> %v0, i32 2
@@ -478,6 +639,13 @@ define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fadd float %x0, %x1
@@ -511,6 +679,13 @@ define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fadd_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fadd float %x0, %x1
@@ -539,6 +714,13 @@ define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fadd float %x1, %x0
@@ -572,6 +754,13 @@ define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fadd_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fadd float %x1, %x0
@@ -601,6 +790,13 @@ define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fadd_f64(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fadd double %x0, %x1
@@ -630,6 +826,13 @@ define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fadd_f64_commute(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fadd double %x1, %x0
@@ -658,6 +861,13 @@ define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fsub_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fsub float %x0, %x1
@@ -692,6 +902,13 @@ define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fsub_f32(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fsub float %x0, %x1
@@ -711,6 +928,13 @@ define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fsub_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   %x01 = fsub float %x1, %x0
@@ -732,6 +956,13 @@ define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v4f32_fsub_f32_commute(
+; CHECK-SAME: <4 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 2
   %x1 = extractelement <4 x float> %x, i32 3
   %x01 = fsub float %x1, %x0
@@ -761,6 +992,13 @@ define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fsub_f64(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fsub double %x0, %x1
@@ -781,6 +1019,13 @@ define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
 ; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v2f64_fsub_f64_commute(
+; CHECK-SAME: <2 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <2 x double> %x, i32 0
   %x1 = extractelement <2 x double> %x, i32 1
   %x01 = fsub double %x1, %x0
@@ -813,6 +1058,13 @@ define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fadd_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fadd float %x0, %x1
@@ -848,6 +1100,13 @@ define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v8f32_fadd_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 2
   %x1 = extractelement <8 x float> %x, i32 3
   %x01 = fadd float %x0, %x1
@@ -885,6 +1144,13 @@ define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract67_v8f32_fadd_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 6
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 6
   %x1 = extractelement <8 x float> %x, i32 7
   %x01 = fadd float %x0, %x1
@@ -915,6 +1181,13 @@ define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fadd_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fadd float %x1, %x0
@@ -950,6 +1223,13 @@ define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v8f32_fadd_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 2
   %x1 = extractelement <8 x float> %x, i32 3
   %x01 = fadd float %x1, %x0
@@ -987,6 +1267,13 @@ define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract67_v8f32_fadd_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 6
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 6
   %x1 = extractelement <8 x float> %x, i32 7
   %x01 = fadd float %x1, %x0
@@ -1018,6 +1305,13 @@ define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fadd_f64(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fadd double %x0, %x1
@@ -1052,6 +1346,13 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract23_v4f64_fadd_f64(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 2
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 2
   %x1 = extractelement <4 x double> %x, i32 3
   %x01 = fadd double %x0, %x1
@@ -1083,6 +1384,13 @@ define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fadd_f64_commute(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fadd double %x1, %x0
@@ -1117,6 +1425,13 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract23_v4f64_fadd_f64_commute(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 2
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 2
   %x1 = extractelement <4 x double> %x, i32 3
   %x01 = fadd double %x1, %x0
@@ -1147,6 +1462,13 @@ define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fsub_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fsub float %x0, %x1
@@ -1183,6 +1505,13 @@ define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract23_v8f32_fsub_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 2
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 2
   %x1 = extractelement <8 x float> %x, i32 3
   %x01 = fsub float %x0, %x1
@@ -1217,6 +1546,13 @@ define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract45_v8f32_fsub_f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 4
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 4
   %x1 = extractelement <8 x float> %x, i32 5
   %x01 = fsub float %x0, %x1
@@ -1239,6 +1575,13 @@ define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v8f32_fsub_f32_commute(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <8 x float> %x, i32 0
   %x1 = extractelement <8 x float> %x, i32 1
   %x01 = fsub float %x1, %x0
@@ -1270,6 +1613,13 @@ define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fsub_f64(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fsub double %x0, %x1
@@ -1293,6 +1643,13 @@ define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v4f64_fsub_f64_commute(
+; CHECK-SAME: <4 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[X]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <4 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <4 x double> %x, i32 0
   %x1 = extractelement <4 x double> %x, i32 1
   %x01 = fsub double %x1, %x0
@@ -1325,6 +1682,13 @@ define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fadd_f32(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fadd float %x0, %x1
@@ -1355,6 +1719,13 @@ define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fadd_f32_commute(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fadd float %x1, %x0
@@ -1386,6 +1757,13 @@ define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fadd_f64(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fadd double %x0, %x1
@@ -1417,6 +1795,13 @@ define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
 ; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fadd_f64_commute(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fadd double %x1, %x0
@@ -1447,6 +1832,13 @@ define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fsub_f32(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fsub float %x0, %x1
@@ -1467,6 +1859,13 @@ define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v16f32_fsub_f32_commute(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <16 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <16 x float> %x, i32 0
   %x1 = extractelement <16 x float> %x, i32 1
   %x01 = fsub float %x1, %x0
@@ -1498,6 +1897,13 @@ define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
 ; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fsub_f64(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[X]], [[SHIFT]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fsub double %x0, %x1
@@ -1519,6 +1925,13 @@ define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
 ; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define double @extract_extract01_v8f64_fsub_f64_commute(
+; CHECK-SAME: <8 x double> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x double> [[X]], <8 x double> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[X01:%.*]] = extractelement <8 x double> [[TMP1]], i64 0
+; CHECK-NEXT:    ret double [[X01]]
+;
   %x0 = extractelement <8 x double> %x, i32 0
   %x1 = extractelement <8 x double> %x, i32 1
   %x01 = fsub double %x1, %x0
@@ -1553,6 +1966,14 @@ define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, ptr %p) {
 ; AVX-FAST-NEXT:    vmovss %xmm0, (%rdi)
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_uses1(
+; CHECK-SAME: <4 x float> [[X:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[X]], i64 0
+; CHECK-NEXT:    store float [[X0]], ptr [[P]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[X]], i64 1
+; CHECK-NEXT:    [[X01:%.*]] = fadd float [[X0]], [[X1]]
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   store float %x0, ptr %p
   %x1 = extractelement <4 x float> %x, i32 1
@@ -1587,6 +2008,14 @@ define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, ptr %p) {
 ; AVX-FAST-NEXT:    vextractps $1, %xmm0, (%rdi)
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_uses2(
+; CHECK-SAME: <4 x float> [[X:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[X]], i64 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[X]], i64 1
+; CHECK-NEXT:    store float [[X1]], ptr [[P]], align 4
+; CHECK-NEXT:    [[X01:%.*]] = fadd float [[X0]], [[X1]]
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   %x1 = extractelement <4 x float> %x, i32 1
   store float %x1, ptr %p
@@ -1610,6 +2039,15 @@ define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, ptr %p1, pt
 ; AVX-NEXT:    vmovss %xmm1, (%rsi)
 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @extract_extract01_v4f32_fadd_f32_uses3(
+; CHECK-SAME: <4 x float> [[X:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P1:%.*]], ptr writeonly captures(none) initializes((0, 4)) [[P2:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[X]], i64 0
+; CHECK-NEXT:    store float [[X0]], ptr [[P1]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[X]], i64 1
+; CHECK-NEXT:    store float [[X1]], ptr [[P2]], align 4
+; CHECK-NEXT:    [[X01:%.*]] = fadd float [[X0]], [[X1]]
+; CHECK-NEXT:    ret float [[X01]]
+;
   %x0 = extractelement <4 x float> %x, i32 0
   store float %x0, ptr %p1
   %x1 = extractelement <4 x float> %x, i32 1
@@ -1665,6 +2103,11 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-FAST-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @fadd_reduce_v8f32(
+; CHECK-SAME: float [[A0:%.*]], <8 x float> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v8f32(float [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    ret float [[R]]
+;
   %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
   ret float %r
 }
@@ -1704,6 +2147,11 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
 ; AVX-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define double @fadd_reduce_v4f64(
+; CHECK-SAME: double [[A0:%.*]], <4 x double> [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[R:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v4f64(double [[A0]], <4 x double> [[A1]])
+; CHECK-NEXT:    ret double [[R]]
+;
   %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
   ret double %r
 }
@@ -1760,6 +2208,19 @@ define float @PR39936_v8f32(<8 x float>) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @PR39936_v8f32(
+; CHECK-SAME: <8 x float> [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <8 x float> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x float> [[TMP9]], i64 0
+; CHECK-NEXT:    ret float [[TMP10]]
+;
   %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %4 = fadd <8 x float> %2, %3
@@ -1804,6 +2265,15 @@ define float @hadd32_4(<4 x float> %x225) {
 ; AVX-FAST-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_4(
+; CHECK-SAME: <4 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <4 x float> [[X225]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <4 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <4 x float> [[X227]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <4 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <4 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %x227 = fadd <4 x float> %x225, %x226
   %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1846,6 +2316,15 @@ define float @hadd32_8(<8 x float> %x225) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_8(
+; CHECK-SAME: <8 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <8 x float> [[X225]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <8 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <8 x float> [[X227]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <8 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <8 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <8 x float> %x225, %x226
   %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1888,6 +2367,15 @@ define float @hadd32_16(<16 x float> %x225) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_16(
+; CHECK-SAME: <16 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <16 x float> [[X225]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <16 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <16 x float> [[X227]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <16 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <16 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <16 x float> %x225, %x226
   %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1911,6 +2399,15 @@ define float @hadd32_4_optsize(<4 x float> %x225) optsize {
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_4_optsize(
+; CHECK-SAME: <4 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <4 x float> [[X225]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <4 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <4 x float> [[X227]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <4 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <4 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %x227 = fadd <4 x float> %x225, %x226
   %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1935,6 +2432,15 @@ define float @hadd32_8_optsize(<8 x float> %x225) optsize {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_8_optsize(
+; CHECK-SAME: <8 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <8 x float> [[X225]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <8 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <8 x float> [[X227]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <8 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <8 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <8 x float> %x225, %x226
   %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1959,6 +2465,15 @@ define float @hadd32_16_optsize(<16 x float> %x225) optsize {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_16_optsize(
+; CHECK-SAME: <16 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <16 x float> [[X225]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <16 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <16 x float> [[X227]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <16 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <16 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <16 x float> %x225, %x226
   %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1982,6 +2497,15 @@ define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_4_pgso(
+; CHECK-SAME: <4 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] !prof [[PROF14:![0-9]+]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <4 x float> [[X225]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <4 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <4 x float> [[X227]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <4 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <4 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
   %x227 = fadd <4 x float> %x225, %x226
   %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -2006,6 +2530,15 @@ define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_8_pgso(
+; CHECK-SAME: <8 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] !prof [[PROF14]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <8 x float> [[X225]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <8 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <8 x float> [[X227]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <8 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <8 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <8 x float> %x225, %x226
   %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2030,6 +2563,15 @@ define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define float @hadd32_16_pgso(
+; CHECK-SAME: <16 x float> [[X225:%.*]]) local_unnamed_addr #[[ATTR0]] !prof [[PROF14]] {
+; CHECK-NEXT:    [[X226:%.*]] = shufflevector <16 x float> [[X225]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X227:%.*]] = fadd <16 x float> [[X225]], [[X226]]
+; CHECK-NEXT:    [[X228:%.*]] = shufflevector <16 x float> [[X227]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X229:%.*]] = fadd <16 x float> [[X227]], [[X228]]
+; CHECK-NEXT:    [[X230:%.*]] = extractelement <16 x float> [[X229]], i64 0
+; CHECK-NEXT:    ret float [[X230]]
+;
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x227 = fadd <16 x float> %x225, %x226
   %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2071,6 +2613,15 @@ define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @partial_reduction_fadd_v8f32(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X23:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0213:%.*]] = fadd <8 x float> [[X]], [[X23]]
+; CHECK-NEXT:    [[X13:%.*]] = shufflevector <8 x float> [[X0213]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0123:%.*]] = fadd reassoc nsz <8 x float> [[X13]], [[X0213]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <8 x float> [[X0123]], i64 0
+; CHECK-NEXT:    ret float [[R]]
+;
   %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x0213 = fadd <8 x float> %x, %x23
   %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2116,6 +2667,15 @@ define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @partial_reduction_fadd_v8f32_wrong_flags(
+; CHECK-SAME: <8 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X23:%.*]] = shufflevector <8 x float> [[X]], <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0213:%.*]] = fadd fast <8 x float> [[X23]], [[X]]
+; CHECK-NEXT:    [[X13:%.*]] = shufflevector <8 x float> [[X0213]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0123:%.*]] = fadd nnan ninf <8 x float> [[X0213]], [[X13]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <8 x float> [[X0123]], i64 0
+; CHECK-NEXT:    ret float [[R]]
+;
   %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x0213 = fadd fast <8 x float> %x, %x23
   %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2157,6 +2717,15 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
 ; AVX-FAST-NEXT:    retq
+; CHECK-LABEL: define float @partial_reduction_fadd_v16f32(
+; CHECK-SAME: <16 x float> [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[X23:%.*]] = shufflevector <16 x float> [[X]], <16 x float> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0213:%.*]] = fadd <16 x float> [[X]], [[X23]]
+; CHECK-NEXT:    [[X13:%.*]] = shufflevector <16 x float> [[X0213]], <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[X0123:%.*]] = fadd reassoc nsz <16 x float> [[X13]], [[X0213]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x float> [[X0123]], i64 0
+; CHECK-NEXT:    ret float [[R]]
+;
   %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %x0213 = fadd <16 x float> %x, %x23
   %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2181,3 +2750,6 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
 !12 = !{i32 999000, i64 100, i32 1}
 !13 = !{i32 999999, i64 1, i32 2}
 !14 = !{!"function_entry_count", i64 0}
+;.
+; CHECK: [[PROF14]] = !{!"function_entry_count", i64 0}
+;.
diff --git a/llvm/test/CodeGen/X86/phaddsub-undef.ll b/llvm/test/Transforms/PhaseOrdering/X86/phaddsub-undef.ll
similarity index 53%
rename from llvm/test/CodeGen/X86/phaddsub-undef.ll
rename to llvm/test/Transforms/PhaseOrdering/X86/phaddsub-undef.ll
index 8aa40939994fd..3cfd1b797209c 100644
--- a/llvm/test/CodeGen/X86/phaddsub-undef.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/phaddsub-undef.ll
@@ -1,14 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3              | FileCheck %s --check-prefixes=SSE,SSE-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops    | FileCheck %s --check-prefixes=SSE,SSE-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl           | FileCheck %s --check-prefixes=AVX,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
-; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
 
 define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: test14_undef:
@@ -20,6 +12,16 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test14_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 poison, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x i32> [[VECINIT]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -87,6 +89,16 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test15_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x i32> [[VECINIT]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 12, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -121,6 +133,16 @@ define <8 x i32> @PR40243_alt(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @PR40243_alt(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[SHIFT1]], [[B]]
+; CHECK-NEXT:    [[R4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[R4]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
   %a4 = extractelement <8 x i32> %a, i32 4
   %a5 = extractelement <8 x i32> %a, i32 5
   %add4 = add i32 %a4, %a5
@@ -142,6 +164,16 @@ define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test16_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 0, i32 poison, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <8 x i32> [[VECINIT]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 10, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -163,6 +195,16 @@ define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
+; CHECK-LABEL: define <16 x i32> @test16_v16i32_undef(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[VECINIT:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> <i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <16 x i32> <i32 0, i32 poison, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <16 x i32> [[A]], [[SHIFT1]]
+; CHECK-NEXT:    [[VECINIT5:%.*]] = shufflevector <16 x i32> [[VECINIT]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 18, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i32> [[VECINIT5]]
+;
   %vecext = extractelement <16 x i32> %a, i32 0
   %vecext1 = extractelement <16 x i32> %a, i32 1
   %add = add i32 %vecext, %vecext1
@@ -197,6 +239,14 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <8 x i32> @test17_undef(
+; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
+;
   %vecext = extractelement <8 x i32> %a, i32 0
   %vecext1 = extractelement <8 x i32> %a, i32 1
   %add1 = add i32 %vecext, %vecext1
@@ -239,6 +289,14 @@ define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
+; CHECK-LABEL: define <16 x i32> @test17_v16i32_undef(
+; CHECK-SAME: <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
+;
   %vecext = extractelement <16 x i32> %a, i32 0
   %vecext1 = extractelement <16 x i32> %a, i32 1
   %add1 = add i32 %vecext, %vecext1

From 3d12ab451a1ba436419b1074e5292196fcf13e3e Mon Sep 17 00:00:00 2001
From: william <we3223@gmail.com>
Date: Sat, 19 Jul 2025 11:43:28 +0800
Subject: [PATCH 2/2] [DAG] Add m_SelectCCLike matcher and update test cases

---
 llvm/include/llvm/CodeGen/SDPatternMatch.h    |  15 +-
 .../llvm/CodeGen/SelectionDAGISelMatchers.h   |  53 +++++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 218 +++++++++++++-----
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |  26 ++-
 llvm/test/CodeGen/RISCV/rv64zbb.ll            |  32 ++-
 5 files changed, 265 insertions(+), 79 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/SelectionDAGISelMatchers.h

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 2967532226197..d20b5d80670e2 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -93,7 +93,8 @@ struct Value_match {
 
   explicit Value_match(SDValue Match) : MatchVal(Match) {}
 
-  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+  template <typename MatchContext>
+  bool match(const MatchContext &, SDValue N) const {
     if (MatchVal)
       return MatchVal == N;
     return N.getNode();
@@ -130,7 +131,8 @@ struct DeferredValue_match {
 
   explicit DeferredValue_match(SDValue &Match) : MatchVal(Match) {}
 
-  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+  template <typename MatchContext>
+  bool match(const MatchContext &, SDValue N) const {
     return N == MatchVal;
   }
 };
@@ -196,7 +198,8 @@ struct Value_bind {
 
   explicit Value_bind(SDValue &N) : BindVal(N) {}
 
-  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+  template <typename MatchContext>
+  bool match(const MatchContext &, SDValue N) const {
     BindVal = N;
     return true;
   }
@@ -975,8 +978,7 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_BitCast(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::BITCAST, Op);
 }
 
-template <typename Opnd>
-inline UnaryOpc_match<Opnd> m_BSwap(const Opnd &Op) {
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_BSwap(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::BSWAP, Op);
 }
 
@@ -1203,7 +1205,8 @@ struct CondCode_match {
 
   explicit CondCode_match(ISD::CondCode *CC) : BindCC(CC) {}
 
-  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+  template <typename MatchContext>
+  bool match(const MatchContext &, SDValue N) const {
     if (auto *CC = dyn_cast<CondCodeSDNode>(N.getNode())) {
       if (CCToMatch && *CCToMatch != CC->get())
         return false;
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISelMatchers.h b/llvm/include/llvm/CodeGen/SelectionDAGISelMatchers.h
new file mode 100644
index 0000000000000..926aca0bed904
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISelMatchers.h
@@ -0,0 +1,53 @@
+namespace llvm {
+namespace SDPatternMatch {
+
+// 1. SelectCC_match: matches (select_cc LHS, RHS, TrueV, FalseV, CondCode).
+template <typename LTy, typename RTy, typename TTy, typename FTy, typename CCTy>
+struct SelectCC_match {
+  // Sub-matchers are stored by value so a matcher object that outlives the
+  // expression that built it does not hold dangling references.
+  LTy L;
+  RTy R;
+  TTy T;
+  FTy F;
+  CCTy CC;
+
+  SelectCC_match(const LTy &l, const RTy &r, const TTy &t, const FTy &f,
+                 const CCTy &cc)
+      : L(l), R(r), T(t), F(f), CC(cc) {}
+
+  /// Returns true if \p V is a SELECT_CC node whose operands 0-4 satisfy L, R,
+  /// T, F and CC. Takes the match context like the other SDPatternMatch
+  /// matchers; the condition-code operand is passed on as an SDValue.
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue V) const {
+    return V.getOpcode() == ISD::SELECT_CC &&
+           L.match(Ctx, V.getOperand(0)) && R.match(Ctx, V.getOperand(1)) &&
+           T.match(Ctx, V.getOperand(2)) && F.match(Ctx, V.getOperand(3)) &&
+           CC.match(Ctx, V.getOperand(4));
+  }
+};
+
+// 2. m_SelectCC: builds a SelectCC_match from the five operand sub-matchers.
+template <typename LTy, typename RTy, typename TTy, typename FTy, typename CCTy>
+inline SelectCC_match<LTy, RTy, TTy, FTy, CCTy>
+m_SelectCC(const LTy &L, const RTy &R,
+           const TTy &T, const FTy &F,
+           const CCTy &CC) {
+  return SelectCC_match<LTy, RTy, TTy, FTy, CCTy>(L, R, T, F, CC);
+}
+
+// 3. m_SelectCCLike: matches select(setcc(L, R, CC), T, F) or select_cc.
+template <typename LTy, typename RTy, typename TTy, typename FTy, typename CCTy>
+inline auto m_SelectCCLike(const LTy &L, const RTy &R,
+                           const TTy &T, const FTy &F,
+                           const CCTy &CC) {
+  return m_AnyOf(
+    m_Select(m_SetCC(L, R, CC), T, F),
+    m_SelectCC(L, R, T, F, CC)
+  );
+}
+
+} // namespace SDPatternMatch
+} // namespace llvm
+
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fed5e7238433e..9a83847bb01c5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -264,6 +264,47 @@ namespace {
             VT.getSizeInBits().getKnownMinValue() >= MaximumLegalStoreInBits)
           MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinValue();
     }
+    
+    template <class LTy, class RTy, class TTy, class FTy, class CCTy>
+    struct SelectCC_match {
+      LTy L;   // Matches operand 0 of the SELECT_CC node.
+      RTy R;   // Matches operand 1.
+      TTy T;   // Matches operand 2.
+      FTy F;   // Matches operand 3.
+      CCTy CC; // Matches operand 4.
+
+      SelectCC_match(LTy L, RTy R, TTy T, FTy F, CCTy CC)
+          : L(std::move(L)), R(std::move(R)), T(std::move(T)), F(std::move(F)),
+            CC(std::move(CC)) {}
+
+      template <typename MatchContext>
+      bool match(const MatchContext &Ctx, SDValue V) const {
+        return V.getOpcode() == ISD::SELECT_CC &&
+               L.match(Ctx, V.getOperand(0)) && R.match(Ctx, V.getOperand(1)) &&
+               T.match(Ctx, V.getOperand(2)) && F.match(Ctx, V.getOperand(3)) &&
+               CC.match(Ctx, V.getOperand(4));
+      }
+    };
+
+    /// Builds a SelectCC_match holding the sub-matchers by decayed value type.
+    template <class LTy, class RTy, class TTy, class FTy, class CCTy>
+    inline auto m_SelectCC(LTy &&L, RTy &&R, TTy &&T, FTy &&F, CCTy &&CC) {
+      return SelectCC_match<std::decay_t<LTy>, std::decay_t<RTy>,
+                            std::decay_t<TTy>, std::decay_t<FTy>,
+                            std::decay_t<CCTy>>(
+          std::forward<LTy>(L), std::forward<RTy>(R), std::forward<TTy>(T),
+          std::forward<FTy>(F), std::forward<CCTy>(CC));
+    }
+
+    /// Matches (select (setcc L, R, CC), T, F) or (select_cc L, R, T, F, CC).
+    template <typename LTy, typename RTy, typename TTy, typename FTy,
+              typename CCTy>
+    inline auto m_SelectCCLike(LTy &&L, RTy &&R, TTy &&T, FTy &&F, CCTy &&CC) {
+      // Sub-matchers feed both alternatives: copy them, never move from them.
+      return SDPatternMatch::m_AnyOf(
+          SDPatternMatch::m_Select(SDPatternMatch::m_SetCC(L, R, CC), T, F),
+          m_SelectCC(L, R, T, F, CC));
+    }
 
     void ConsiderForPruning(SDNode *N) {
       // Mark this for potential pruning.
@@ -640,6 +681,7 @@ namespace {
     SDValue mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex);
     SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
     SDValue combineInsertEltToLoad(SDNode *N, unsigned InsIndex);
+    SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
     SDValue BuildUDIV(SDNode *N);
@@ -2608,7 +2650,9 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
       return SDValue();
   }
 
-  return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF, BO->getFlags());
+  SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
+  SelectOp->setFlags(BO->getFlags());
+  return SelectOp;
 }
 
 static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, const SDLoc &DL,
@@ -4300,8 +4344,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
     return V;
 
   // (A - B) - 1  ->  add (xor B, -1), A
-  if (sd_match(N, m_Sub(m_OneUse(m_Sub(m_Value(A), m_Value(B))),
-                        m_One(/*AllowUndefs=*/true))))
+  if (sd_match(N, m_Sub(m_OneUse(m_Sub(m_Value(A), m_Value(B))), m_One())))
     return DAG.getNode(ISD::ADD, DL, VT, A, DAG.getNOT(DL, B, VT));
 
   // Look for:
@@ -9155,7 +9198,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
   if (Op.getOpcode() != ISD::LOAD && VectorIndex.has_value())
     return std::nullopt;
 
-  unsigned BitWidth = Op.getScalarValueSizeInBits();
+  unsigned BitWidth = Op.getValueSizeInBits();
   if (BitWidth % 8 != 0)
     return std::nullopt;
   unsigned ByteWidth = BitWidth / 8;
@@ -9254,7 +9297,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
     if (!L->isSimple() || L->isIndexed())
       return std::nullopt;
 
-    unsigned NarrowBitWidth = L->getMemoryVT().getScalarSizeInBits();
+    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
     if (NarrowBitWidth % 8 != 0)
       return std::nullopt;
     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
@@ -9908,14 +9951,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     if (SDValue Combined = visitADDLike(N))
       return Combined;
 
-  // fold not (setcc x, y, cc) -> setcc x y !cc
-  // Avoid breaking: and (not(setcc x, y, cc), z) -> andn for vec
+  // fold !(x cc y) -> (x !cc y)
   unsigned N0Opcode = N0.getOpcode();
   SDValue LHS, RHS, CC;
   if (TLI.isConstTrueVal(N1) &&
-      isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/ true) &&
-      !(VT.isVector() && TLI.hasAndNot(SDValue(N, 0)) && N->hasOneUse() &&
-        N->use_begin()->getUser()->getOpcode() == ISD::AND)) {
+      isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/ true)) {
     ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                                LHS.getValueType());
     if (!LegalOperations ||
@@ -12221,8 +12261,11 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
     return V;
 
   // select (not Cond), N1, N2 -> select Cond, N2, N1
-  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
-    return DAG.getSelect(DL, VT, F, N2, N1, Flags);
+  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
+    SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1);
+    SelectOp->setFlags(Flags);
+    return SelectOp;
+  }
 
   if (SDValue V = foldSelectOfConstants(N))
     return V;
@@ -13102,10 +13145,10 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDValue Cond, SDValue TVal,
   EVT CondVT = Cond.getValueType();
   assert(CondVT.isVector() && "Vector select expects a vector selector!");
 
-  bool IsTAllZero = ISD::isConstantSplatVectorAllZeros(TVal.getNode());
-  bool IsTAllOne = ISD::isConstantSplatVectorAllOnes(TVal.getNode());
-  bool IsFAllZero = ISD::isConstantSplatVectorAllZeros(FVal.getNode());
-  bool IsFAllOne = ISD::isConstantSplatVectorAllOnes(FVal.getNode());
+  bool IsTAllZero = ISD::isBuildVectorAllZeros(TVal.getNode());
+  bool IsTAllOne = ISD::isBuildVectorAllOnes(TVal.getNode());
+  bool IsFAllZero = ISD::isBuildVectorAllZeros(FVal.getNode());
+  bool IsFAllOne = ISD::isBuildVectorAllOnes(FVal.getNode());
 
   // no vselect(cond, 0/-1, X) or vselect(cond, X, 0/-1), return
   if (!IsTAllZero && !IsTAllOne && !IsFAllZero && !IsFAllOne)
@@ -13179,15 +13222,6 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDValue Cond, SDValue TVal,
     return DAG.getBitcast(VT, And);
   }
 
-  // select Cond, 0, x -> and not(Cond), x
-  if (IsTAllZero &&
-      (isBitwiseNot(peekThroughBitcasts(Cond)) || TLI.hasAndNot(Cond))) {
-    SDValue X = DAG.getBitcast(CondVT, FVal);
-    SDValue And =
-        DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT), X);
-    return DAG.getBitcast(VT, And);
-  }
-
   return SDValue();
 }
 
@@ -13205,9 +13239,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
     return V;
 
   // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
-  if (!TLI.isTargetCanonicalSelect(N))
-    if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
-      return DAG.getSelect(DL, VT, F, N2, N1);
+  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
+    return DAG.getSelect(DL, VT, F, N2, N1);
 
   // select (sext m), (add X, C), X --> (add X, (and C, (sext m))))
   if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N2 && N1->hasOneUse() &&
@@ -13506,9 +13539,11 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
 
     // Fold to a simpler select_cc
     if (SCC.getOpcode() == ISD::SETCC) {
-      return DAG.getNode(ISD::SELECT_CC, DL, N2.getValueType(),
-                         SCC.getOperand(0), SCC.getOperand(1), N2, N3,
-                         SCC.getOperand(2), SCC->getFlags());
+      SDValue SelectOp =
+          DAG.getNode(ISD::SELECT_CC, DL, N2.getValueType(), SCC.getOperand(0),
+                      SCC.getOperand(1), N2, N3, SCC.getOperand(2));
+      SelectOp->setFlags(SCC->getFlags());
+      return SelectOp;
     }
   }
 
@@ -16458,8 +16493,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
         TLI.isTypeLegal(VT.getVectorElementType()))) &&
       N0.getOpcode() == ISD::BUILD_VECTOR && N0->hasOneUse() &&
       cast<BuildVectorSDNode>(N0)->isConstant())
-    return DAG.FoldConstantBuildVector(cast<BuildVectorSDNode>(N0), SDLoc(N),
-                                       VT.getVectorElementType());
+    return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
+                                             VT.getVectorElementType());
 
   // If the input is a constant, let getNode fold it.
   if (isIntOrFPConstant(N0)) {
@@ -16848,6 +16883,83 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   return DAG.getNode(N0.getOpcode(), DL, N0->getVTList(), Ops, SafeFlags);
 }
 
+/// Constant-fold a bitcast of BV (a build_vector of Constant, ConstantFP or
+/// Undef operands) into a build_vector whose elements have type DstEltVT.
+SDValue DAGCombiner::
+ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
+  EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
+
+  // If the element type already matches, return BV itself unchanged.
+  if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
+
+  unsigned SrcBitSize = SrcEltVT.getSizeInBits();
+  unsigned DstBitSize = DstEltVT.getSizeInBits();
+
+  // If this is a conversion of N elements of one type to N elements of another
+  // type, bitcast each element individually.  This handles FP<->INT cases.
+  if (SrcBitSize == DstBitSize) {
+    SmallVector<SDValue, 8> Ops;
+    for (SDValue Op : BV->op_values()) {
+      // If the vector element type is not legal, the BUILD_VECTOR operands
+      // are promoted and implicitly truncated.  Make that explicit here.
+      if (Op.getValueType() != SrcEltVT)
+        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
+      Ops.push_back(DAG.getBitcast(DstEltVT, Op));
+      AddToWorklist(Ops.back().getNode());
+    }
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+                              BV->getValueType(0).getVectorNumElements());
+    return DAG.getBuildVector(VT, SDLoc(BV), Ops);
+  }
+
+  // Otherwise, we're growing or shrinking the elements.  To avoid having to
+  // handle annoying details of growing/shrinking FP values, we convert them to
+  // int first.
+  if (SrcEltVT.isFloatingPoint()) {
+    // Convert the input float vector to an int vector whose elements are the
+    // same size, then continue with that integer build_vector as BV.
+    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
+    BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
+    SrcEltVT = IntVT;
+  }
+
+  // Now we know the input is an integer vector.  If the output is a FP type,
+  // convert to integer first, then to FP of the right size.
+  if (DstEltVT.isFloatingPoint()) {
+    EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
+    SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
+
+    // Next, convert to FP elements of the same size.
+    return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
+  }
+
+  // Okay, we know the src/dst types are both integers of differing sizes.
+  assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
+
+  // TODO: Should ConstantFoldBITCASTofBUILD_VECTOR always take a
+  // BuildVectorSDNode?
+  auto *BVN = cast<BuildVectorSDNode>(BV);
+
+  // Extract the constant raw bit data; give up with an empty SDValue on failure.
+  BitVector UndefElements;
+  SmallVector<APInt> RawBits;
+  bool IsLE = DAG.getDataLayout().isLittleEndian();
+  if (!BVN->getConstantRawBits(IsLE, DstBitSize, RawBits, UndefElements))
+    return SDValue();
+
+  SDLoc DL(BV);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned I = 0, E = RawBits.size(); I != E; ++I) {
+    if (UndefElements[I])
+      Ops.push_back(DAG.getUNDEF(DstEltVT));
+    else
+      Ops.push_back(DAG.getConstant(RawBits[I], DL, DstEltVT));
+  }
+
+  EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
+  return DAG.getBuildVector(VT, DL, Ops);
+}
+
 // Returns true if floating point contraction is allowed on the FMUL-SDValue
 // `N`
 static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
@@ -27643,11 +27755,6 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
     if (DAG.isConstantValueOfAnyType(N1.getOperand(0)) || N1.hasOneUse())
       return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, N1.getOperand(0));
 
-  // insert_subvector (splat X), (splat X), N2 -> splat X
-  if (N0.getOpcode() == ISD::SPLAT_VECTOR && N0.getOpcode() == N1.getOpcode() &&
-      N0.getOperand(0) == N1.getOperand(0))
-    return N0;
-
   // If we are inserting a bitcast value into an undef, with the same
   // number of elements, just use the bitcast input of the extract.
   // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
@@ -28216,16 +28323,14 @@ SDValue DAGCombiner::SimplifyVCastOp(SDNode *N, const SDLoc &DL) {
       TLI.preferScalarizeSplat(N)) {
     EVT SrcVT = N0.getValueType();
     EVT SrcEltVT = SrcVT.getVectorElementType();
-    if (!LegalTypes || TLI.isTypeLegal(SrcEltVT)) {
-      SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
-      SDValue Elt =
-          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcEltVT, Src0, IndexC);
-      SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, Elt, N->getFlags());
-      if (VT.isScalableVector())
-        return DAG.getSplatVector(VT, DL, ScalarBO);
-      SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
-      return DAG.getBuildVector(VT, DL, Ops);
-    }
+    SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
+    SDValue Elt =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcEltVT, Src0, IndexC);
+    SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, Elt, N->getFlags());
+    if (VT.isScalableVector())
+      return DAG.getSplatVector(VT, DL, ScalarBO);
+    SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
+    return DAG.getBuildVector(VT, DL, Ops);
   }
 
   return SDValue();
@@ -28367,8 +28472,10 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
                                   SCC.getOperand(0), SCC.getOperand(1),
                                   SCC.getOperand(4), Flags);
       AddToWorklist(SETCC.getNode());
-      return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
-                           SCC.getOperand(2), SCC.getOperand(3), Flags);
+      SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
+                                         SCC.getOperand(2), SCC.getOperand(3));
+      SelectNode->setFlags(Flags);
+      return SelectNode;
     }
 
     return SCC;
@@ -28669,9 +28776,9 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
     SDValue N10 = N1.getOperand(0);
     SDValue N20 = N2.getOperand(0);
     SDValue NewSel = DAG.getSelect(DL, N10.getValueType(), N0, N10, N20);
-    SDNodeFlags Flags = N1->getFlags() & N2->getFlags();
-    SDValue NewBinOp =
-        DAG.getNode(BinOpc, DL, OpVTs, {NewSel, N1.getOperand(1)}, Flags);
+    SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, NewSel, N1.getOperand(1));
+    NewBinOp->setFlags(N1->getFlags());
+    NewBinOp->intersectFlagsWith(N2->getFlags());
     return SDValue(NewBinOp.getNode(), N1.getResNo());
   }
 
@@ -28683,9 +28790,10 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
     // Second op VT might be different (e.g. shift amount type)
     if (N11.getValueType() == N21.getValueType()) {
       SDValue NewSel = DAG.getSelect(DL, N11.getValueType(), N0, N11, N21);
-      SDNodeFlags Flags = N1->getFlags() & N2->getFlags();
       SDValue NewBinOp =
-          DAG.getNode(BinOpc, DL, OpVTs, {N1.getOperand(0), NewSel}, Flags);
+          DAG.getNode(BinOpc, DL, OpVTs, N1.getOperand(0), NewSel);
+      NewBinOp->setFlags(N1->getFlags());
+      NewBinOp->intersectFlagsWith(N2->getFlags());
       return SDValue(NewBinOp.getNode(), N1.getResNo());
     }
   }
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 8dd63015971d0..40db4943aedcf 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1649,14 +1649,24 @@ define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
 }
 
 define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
-; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    sltu a2, a0, a1
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a1, a2, a1
-; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    sll a0, a0, a1
-; CHECK-NEXT:    ret
+; RV32I-LABEL: sub_if_uge_multiuse_select_i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sltu a2, a0, a1
+; RV32I-NEXT:    addi a2, a2, -1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: sub_if_uge_multiuse_select_i32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    sltu a2, a0, a1
+; RV32ZBB-NEXT:    addi a2, a2, -1
+; RV32ZBB-NEXT:    and a2, a2, a1
+; RV32ZBB-NEXT:    sub a1, a0, a1
+; RV32ZBB-NEXT:    minu a0, a0, a1
+; RV32ZBB-NEXT:    sll a0, a0, a2
+; RV32ZBB-NEXT:    ret
   %cmp = icmp ult i32 %x, %y
   %select = select i1 %cmp, i32 0, i32 %y
   %sub = sub nuw i32 %x, %select
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index e6407279870db..dc78ae1b998da 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1845,16 +1845,28 @@ define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
 }
 
 define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) {
-; CHECK-LABEL: sub_if_uge_multiuse_select_i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    sext.w a2, a1
-; CHECK-NEXT:    sext.w a3, a0
-; CHECK-NEXT:    sltu a2, a3, a2
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a1, a2, a1
-; CHECK-NEXT:    subw a0, a0, a1
-; CHECK-NEXT:    sllw a0, a0, a1
-; CHECK-NEXT:    ret
+; RV64I-LABEL: sub_if_uge_multiuse_select_i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    sext.w a2, a1
+; RV64I-NEXT:    sext.w a3, a0
+; RV64I-NEXT:    sltu a2, a3, a2
+; RV64I-NEXT:    addi a2, a2, -1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    subw a0, a0, a1
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: sub_if_uge_multiuse_select_i32:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    sext.w a2, a1
+; RV64ZBB-NEXT:    sext.w a3, a0
+; RV64ZBB-NEXT:    subw a0, a0, a1
+; RV64ZBB-NEXT:    sltu a2, a3, a2
+; RV64ZBB-NEXT:    addi a2, a2, -1
+; RV64ZBB-NEXT:    and a1, a2, a1
+; RV64ZBB-NEXT:    minu a0, a3, a0
+; RV64ZBB-NEXT:    sllw a0, a0, a1
+; RV64ZBB-NEXT:    ret
   %cmp = icmp ult i32 %x, %y
   %select = select i1 %cmp, i32 0, i32 %y
   %sub = sub nuw i32 %x, %select