diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 65528b3050fe5..ab40c35b50aed 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1885,6 +1885,12 @@ LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V);
 /// If \p V is not an extracted subvector, it is returned as-is.
 LLVM_ABI SDValue peekThroughExtractSubvectors(SDValue V);
 
+/// Recursively peek through INSERT_VECTOR_ELT nodes, returning the source
+/// vector operand of \p V, as long as \p V is an INSERT_VECTOR_ELT operation
+/// that does not insert into any of the demanded vector elts.
+LLVM_ABI SDValue peekThroughInsertVectorElt(SDValue V,
+                                            const APInt &DemandedElts);
+
 /// Return the non-truncated source operand of \p V if it exists.
 /// If \p V is not a truncation, it is returned as-is.
 LLVM_ABI SDValue peekThroughTruncates(SDValue V);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 17703f58f2824..65238e7b76da6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23281,6 +23281,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
 
   // Insert into out-of-bounds element is undefined.
+  // Code below relies on this special case being handled early.
   if (IndexC && VT.isFixedLengthVector() &&
       IndexC->getZExtValue() >= VT.getVectorNumElements())
     return DAG.getUNDEF(VT);
@@ -23291,14 +23292,28 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
       InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
     return InVec;
 
-  if (!IndexC) {
-    // If this is variable insert to undef vector, it might be better to splat:
-    // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
-    if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
-      return DAG.getSplat(VT, DL, InVal);
-    return SDValue();
+  // If this is variable insert to undef vector, it might be better to splat:
+  // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
+  if (!IndexC && InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
+    return DAG.getSplat(VT, DL, InVal);
+
+  // Try to drop insert of UNDEF/POISON elements. This is also done in getNode,
+  // but we also do it as a DAG combine since for example simplifications into
+  // SPLAT_VECTOR/BUILD_VECTOR may turn poison elements into undef/zero etc, and
+  // then suddenly the InVec is guaranteed to not be poison.
+  if (InVal.isUndef()) {
+    if (IndexC && VT.isFixedLengthVector()) {
+      APInt EltMask = APInt::getOneBitSet(VT.getVectorNumElements(),
+                                          IndexC->getZExtValue());
+      if (DAG.isGuaranteedNotToBePoison(InVec, EltMask))
+        return InVec;
+    }
+    return DAG.getFreeze(InVec);
   }
 
+  if (!IndexC)
+    return SDValue();
+
   if (VT.isScalableVector())
     return SDValue();
 
@@ -27779,18 +27794,42 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   SDValue N2 = N->getOperand(2);
   uint64_t InsIdx = N->getConstantOperandVal(2);
 
-  // If inserting an UNDEF, just return the original vector.
-  if (N1.isUndef())
-    return N0;
+  // If inserting an UNDEF, just return the original vector (unless it makes
+  // the result more poisonous).
+  if (N1.isUndef()) {
+    if (N1.getOpcode() == ISD::POISON)
+      return N0;
+    if (VT.isFixedLengthVector()) {
+      unsigned SubVecNumElts = N1.getValueType().getVectorNumElements();
+      APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(), InsIdx,
+                                        InsIdx + SubVecNumElts);
+      if (DAG.isGuaranteedNotToBePoison(N0, EltMask))
+        return N0;
+    }
+    return DAG.getFreeze(N0);
+  }
 
-  // If this is an insert of an extracted vector into an undef vector, we can
-  // just use the input to the extract if the types match, and can simplify
+  // If this is an insert of an extracted vector into an undef/poison vector, we
+  // can just use the input to the extract if the types match, and can simplify
   // in some cases even if they don't.
   if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       N1.getOperand(1) == N2) {
+    EVT N1VT = N1.getValueType();
     EVT SrcVT = N1.getOperand(0).getValueType();
-    if (SrcVT == VT)
-      return N1.getOperand(0);
+    if (SrcVT == VT) {
+      // Need to ensure that result isn't more poisonous if skipping both the
+      // extract+insert.
+      if (N0.getOpcode() == ISD::POISON)
+        return N1.getOperand(0);
+      if (VT.isFixedLengthVector() && N1VT.isFixedLengthVector()) {
+        unsigned SubVecNumElts = N1VT.getVectorNumElements();
+        APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(), InsIdx,
+                                          InsIdx + SubVecNumElts);
+        if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0), ~EltMask))
+          return N1.getOperand(0);
+      } else if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0)))
+        return N1.getOperand(0);
+    }
     // TODO: To remove the zero check, need to adjust the offset to
     // a multiple of the new src type.
     if (isNullConstant(N2)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4b7fc45908119..d97551d3dae99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5519,8 +5519,9 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
       APInt InVecDemandedElts = DemandedElts;
       InVecDemandedElts.clearBit(IndexC->getZExtValue());
       if (!!InVecDemandedElts &&
-          !isGuaranteedNotToBeUndefOrPoison(InVec, InVecDemandedElts,
-                                            PoisonOnly, Depth + 1))
+          !isGuaranteedNotToBeUndefOrPoison(
+              peekThroughInsertVectorElt(InVec, InVecDemandedElts),
+              InVecDemandedElts, PoisonOnly, Depth + 1))
         return false;
       return true;
     }
@@ -8219,23 +8220,42 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
     // for scalable vectors where we will generate appropriate code to
    // deal with out-of-bounds cases correctly.
-    if (N3C && N1.getValueType().isFixedLengthVector() &&
-        N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+    if (N3C && VT.isFixedLengthVector() &&
+        N3C->getZExtValue() >= VT.getVectorNumElements())
       return getUNDEF(VT);
 
     // Undefined index can be assumed out-of-bounds, so that's UNDEF too.
     if (N3.isUndef())
       return getUNDEF(VT);
 
-    // If the inserted element is an UNDEF, just use the input vector.
-    if (N2.isUndef())
+    // If inserting poison, just use the input vector.
+    if (N2.getOpcode() == ISD::POISON)
       return N1;
 
+    // Inserting undef into undef/poison is still undef.
+    if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
+      return getUNDEF(VT);
+
+    // If the inserted element is an UNDEF, just use the input vector.
+    // But not if skipping the insert could make the result more poisonous.
+    if (N2.isUndef()) {
+      if (N3C && VT.isFixedLengthVector()) {
+        APInt EltMask =
+            APInt::getOneBitSet(VT.getVectorNumElements(), N3C->getZExtValue());
+        if (isGuaranteedNotToBePoison(N1, EltMask))
+          return N1;
+      } else if (isGuaranteedNotToBePoison(N1))
+        return N1;
+    }
     break;
   }
   case ISD::INSERT_SUBVECTOR: {
-    // Inserting undef into undef is still undef.
-    if (N1.isUndef() && N2.isUndef())
+    // If inserting poison, just use the input vector.
+    if (N2.getOpcode() == ISD::POISON)
+      return N1;
+
+    // Inserting undef into undef/poison is still undef.
+    if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
       return getUNDEF(VT);
 
     EVT N2VT = N2.getValueType();
@@ -8264,11 +8284,37 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (VT == N2VT)
       return N2;
 
-    // If this is an insert of an extracted vector into an undef vector, we
-    // can just use the input to the extract.
+    // If this is an insert of an extracted vector into an undef/poison vector,
+    // we can just use the input to the extract. But not if skipping the
+    // extract+insert could make the result more poisonous.
     if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-        N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
-      return N2.getOperand(0);
+        N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) {
+      if (N1.getOpcode() == ISD::POISON)
+        return N2.getOperand(0);
+      if (VT.isFixedLengthVector() && N2VT.isFixedLengthVector()) {
+        unsigned LoBit = N3->getAsZExtVal();
+        unsigned HiBit = LoBit + N2VT.getVectorNumElements();
+        APInt EltMask =
+            APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit);
+        if (isGuaranteedNotToBePoison(N2.getOperand(0), ~EltMask))
+          return N2.getOperand(0);
+      } else if (isGuaranteedNotToBePoison(N2.getOperand(0)))
+        return N2.getOperand(0);
+    }
+
+    // If the inserted subvector is UNDEF, just use the input vector.
+    // But not if skipping the insert could make the result more poisonous.
+    if (N2.isUndef()) {
+      if (VT.isFixedLengthVector()) {
+        unsigned LoBit = N3->getAsZExtVal();
+        unsigned HiBit = LoBit + N2VT.getVectorNumElements();
+        APInt EltMask =
+            APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit);
+        if (isGuaranteedNotToBePoison(N1, EltMask))
+          return N1;
+      } else if (isGuaranteedNotToBePoison(N1))
+        return N1;
+    }
     break;
   }
   case ISD::BITCAST:
@@ -12777,6 +12823,23 @@ SDValue llvm::peekThroughExtractSubvectors(SDValue V) {
   return V;
 }
 
+SDValue llvm::peekThroughInsertVectorElt(SDValue V, const APInt &DemandedElts) {
+  while (V.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+    SDValue InVec = V.getOperand(0);
+    SDValue EltNo = V.getOperand(2);
+    EVT VT = InVec.getValueType();
+    auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+    if (IndexC && VT.isFixedLengthVector() &&
+        IndexC->getAPIntValue().ult(VT.getVectorNumElements()) &&
+        !DemandedElts[IndexC->getZExtValue()]) {
+      V = InVec;
+      continue;
+    }
+    break;
+  }
+  return V;
+}
+
 SDValue llvm::peekThroughTruncates(SDValue V) {
   while (V.getOpcode() == ISD::TRUNCATE)
     V = V.getOperand(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 911bbabc42aa3..4c0e60740746a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3440,8 +3440,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     break;
   }
   case ISD::INSERT_SUBVECTOR: {
-    // Demand any elements from the subvector and the remainder from the src its
-    // inserted into.
+    // Demand any elements from the subvector and the remainder from the src it
+    // is inserted into.
     SDValue Src = Op.getOperand(0);
     SDValue Sub = Op.getOperand(1);
     uint64_t Idx = Op.getConstantOperandVal(2);
@@ -3450,6 +3450,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     APInt DemandedSrcElts = DemandedElts;
     DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);
 
+    // If none of the sub operand elements are demanded, bypass the insert.
+    if (!DemandedSubElts)
+      return TLO.CombineTo(Op, Src);
+
     APInt SubUndef, SubZero;
     if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
                                    Depth + 1))
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
index 82802c79c7085..c6fff3e3d3181 100644
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -57,8 +57,8 @@ define void @widen_f16_build_vector(ptr %addr) {
 ; CHECK-LABEL: widen_f16_build_vector:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #13294 // =0x33ee
-; CHECK-NEXT:    movk w8, #13294, lsl #16
-; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    dup v0.4h, w8
+; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
   store <2 x half> <half 0xH33EE, half 0xH33EE>, ptr %addr, align 2
   ret void
diff --git a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll
index 34899cb47dba3..545da98034527 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll
@@ -94,16 +94,14 @@ define i32 @combine_undef_add_8xi32(i32 %a, i32 %b, i32 %c, i32 %d) local_unname
 ; CHECK-LABEL: combine_undef_add_8xi32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    mov v1.s[1], w1
-; CHECK-NEXT:    uhadd v0.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    mov v1.s[2], w2
 ; CHECK-NEXT:    mov v1.s[3], w3
-; CHECK-NEXT:    xtn v2.4h, v1.4s
-; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
-; CHECK-NEXT:    uhadd v1.4h, v2.4h, v1.4h
-; CHECK-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-NEXT:    uaddlv s0, v1.8h
+; CHECK-NEXT:    uzp2 v2.8h, v1.8h, v0.8h
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    uhadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    uaddlv s0, v0.8h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %a1 = insertelement <8 x i32> poison, i32 %a, i32 0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 093e6cd9328c8..ebd32c73ec65b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -1198,11 +1198,15 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p2.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x2]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    ld1d { z0.d }, p2/z, [x1]
 ; CHECK-NEXT:    punpklo p2.h, p1.b
+; CHECK-NEXT:    mov z1.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ld1w { z0.d }, p2/z, [z0.d]
+; CHECK-NEXT:    and z1.s, z1.s, #0x1
+; CHECK-NEXT:    cmpne p1.s, p1/z, z1.s, #0
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x2]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index aba9056c78cda..5aa3a246d7616 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING1
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING2
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN
 ; Check that the default value enables the web folding and
@@ -8,20 +8,35 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING
 
 define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) {
-; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users:
-; NO_FOLDING:       # %bb.0:
-; NO_FOLDING-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING-NEXT:    vfwcvt.f.f.v v11, v8
-; NO_FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
-; NO_FOLDING-NEXT:    vfwcvt.f.f.v v9, v10
-; NO_FOLDING-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; NO_FOLDING-NEXT:    vfmul.vv v10, v11, v8
-; NO_FOLDING-NEXT:    vfadd.vv v11, v11, v9
-; NO_FOLDING-NEXT:    vfsub.vv v8, v8, v9
-; NO_FOLDING-NEXT:    vse32.v v10, (a0)
-; NO_FOLDING-NEXT:    vse32.v v11, (a1)
-; NO_FOLDING-NEXT:    vse32.v v8, (a2)
-; NO_FOLDING-NEXT:    ret
+; NO_FOLDING1-LABEL: vfwmul_v2f116_multiple_users:
+; NO_FOLDING1:       # %bb.0:
+; NO_FOLDING1-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v11, v8
+; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v9, v10
+; NO_FOLDING1-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; NO_FOLDING1-NEXT:    vfmul.vv v10, v11, v8
+; NO_FOLDING1-NEXT:    vfadd.vv v11, v11, v9
+; NO_FOLDING1-NEXT:    vfsub.vv v8, v8, v9
+; NO_FOLDING1-NEXT:    vse32.v v10, (a0)
+; NO_FOLDING1-NEXT:    vse32.v v11, (a1)
+; NO_FOLDING1-NEXT:    vse32.v v8, (a2)
+; NO_FOLDING1-NEXT:    ret
+;
+; NO_FOLDING2-LABEL: vfwmul_v2f116_multiple_users:
+; NO_FOLDING2:       # %bb.0:
+; NO_FOLDING2-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v11, v8
+; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING2-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; NO_FOLDING2-NEXT:    vfmul.vv v9, v11, v8
+; NO_FOLDING2-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; NO_FOLDING2-NEXT:    vfwadd.wv v11, v11, v10
+; NO_FOLDING2-NEXT:    vfwsub.wv v8, v8, v10
+; NO_FOLDING2-NEXT:    vse32.v v9, (a0)
+; NO_FOLDING2-NEXT:    vse32.v v11, (a1)
+; NO_FOLDING2-NEXT:    vse32.v v8, (a2)
+; NO_FOLDING2-NEXT:    ret
 ;
 ; ZVFH-LABEL: vfwmul_v2f116_multiple_users:
 ; ZVFH:       # %bb.0:
@@ -61,20 +76,35 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a,
 }
 
 define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
-; NO_FOLDING-LABEL: vfwmul_v2f32_multiple_users:
-; NO_FOLDING:       # %bb.0:
-; NO_FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; NO_FOLDING-NEXT:    vfwcvt.f.f.v v11, v8
-; NO_FOLDING-NEXT:    vfwcvt.f.f.v v8, v9
-; NO_FOLDING-NEXT:    vfwcvt.f.f.v v9, v10
-; NO_FOLDING-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; NO_FOLDING-NEXT:    vfmul.vv v10, v11, v8
-; NO_FOLDING-NEXT:    vfadd.vv v11, v11, v9
-; NO_FOLDING-NEXT:    vfsub.vv v8, v8, v9
-; NO_FOLDING-NEXT:    vse64.v v10, (a0)
-; NO_FOLDING-NEXT:    vse64.v v11, (a1)
-; NO_FOLDING-NEXT:    vse64.v v8, (a2)
-; NO_FOLDING-NEXT:    ret
+; NO_FOLDING1-LABEL: vfwmul_v2f32_multiple_users:
+; NO_FOLDING1:       # %bb.0:
+; NO_FOLDING1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v11, v8
+; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING1-NEXT:    vfwcvt.f.f.v v9, v10
+; NO_FOLDING1-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING1-NEXT:    vfmul.vv v10, v11, v8
+; NO_FOLDING1-NEXT:    vfadd.vv v11, v11, v9
+; NO_FOLDING1-NEXT:    vfsub.vv v8, v8, v9
+; NO_FOLDING1-NEXT:    vse64.v v10, (a0)
+; NO_FOLDING1-NEXT:    vse64.v v11, (a1)
+; NO_FOLDING1-NEXT:    vse64.v v8, (a2)
+; NO_FOLDING1-NEXT:    ret
+;
+; NO_FOLDING2-LABEL: vfwmul_v2f32_multiple_users:
+; NO_FOLDING2:       # %bb.0:
+; NO_FOLDING2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v11, v8
+; NO_FOLDING2-NEXT:    vfwcvt.f.f.v v8, v9
+; NO_FOLDING2-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING2-NEXT:    vfmul.vv v9, v11, v8
+; NO_FOLDING2-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; NO_FOLDING2-NEXT:    vfwadd.wv v11, v11, v10
+; NO_FOLDING2-NEXT:    vfwsub.wv v8, v8, v10
+; NO_FOLDING2-NEXT:    vse64.v v9, (a0)
+; NO_FOLDING2-NEXT:    vse64.v v11, (a1)
+; NO_FOLDING2-NEXT:    vse64.v v8, (a2)
+; NO_FOLDING2-NEXT:    ret
 ;
 ; FOLDING-LABEL: vfwmul_v2f32_multiple_users:
 ; FOLDING:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
index 227a428831b60..b093e9e35edad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2
 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
 ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING
 ; Check that the default value enables the web folding and
@@ -16,21 +16,38 @@
 ; We need the web size to be at least 3 for the folding to happen, because
 ; %c has 3 uses.
 define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) {
-; NO_FOLDING-LABEL: vwmul_v2i16_multiple_users:
-; NO_FOLDING:       # %bb.0:
-; NO_FOLDING-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; NO_FOLDING-NEXT:    vle8.v v8, (a0)
-; NO_FOLDING-NEXT:    vle8.v v9, (a1)
-; NO_FOLDING-NEXT:    vle8.v v10, (a2)
-; NO_FOLDING-NEXT:    vsext.vf2 v11, v8
-; NO_FOLDING-NEXT:    vsext.vf2 v8, v9
-; NO_FOLDING-NEXT:    vsext.vf2 v9, v10
-; NO_FOLDING-NEXT:    vmul.vv v8, v11, v8
-; NO_FOLDING-NEXT:    vadd.vv v10, v11, v9
-; NO_FOLDING-NEXT:    vsub.vv v9, v11, v9
-; NO_FOLDING-NEXT:    vor.vv v8, v8, v10
-; NO_FOLDING-NEXT:    vor.vv v8, v8, v9
-; NO_FOLDING-NEXT:    ret
+; NO_FOLDING1-LABEL: vwmul_v2i16_multiple_users:
+; NO_FOLDING1:       # %bb.0:
+; NO_FOLDING1-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING1-NEXT:    vle8.v v8, (a0)
+; NO_FOLDING1-NEXT:    vle8.v v9, (a1)
+; NO_FOLDING1-NEXT:    vle8.v v10, (a2)
+; NO_FOLDING1-NEXT:    vsext.vf2 v11, v8
+; NO_FOLDING1-NEXT:    vsext.vf2 v8, v9
+; NO_FOLDING1-NEXT:    vsext.vf2 v9, v10
+; NO_FOLDING1-NEXT:    vmul.vv v8, v11, v8
+; NO_FOLDING1-NEXT:    vadd.vv v10, v11, v9
+; NO_FOLDING1-NEXT:    vsub.vv v9, v11, v9
+; NO_FOLDING1-NEXT:    vor.vv v8, v8, v10
+; NO_FOLDING1-NEXT:    vor.vv v8, v8, v9
+; NO_FOLDING1-NEXT:    ret
+;
+; NO_FOLDING2-LABEL: vwmul_v2i16_multiple_users:
+; NO_FOLDING2:       # %bb.0:
+; NO_FOLDING2-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO_FOLDING2-NEXT:    vle8.v v8, (a0)
+; NO_FOLDING2-NEXT:    vle8.v v9, (a1)
+; NO_FOLDING2-NEXT:    vle8.v v10, (a2)
+; NO_FOLDING2-NEXT:    vsext.vf2 v11, v8
+; NO_FOLDING2-NEXT:    vsext.vf2 v8, v9
+; NO_FOLDING2-NEXT:    vmul.vv v8, v11, v8
+; NO_FOLDING2-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; NO_FOLDING2-NEXT:    vwadd.wv v9, v11, v10
+; NO_FOLDING2-NEXT:    vwsub.wv v11, v11, v10
+; NO_FOLDING2-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; NO_FOLDING2-NEXT:    vor.vv v8, v8, v9
+; NO_FOLDING2-NEXT:    vor.vv v8, v8, v11
+; NO_FOLDING2-NEXT:    ret
 ;
 ; FOLDING-LABEL: vwmul_v2i16_multiple_users:
 ; FOLDING:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index efa6c16fbf4eb..5b61de5a3b772 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3572,45 +3572,53 @@ define void @SpinningCube() {
 ; SSE2-LABEL: SpinningCube:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
-; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT:    xorps %xmm3, %xmm3
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSE2-NEXT:    addps %xmm3, %xmm1
-; SSE2-NEXT:    movaps %xmm1, (%rax)
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    addps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, (%rax)
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
+; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSE2-NEXT:    movq {{.*#+}} xmm3 = xmm3[0],zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
+; SSE2-NEXT:    addps %xmm0, %xmm3
+; SSE2-NEXT:    movaps %xmm3, (%rax)
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    addps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm0, (%rax)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: SpinningCube:
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
-; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSSE3-NEXT:    xorps %xmm3, %xmm3
-; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
-; SSSE3-NEXT:    addps %xmm3, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, (%rax)
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
-; SSSE3-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT:    addps %xmm0, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, (%rax)
+; SSSE3-NEXT:    xorps %xmm0, %xmm0
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
+; SSSE3-NEXT:    xorps %xmm2, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT:    movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT:    movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
+; SSSE3-NEXT:    movq {{.*#+}} xmm3 = xmm3[0],zero
+; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
+; SSSE3-NEXT:    addps %xmm0, %xmm3
+; SSSE3-NEXT:    movaps %xmm3, (%rax)
+; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; SSSE3-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    addps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm0, (%rax)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: SpinningCube:
 ; SSE41:       # %bb.0: # %entry
 ; SSE41-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,zero,mem[0]
 ; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
 ; SSE41-NEXT:    movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; SSE41-NEXT:    movaps %xmm1, %xmm3
@@ -3629,7 +3637,7 @@ define void @SpinningCube() {
 ; AVX-LABEL: SpinningCube:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0]
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
 ; AVX-NEXT:    vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]