diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 6345b18b809a6..b7a51fc006654 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -107,10 +107,8 @@ class VectorCombine {
                               const Instruction &I,
                               ExtractElementInst *&ConvertToShuffle,
                               unsigned PreferredExtractIndex);
-  void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
-                     Instruction &I);
-  void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
-                       Instruction &I);
+  Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
+  Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
   bool foldExtractExtract(Instruction &I);
   bool foldInsExtFNeg(Instruction &I);
   bool foldInsExtBinop(Instruction &I);
@@ -138,7 +136,7 @@ class VectorCombine {
   bool foldInterleaveIntrinsics(Instruction &I);
   bool shrinkType(Instruction &I);
 
-  void replaceValue(Value &Old, Value &New) {
+  void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
     LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
     LLVM_DEBUG(dbgs() << "         With: " << New << '\n');
     Old.replaceAllUsesWith(&New);
@@ -147,7 +145,11 @@ class VectorCombine {
       Worklist.pushUsersToWorkList(*NewI);
       Worklist.pushValue(NewI);
     }
-    Worklist.pushValue(&Old);
+    if (Erase && isInstructionTriviallyDead(&Old)) {
+      eraseInstruction(Old);
+    } else {
+      Worklist.push(&Old);
+    }
   }
 
   void eraseInstruction(Instruction &I) {
@@ -158,11 +160,23 @@ class VectorCombine {
 
     // Push remaining users of the operands and then the operand itself - allows
     // further folds that were hindered by OneUse limits.
-    for (Value *Op : Ops)
-      if (auto *OpI = dyn_cast<Instruction>(Op)) {
-        Worklist.pushUsersToWorkList(*OpI);
-        Worklist.pushValue(OpI);
+    SmallPtrSet<Value *, 4> Visited;
+    for (Value *Op : Ops) {
+      if (Visited.insert(Op).second) {
+        if (auto *OpI = dyn_cast<Instruction>(Op)) {
+          if (RecursivelyDeleteTriviallyDeadInstructions(
+                  OpI, nullptr, nullptr, [this](Value *V) {
+                    if (auto I = dyn_cast<Instruction>(V)) {
+                      LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
+                      Worklist.remove(I);
+                    }
+                  }))
+            continue;
+          Worklist.pushUsersToWorkList(*OpI);
+          Worklist.pushValue(OpI);
+        }
       }
+    }
   }
 };
 } // namespace
@@ -546,9 +560,8 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
 /// the source vector (shift the scalar element) to a NewIndex for extraction.
 /// Return null if the input can be constant folded, so that we are not creating
 /// unnecessary instructions.
-static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
-                                            unsigned NewIndex,
-                                            IRBuilderBase &Builder) {
+static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
+                               IRBuilderBase &Builder) {
   // Shufflevectors can only be created for fixed-width vectors.
   Value *X = ExtElt->getVectorOperand();
   if (!isa<FixedVectorType>(X->getType()))
@@ -563,52 +576,41 @@ static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
   Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
                                    NewIndex, Builder);
-  return dyn_cast<ExtractElementInst>(
-      Builder.CreateExtractElement(Shuf, NewIndex));
+  return Shuf;
 }
 
 /// Try to reduce extract element costs by converting scalar compares to vector
 /// compares followed by extract.
 /// cmp (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
-                                  ExtractElementInst *Ext1, Instruction &I) {
+Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
+                                    Instruction &I) {
   assert(isa<CmpInst>(&I) && "Expected a compare");
-  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
-             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
-         "Expected matching constant extract indexes");
 
   // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
   ++NumVecCmp;
   CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
-  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
   Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
-  Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
-  replaceValue(I, *NewExt);
+  return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
 }
 
 /// Try to reduce extract element costs by converting scalar binops to vector
 /// binops followed by extract.
 /// bo (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
-                                    ExtractElementInst *Ext1, Instruction &I) {
+Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
+                                      Instruction &I) {
   assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
-  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
-             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
-         "Expected matching constant extract indexes");
 
   // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
   ++NumVecBO;
-  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
-  Value *VecBO =
-      Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
+  Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
+                                     V1, "foldExtExtBinop");
 
   // All IR flags are safe to back-propagate because any potential poison
   // created in unused vector elements is discarded by the extract.
   if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
     VecBOInst->copyIRFlags(&I);
 
-  Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
-  replaceValue(I, *NewExt);
+  return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
 }
 
 /// Match an instruction with extracted vector operands.
@@ -647,25 +649,29 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
   if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
     return false;
 
+  Value *ExtOp0 = Ext0->getVectorOperand();
+  Value *ExtOp1 = Ext1->getVectorOperand();
+
   if (ExtractToChange) {
     unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
-    ExtractElementInst *NewExtract =
+    Value *NewExtOp =
         translateExtract(ExtractToChange, CheapExtractIdx, Builder);
-    if (!NewExtract)
+    if (!NewExtOp)
       return false;
     if (ExtractToChange == Ext0)
-      Ext0 = NewExtract;
+      ExtOp0 = NewExtOp;
     else
-      Ext1 = NewExtract;
+      ExtOp1 = NewExtOp;
   }
 
-  if (Pred != CmpInst::BAD_ICMP_PREDICATE)
-    foldExtExtCmp(Ext0, Ext1, I);
-  else
-    foldExtExtBinop(Ext0, Ext1, I);
-
+  Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
+                                            : Ext0->getIndexOperand();
+  Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
+                      ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
+                      : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
   Worklist.push(Ext0);
   Worklist.push(Ext1);
+  replaceValue(I, *NewExt);
   return true;
 }
 
@@ -1824,7 +1830,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
         LI->getAlign(), VecTy->getElementType(), Idx, *DL);
     NewLoad->setAlignment(ScalarOpAlignment);
 
-    replaceValue(*EI, *NewLoad);
+    replaceValue(*EI, *NewLoad, false);
   }
 
   FailureGuard.release();
@@ -3112,7 +3118,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
           Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
       LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
       replaceValue(*Shuffle, *NewShuffle);
-      MadeChanges = true;
+      return true;
     }
 
   // See if we can re-use foldSelectShuffle, getting it to reduce the size of
@@ -3608,7 +3614,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
     Builder.SetInsertPoint(Shuffles[S]);
     Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
-    replaceValue(*Shuffles[S], *NSV);
+    replaceValue(*Shuffles[S], *NSV, false);
   }
 
   Worklist.pushValue(NSV0A);
@@ -3873,8 +3879,7 @@ bool VectorCombine::run() {
 
   LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
 
-  bool MadeChange = false;
-  auto FoldInst = [this, &MadeChange](Instruction &I) {
+  auto FoldInst = [this](Instruction &I) {
     Builder.SetInsertPoint(&I);
     bool IsVectorType = isa<VectorType>(I.getType());
     bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
@@ -3889,10 +3894,12 @@ bool VectorCombine::run() {
     if (IsFixedVectorType) {
       switch (Opcode) {
       case Instruction::InsertElement:
-        MadeChange |= vectorizeLoadInsert(I);
+        if (vectorizeLoadInsert(I))
+          return true;
         break;
       case Instruction::ShuffleVector:
-        MadeChange |= widenSubvectorLoad(I);
+        if (widenSubvectorLoad(I))
+          return true;
         break;
       default:
         break;
@@ -3902,19 +3909,25 @@ bool VectorCombine::run() {
     // This transform works with scalable and fixed vectors
     // TODO: Identify and allow other scalable transforms
     if (IsVectorType) {
-      MadeChange |= scalarizeOpOrCmp(I);
-      MadeChange |= scalarizeLoadExtract(I);
-      MadeChange |= scalarizeExtExtract(I);
-      MadeChange |= scalarizeVPIntrinsic(I);
-      MadeChange |= foldInterleaveIntrinsics(I);
+      if (scalarizeOpOrCmp(I))
+        return true;
+      if (scalarizeLoadExtract(I))
+        return true;
+      if (scalarizeExtExtract(I))
+        return true;
+      if (scalarizeVPIntrinsic(I))
+        return true;
+      if (foldInterleaveIntrinsics(I))
+        return true;
     }
 
     if (Opcode == Instruction::Store)
-      MadeChange |= foldSingleElementStore(I);
+      if (foldSingleElementStore(I))
+        return true;
 
     // If this is an early pipeline invocation of this pass, we are done.
     if (TryEarlyFoldsOnly)
-      return;
+      return false;
 
     // Otherwise, try folds that improve codegen but may interfere with
    // early IR canonicalizations.
@@ -3923,56 +3936,79 @@ bool VectorCombine::run() {
     if (IsFixedVectorType) {
       switch (Opcode) {
       case Instruction::InsertElement:
-        MadeChange |= foldInsExtFNeg(I);
-        MadeChange |= foldInsExtBinop(I);
-        MadeChange |= foldInsExtVectorToShuffle(I);
+        if (foldInsExtFNeg(I))
+          return true;
+        if (foldInsExtBinop(I))
+          return true;
+        if (foldInsExtVectorToShuffle(I))
+          return true;
         break;
       case Instruction::ShuffleVector:
-        MadeChange |= foldPermuteOfBinops(I);
-        MadeChange |= foldShuffleOfBinops(I);
-        MadeChange |= foldShuffleOfSelects(I);
-        MadeChange |= foldShuffleOfCastops(I);
-        MadeChange |= foldShuffleOfShuffles(I);
-        MadeChange |= foldShuffleOfIntrinsics(I);
-        MadeChange |= foldSelectShuffle(I);
-        MadeChange |= foldShuffleToIdentity(I);
+        if (foldPermuteOfBinops(I))
+          return true;
+        if (foldShuffleOfBinops(I))
+          return true;
+        if (foldShuffleOfSelects(I))
+          return true;
+        if (foldShuffleOfCastops(I))
+          return true;
+        if (foldShuffleOfShuffles(I))
+          return true;
+        if (foldShuffleOfIntrinsics(I))
+          return true;
+        if (foldSelectShuffle(I))
+          return true;
+        if (foldShuffleToIdentity(I))
+          return true;
         break;
       case Instruction::BitCast:
-        MadeChange |= foldBitcastShuffle(I);
+        if (foldBitcastShuffle(I))
+          return true;
         break;
       case Instruction::And:
       case Instruction::Or:
      case Instruction::Xor:
-        MadeChange |= foldBitOpOfCastops(I);
+        if (foldBitOpOfCastops(I))
+          return true;
         break;
       default:
-        MadeChange |= shrinkType(I);
+        if (shrinkType(I))
+          return true;
         break;
       }
     } else {
       switch (Opcode) {
       case Instruction::Call:
-        MadeChange |= foldShuffleFromReductions(I);
-        MadeChange |= foldCastFromReductions(I);
+        if (foldShuffleFromReductions(I))
+          return true;
+        if (foldCastFromReductions(I))
+          return true;
         break;
       case Instruction::ICmp:
       case Instruction::FCmp:
-        MadeChange |= foldExtractExtract(I);
+        if (foldExtractExtract(I))
+          return true;
         break;
       case Instruction::Or:
-        MadeChange |= foldConcatOfBoolMasks(I);
+        if (foldConcatOfBoolMasks(I))
+          return true;
         [[fallthrough]];
       default:
         if (Instruction::isBinaryOp(Opcode)) {
-          MadeChange |= foldExtractExtract(I);
-          MadeChange |= foldExtractedCmps(I);
-          MadeChange |= foldBinopOfReductions(I);
+          if (foldExtractExtract(I))
+            return true;
+          if (foldExtractedCmps(I))
+            return true;
+          if (foldBinopOfReductions(I))
+            return true;
         }
         break;
       }
     }
+    return false;
   };
 
+  bool MadeChange = false;
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))
@@ -3981,7 +4017,7 @@ bool VectorCombine::run() {
     for (Instruction &I : make_early_inc_range(BB)) {
       if (I.isDebugOrPseudoInst())
         continue;
-      FoldInst(I);
+      MadeChange |= FoldInst(I);
     }
   }
@@ -3995,7 +4031,7 @@ bool VectorCombine::run() {
       continue;
     }
 
-    FoldInst(*I);
+    MadeChange |= FoldInst(*I);
   }
 
   return MadeChange;
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index bb6f3e719bb14..8968484ff0db3 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -939,53 +939,17 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[C]], align 4
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC12:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC13:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC14:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC15:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC16:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC18:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC20:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC21:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC22:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC23:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC24:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC25:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC26:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[STRIDED_VEC27:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[STRIDED_VEC20]], <2 x float> [[STRIDED_VEC21]], <4 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[STRIDED_VEC]], <2 x float> [[STRIDED_VEC12]], <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <4 x float> [[TMP8]], [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <4 x float> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[STRIDED_VEC22]], <2 x float> [[STRIDED_VEC23]], <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[STRIDED_VEC13]], <2 x float> [[STRIDED_VEC14]], <4 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <4 x float> [[TMP12]], [[TMP2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[TMP11]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[STRIDED_VEC24]], <2 x float> [[STRIDED_VEC25]], <4 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[STRIDED_VEC15]], <2 x float> [[STRIDED_VEC16]], <4 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul fast <4 x float> [[TMP16]], [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[STRIDED_VEC26]], <2 x float> [[STRIDED_VEC27]], <4 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[STRIDED_VEC17]], <2 x float> [[STRIDED_VEC18]], <4 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <4 x float> [[TMP20]], [[TMP4]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd fast <4 x float> [[TMP19]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP14]], <8 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP22]], <8 x i32>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> [[TMP24]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]]
 ; CHECK-NEXT:    store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 798df4cd4ff54..63f8250b5f3de 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -404,13 +404,13 @@ define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
 ; SSE4-LABEL: @add_v16i16_FEuCBA98765432u0(
 ; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
 ; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32>
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32>
 ; SSE4-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
 ; SSE4-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32>
 ; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; AVX2-LABEL: @add_v16i16_FEuCBA98765432u0(
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index fd160b7c57024..bbfe844400b0c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -398,13 +398,13 @@ define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
 ; SSE4-LABEL: @sub_v16i16_FEuCBA98765432u0(
 ; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32>
 ; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32>
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32>
 ; SSE4-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32>
 ; SSE4-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32>
 ; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; AVX2-LABEL: @sub_v16i16_FEuCBA98765432u0(
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index d369279c15db4..41d77e89476ba 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -268,7 +268,7 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext5_ext0_add(
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 5
@@ -294,7 +294,7 @@ define float @ext1_ext0_fmul(<4 x float> %x) {
 ; CHECK-LABEL: @ext1_ext0_fmul(
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[SHIFT]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %e0 = extractelement <4 x float> %x, i32 1
@@ -363,7 +363,7 @@ define float @ext7_ext4_fmul_v8f32(<8 x float> %x) {
 ; AVX-LABEL: @ext7_ext4_fmul_v8f32(
 ; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> poison, <8 x i32>
 ; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <8 x float> [[TMP1]], i64 4
+; AVX-NEXT:    [[R:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
 ; AVX-NEXT:    ret float [[R]]
 ;
   %e0 = extractelement <8 x float> %x, i32 7
@@ -484,7 +484,7 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP1]], [[SHIFT1]]
 ; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[SHIFT2]], [[TMP2]]
-; CHECK-NEXT:    [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT:    [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
 ; CHECK-NEXT:    ret i32 [[Z0123]]
 ;
   %z = and <4 x i32> %x, %y
@@ -504,7 +504,7 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]]
 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]]
-; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret i32 [[X210]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
@@ -523,7 +523,7 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]]
 ; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHIFT2]], [[TMP2]]
-; CHECK-NEXT:    [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT:    [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
 ; CHECK-NEXT:    ret i32 [[X2Y210]]
 ;
   %y0 = extractelement <4 x i32> %y, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index d11fb1426c94e..4c1ca82b2bd06 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -268,7 +268,7 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext5_ext0_add(
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 5
@@ -294,7 +294,7 @@ define float @ext1_ext0_fmul(<4 x float> %x) {
 ; CHECK-LABEL: @ext1_ext0_fmul(
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[SHIFT]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %e0 = extractelement <4 x float> %x, i32 1
@@ -363,7 +363,7 @@ define float @ext7_ext4_fmul_v8f32(<8 x float> %x) {
 ; AVX-LABEL: @ext7_ext4_fmul_v8f32(
 ; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> poison, <8 x i32>
 ; AVX-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <8 x float> [[TMP1]], i64 4
+; AVX-NEXT:    [[R:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
 ; AVX-NEXT:    ret float [[R]]
 ;
   %e0 = extractelement <8 x float> %x, i32 7
@@ -490,7 +490,7 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP1]], [[SHIFT1]]
 ; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[SHIFT2]], [[TMP2]]
-; CHECK-NEXT:    [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT:    [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
 ; CHECK-NEXT:    ret i32 [[Z0123]]
 ;
   %z = and <4 x i32> %x, %y
@@ -510,7 +510,7 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]]
 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]]
-; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret i32 [[X210]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
@@ -529,7 +529,7 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]]
 ; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHIFT2]], [[TMP2]]
-; CHECK-NEXT:    [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT:    [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
 ; CHECK-NEXT:    ret i32 [[X2Y210]]
 ;
   %y0 = extractelement <4 x i32> %y, i32 0
@@ -573,10 +573,8 @@ define i64 @instsimplify_folder_crash(<4 x i64> %in) {
 ; CHECK-LABEL: @instsimplify_folder_crash(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SHUFFLE_1:%.*]] = shufflevector <4 x i64> [[IN:%.*]], <4 x i64> zeroinitializer, <4 x i32>
-; CHECK-NEXT:    [[E_0:%.*]] = extractelement <4 x i64> zeroinitializer, i64 0
-; CHECK-NEXT:    [[E_1:%.*]] = extractelement <4 x i64> [[SHUFFLE_1]], i64 1
-; CHECK-NEXT:    [[OR:%.*]] = or i64 [[E_1]], [[E_0]]
-; CHECK-NEXT:    ret i64 [[OR]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i64> [[SHUFFLE_1]], <4 x i64> poison, <4 x i32>
+; CHECK-NEXT:    ret i64 0
 ;
 entry:
   %shuffle.1 = shufflevector <4 x i64> %in, <4 x i64> zeroinitializer, <4 x i32>
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
index 3dae93665b1ed..795832f22b096 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
@@ -130,7 +130,7 @@ define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
 ; AVX-LABEL: @cmp10_v2f64(
 ; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> poison, <2 x i32>
 ; AVX-NEXT:    [[TMP1:%.*]] = fcmp ule <2 x double> [[SHIFT]], [[Y:%.*]]
-; AVX-NEXT:    [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i64 0
+; AVX-NEXT:    [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; AVX-NEXT:    ret i1 [[CMP]]
 ;
   %x1 = extractelement <2 x double> %x, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
index b26e5ec2698a5..50e32b79a91c2 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll
@@ -27,6 +27,8 @@ define void @multiple_extract(ptr %p) {
 ; infinite loop if we fold an extract that is waiting to be erased
 define void @unused_extract(ptr %p) {
 ; CHECK-LABEL: @unused_extract(
+; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[LOAD]], i64 1
 ; CHECK-NEXT:    ret void
 ;
   %load = load <4 x float>, ptr %p, align 8
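
For illustration only (not part of the patch): the fold that foldExtExtCmp/foldExtExtBinop now build a value for rewrites scalar math on two extracted lanes into one vector op plus a single extract, per the in-code comment `bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C`. A minimal LLVM IR sketch with hypothetical function names, before and after the transform:

; Before VectorCombine: two scalar extracts of the same lane feed a scalar add.
define i32 @ext_ext_add_sketch(<4 x i32> %v0, <4 x i32> %v1) {
  %e0 = extractelement <4 x i32> %v0, i32 2
  %e1 = extractelement <4 x i32> %v1, i32 2
  %r = add i32 %e0, %e1
  ret i32 %r
}

; After the fold: one vector add, then a single extract of lane 2. The dead
; scalar extracts are now erased eagerly by replaceValue rather than left on
; the worklist.
define i32 @ext_ext_add_sketch_folded(<4 x i32> %v0, <4 x i32> %v1) {
  %vecbo = add <4 x i32> %v0, %v1
  %r = extractelement <4 x i32> %vecbo, i32 2
  ret i32 %r
}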