diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 084b788d51828..1a548a536f088 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3201,11 +3201,15 @@ class LLVM_ABI TargetLoweringBase {
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p LI is the vector load instruction.
+  /// \p Load is the vector load instruction. Can be either a plain load
+  /// instruction or a vp.load intrinsic.
+  /// \p Mask is a per-segment (i.e. number of lanes equal to that of one
+  /// component being interwoven) mask. Can be nullptr, in which case the
+  /// load is unconditional.
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedLoad(LoadInst *LI,
+  virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef<ShuffleVectorInst *> Shuffles,
                                     ArrayRef<unsigned> Indices,
                                     unsigned Factor) const {
@@ -3223,17 +3227,6 @@ class LLVM_ABI TargetLoweringBase {
     return false;
   }
 
-  /// Lower an interleaved load to target specific intrinsics. Return
-  /// true on success.
-  ///
-  /// \p Load is a vp.load instruction.
-  /// \p Mask is a mask value
-  /// \p DeinterleaveRes is a list of deinterleaved results.
-  virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      ArrayRef<Value *> DeinterleaveRes) const {
-    return false;
-  }
-
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index d43cd46f8ad82..d2b2edf2ebc80 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -367,34 +367,23 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   bool BinOpShuffleChanged =
      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
+  Value *Mask = nullptr;
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
-    Value *LaneMask =
-        getMask(VPLoad->getMaskParam(), Factor, cast<FixedVectorType>(VecTy));
-    if (!LaneMask)
+    Mask = getMask(VPLoad->getMaskParam(), Factor, cast<FixedVectorType>(VecTy));
+    if (!Mask)
       return false;
-
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");
-
-    // Sometimes the number of Shuffles might be less than Factor, we have to
-    // fill the gaps with null. Also, lowerInterleavedVPLoad
-    // expects them to be sorted.
-    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
-    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
-      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
-    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
-      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-      return !Extracts.empty() || BinOpShuffleChanged;
   } else {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
-
-    // Try to create target specific intrinsics to replace the load and
-    // shuffles.
-    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices,
-                                   Factor))
-      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-      return !Extracts.empty() || BinOpShuffleChanged;
   }
 
+  // Try to create target specific intrinsics to replace the load and
+  // shuffles.
+  if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
+                                 Indices, Factor))
+    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+    return !Extracts.empty() || BinOpShuffleChanged;
+
   DeadInsts.insert_range(Shuffles);
   DeadInsts.insert(Load);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ff23f76fadccd..d04e6c45e2103 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17155,7 +17155,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
 ///      %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
 ///      %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
 bool AArch64TargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
@@ -17163,6 +17163,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
+  auto *LI = dyn_cast<LoadInst>(Load);
+  if (!LI)
+    return false;
+  assert(!Mask && "Unexpected mask on a load");
+
   const DataLayout &DL = LI->getDataLayout();
 
   VectorType *VTy = Shuffles[0]->getType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 7b1de3d3254f2..713793ec77da3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -211,7 +211,7 @@ class AArch64TargetLowering : public TargetLowering {
 
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 65d1c4e2d6515..8dd35f61d4ddc 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21585,7 +21585,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
 ///      %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
 ///      %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
 bool ARMTargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
@@ -21593,6 +21593,11 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
+  auto *LI = dyn_cast<LoadInst>(Load);
+  if (!LI)
+    return false;
+  assert(!Mask && "Unexpected mask on a load");
+
   auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
   Type *EltTy = VecTy->getElementType();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef55b22c9..9159f3d2c3ed0 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -681,7 +681,7 @@ class VectorType;
 
     unsigned getMaxSupportedInterleaveFactor() const override;
 
-    bool lowerInterleavedLoad(LoadInst *LI,
+    bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                               ArrayRef<ShuffleVectorInst *> Shuffles,
                               ArrayRef<unsigned> Indices,
                               unsigned Factor) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a5d735c407e5c..e0a8c07b4206e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -429,7 +429,7 @@ class RISCVTargetLowering : public TargetLowering {
 
   bool fallBackToDAGISel(const Instruction &Inst) const override;
 
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
@@ -444,9 +444,6 @@ class RISCVTargetLowering : public TargetLowering {
                                   Instruction *Store, Value *Mask,
                                   ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                              ArrayRef<Value *> DeinterleaveRes) const override;
-
   bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
                                ArrayRef<Value *> InterleaveOps) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 0d4f24172b574..38cc0ce00a352 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -115,21 +115,49 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
 ///      %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
 ///      %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
 bool RISCVTargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Indices.size() == Shuffles.size());
 
-  IRBuilder<> Builder(LI);
-
-  const DataLayout &DL = LI->getDataLayout();
+  IRBuilder<> Builder(Load);
+  const DataLayout &DL = Load->getDataLayout();
 
   auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
-  if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
-                                    LI->getPointerAddressSpace(), DL))
-    return false;
+  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+
+  Value *Ptr, *VL;
+  Align Alignment;
+  if (auto *LI = dyn_cast<LoadInst>(Load)) {
+    assert(LI->isSimple());
+    Ptr = LI->getPointerOperand();
+    Alignment = LI->getAlign();
+    assert(!Mask && "Unexpected mask on a load\n");
+    Mask = Builder.getAllOnesMask(VTy->getElementCount());
+    VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
+  } else {
+    auto *VPLoad = cast<VPIntrinsic>(Load);
+    assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
+           "Unexpected intrinsic");
+    Ptr = VPLoad->getMemoryPointerParam();
+    Alignment = VPLoad->getPointerAlignment().value_or(
+        DL.getABITypeAlign(VTy->getElementType()));
 
-  auto *PtrTy = LI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+    assert(Mask && "vp.load needs a mask!");
+
+    Value *WideEVL = VPLoad->getVectorLengthParam();
+    // Conservatively check if EVL is a multiple of factor, otherwise some
+    // (trailing) elements might be lost after the transformation.
+    if (!isMultipleOfN(WideEVL, DL, Factor))
+      return false;
+
+    auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+    VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+  }
+
+  Type *PtrTy = Ptr->getType();
+  unsigned AS = PtrTy->getPointerAddressSpace();
+  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+    return false;
 
   // If the segment load is going to be performed segment at a time anyways
   // and there's only one element used, use a strided load instead.  This
@@ -138,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
     unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
     Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
     Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
-    Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
-                                           VTy->getElementCount());
-
+    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+    // Note: Same VL as above, but i32 not xlen due to signature of
+    // vp.strided.load
+    VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+                                    VTy->getElementCount());
     CallInst *CI =
         Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
                                 {VTy, BasePtr->getType(), Stride->getType()},
                                 {BasePtr, Stride, Mask, VL});
-    CI->addParamAttr(
-        0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+    CI->addParamAttr(0,
+                     Attribute::getWithAlignment(CI->getContext(), Alignment));
     Shuffles[0]->replaceAllUsesWith(CI);
     return true;
   };
 
-  Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
-  Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
   CallInst *VlsegN = Builder.CreateIntrinsic(
-      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
-      {LI->getPointerOperand(), Mask, VL});
+      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
 
   for (unsigned i = 0; i < Shuffles.size(); i++) {
     Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
@@ -426,122 +451,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   return true;
 }
 
-/// Lower an interleaved vp.load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.load (Factor = 2):
-///   %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
-///                                                         <vscale x 64 x i1> %mask,
-///                                                         i32 %wide.rvl)
-///   %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.vector.deinterleave2.nxv64i8(
-///               <vscale x 64 x i8> %l)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
-///
-/// Into:
-///   %rvl = udiv %wide.rvl, 2
-///   %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
-///                                                 <vscale x 32 x i8> undef,
-///                                                 ptr %ptr,
-///                                                 <vscale x 32 x i1> %mask,
-///                                                 i64 %rvl,
-///                                                 i64 1)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
-///
-/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
-/// removed by the caller
-/// TODO: We probably can loosen the dependency on matching extractvalue when
-/// dealing with factor of 2 (extractvalue is still required for most of other
-/// factors though).
-bool RISCVTargetLowering::lowerInterleavedVPLoad(
-    VPIntrinsic *Load, Value *Mask,
-    ArrayRef<Value *> DeinterleaveResults) const {
-  const unsigned Factor = DeinterleaveResults.size();
-  assert(Mask && "Expect a valid mask");
-  assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
-         "Unexpected intrinsic");
-
-  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
-                                      [](Value *V) { return V != nullptr; });
-  VectorType *VTy = cast<VectorType>(FirstActive->getType());
-
-  auto &DL = Load->getModule()->getDataLayout();
-  Align Alignment = Load->getParamAlign(0).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Load);
-
-  Value *WideEVL = Load->getVectorLengthParam();
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Load->getArgOperand(0)->getType();
-  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  Value *EVL =
-      Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
-  Value *Return = nullptr;
-  if (isa<FixedVectorType>(VTy)) {
-    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
-                                     {VTy, PtrTy, XLenTy},
-                                     {Load->getArgOperand(0), Mask, EVL});
-  } else {
-    unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-    unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-    Type *VecTupTy = TargetExtType::get(
-        Load->getContext(), "riscv.vector.tuple",
-        ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
-                                NumElts * SEW / 8),
-        Factor);
-
-    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
-        {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-    Value *Operands[] = {
-        PoisonValue::get(VecTupTy),
-        Load->getArgOperand(0),
-        Mask,
-        EVL,
-        ConstantInt::get(XLenTy,
-                         RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
-        ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-    CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
-
-    SmallVector<Type *, 2> AggrTypes{Factor, VTy};
-    Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
-    Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
-    for (unsigned i = 0; i < Factor; ++i) {
-      Value *VecExtract =
-          Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
-      Return = Builder.CreateInsertValue(Return, VecExtract, i);
-    }
-  }
-
-  for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
-    if (!DIO)
-      continue;
-    // We have to create a brand new ExtractValue to replace each
-    // of these old ExtractValue instructions.
-    Value *NewEV =
-        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-    DIO->replaceAllUsesWith(NewEV);
-  }
-
-  return true;
-}
-
 /// Lower an interleaved vp.store into a vssegN intrinsic.
 ///
 /// E.g. Lower an interleaved vp.store (Factor = 2):
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6bcb7a36e91b5..26369792db26d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,7 +1661,7 @@ namespace llvm {
 
     /// Lower interleaved load(s) into target specific
     /// instructions/intrinsics.
-    bool lowerInterleavedLoad(LoadInst *LI,
+    bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                               ArrayRef<ShuffleVectorInst *> Shuffles,
                               ArrayRef<unsigned> Indices,
                               unsigned Factor) const override;
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 1eb47e3b2cd18..360293bce54e8 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
 // number of shuffles and ISA.
 // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
 bool X86TargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
@@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad(
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
+  auto *LI = dyn_cast<LoadInst>(Load);
+  if (!LI)
+    return false;
+  assert(!Mask && "Unexpected mask on a load");
+
   // Create an interleaved access group.
   IRBuilder<> Builder(LI);
   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 019bbe2908a2c..dbc8e891ab5f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1721,8 +1721,9 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
 define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) {
 ; CHECK-LABEL: vp_load_factor3_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg3e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12)
   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1732,8 +1733,9 @@ define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) {
 define <4 x i32> @vp_load_factor5_one_active(ptr %ptr) {
 ; CHECK-LABEL: vp_load_factor5_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 20
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20)
   %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
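
Illustrative addendum, not part of the patch: a hand-written sketch, modeled on the vp.load tests above, of the masked factor-2 shape that now flows through the common lowerInterleavedLoad hook. The function name and values below are hypothetical; the expectation is that getMask() in InterleavedAccessPass recognizes the wide mask built by llvm.vector.interleave2 and hands the narrow %m to the target hook as the new Mask operand.

define <4 x i32> @vp_load_factor2_masked_sketch(ptr %ptr, <4 x i1> %m) {
  ; Wide mask: both lanes of each segment are governed by the same bit of %m,
  ; which is the per-segment shape the pass can strip back down to <4 x i1>.
  %wide.mask = call <8 x i1> @llvm.vector.interleave2.v8i1(<4 x i1> %m, <4 x i1> %m)
  %wide.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> %wide.mask, i32 8)
  ; De-interleave into the two components; both stay live, so Factor = 2 with
  ; two shuffles reaches the hook rather than the one-active strided path.
  %v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum = add <4 x i32> %v0, %v1
  ret <4 x i32> %sum
}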