diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 1a548a536f088..cbdc1b6031680 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3219,25 +3219,19 @@ class LLVM_ABI TargetLoweringBase {
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p SI is the vector store instruction.
+  /// \p Store is the vector store instruction. Can be either a plain store
+  /// or a vp.store intrinsic.
+  /// \p Mask is a per-segment mask (i.e. its number of lanes equals that of
+  /// one interleaved component). Can be nullptr, in which case the store is
+  /// unconditional.
   /// \p SVI is the shufflevector to RE-interleave the stored vector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  virtual bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                                     ShuffleVectorInst *SVI,
                                      unsigned Factor) const {
     return false;
   }
 
-  /// Lower an interleaved store to target specific intrinsics. Return
-  /// true on success.
-  ///
-  /// \p Store is the vp.store instruction.
-  /// \p Mask is a mask value
-  /// \p InterleaveOps is a list of values being interleaved.
-  virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
-                                       ArrayRef<Value *> InterleaveOps) const {
-    return false;
-  }
-
   /// Lower a deinterleave intrinsic to a target specific load intrinsic.
   /// Return true on success. Currently only supports
   /// llvm.vector.deinterleave{2,3,5,7}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index df162fca18d92..7afeb51b17c2f 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -518,46 +518,26 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
 
+  Value *Mask = nullptr;
   if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
-                              ElementCount::getFixed(LaneMaskLen));
-    if (!LaneMask)
+    Mask = getMask(VPStore->getMaskParam(), Factor,
+                   ElementCount::getFixed(LaneMaskLen));
+    if (!Mask)
       return false;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
                       << "\n");
 
-    IRBuilder<> Builder(VPStore);
-    // We need to effectively de-interleave the shufflemask
-    // because lowerInterleavedVPStore expects individual de-interleaved
-    // values.
-    SmallVector<Value *> NewShuffles;
-    SmallVector<int> NewShuffleMask(LaneMaskLen);
-    auto ShuffleMask = SVI->getShuffleMask();
-
-    for (unsigned i = 0; i < Factor; i++) {
-      for (unsigned j = 0; j < LaneMaskLen; j++)
-        NewShuffleMask[j] = ShuffleMask[i + Factor * j];
-
-      NewShuffles.push_back(Builder.CreateShuffleVector(
-          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
-    }
-
-    // Try to create target specific intrinsics to replace the vp.store and
-    // shuffle.
-    if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
-      // We already created new shuffles.
-      return true;
   } else {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
-
-    // Try to create target specific intrinsics to replace the store and
-    // shuffle.
-    if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
-      return false;
   }
 
+  // Try to create target specific intrinsics to replace the store and
+  // shuffle.
+  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor))
+    return false;
+
   // Already have a new target specific interleaved store. Erase the old store.
   DeadInsts.insert(Store);
   DeadInsts.insert(SVI);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f026726c3f484..02ee517a0a9b8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17343,12 +17343,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                  Value *LaneMask,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 713793ec77da3..d8403c2971696 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -215,7 +215,8 @@ class AArch64TargetLowering : public TargetLowering {
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fd3b0525c1056..8b7f06a5b5014 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21731,11 +21731,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
 
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9159f3d2c3ed0..825145d813fb1 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -685,7 +685,8 @@ class VectorType;
                               ArrayRef<ShuffleVectorInst *> Shuffles,
                               ArrayRef<unsigned> Indices,
                               unsigned Factor) const override;
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     bool shouldInsertFencesForAtomic(const Instruction *I) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index e0a8c07b4206e..f0447e02191ae 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -434,7 +434,8 @@ class RISCVTargetLowering : public TargetLowering {
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
 
-  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
@@ -444,9 +445,6 @@ class RISCVTargetLowering : public TargetLowering {
       Instruction *Store, Value *Mask,
      ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
-                               ArrayRef<Value *> InterleaveOps) const override;
-
   bool supportKCFIBundles() const override { return true; }
 
   SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 878401ef4063f..272b347a4db28 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -266,22 +266,28 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// vsseg3 instruction in CodeGen.
-bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool RISCVTargetLowering::lowerInterleavedStore(Instruction *Store,
+                                                Value *LaneMask,
                                                 ShuffleVectorInst *SVI,
                                                 unsigned Factor) const {
-  IRBuilder<> Builder(SI);
-  const DataLayout &DL = SI->getDataLayout();
+  IRBuilder<> Builder(Store);
+  const DataLayout &DL = Store->getDataLayout();
   auto Mask = SVI->getShuffleMask();
   auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
   // Given SVI : <n*factor x ty>, then VTy : <n x ty>
   auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
                                    ShuffleVTy->getNumElements() / Factor);
-  if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
-                                    SI->getPointerAddressSpace(), DL))
+  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+
+  Value *Ptr, *VL;
+  Align Alignment;
+  if (!getMemOperands(Factor, VTy, XLenTy, Store, Ptr, LaneMask, VL, Alignment))
     return false;
 
-  auto *PtrTy = SI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+  Type *PtrTy = Ptr->getType();
+  unsigned AS = PtrTy->getPointerAddressSpace();
+  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+    return false;
 
   unsigned Index;
   // If the segment store only has one active lane (i.e. the interleave is
@@ -292,27 +298,27 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
     unsigned ScalarSizeInBytes =
         DL.getTypeStoreSize(ShuffleVTy->getElementType());
     Value *Data = SVI->getOperand(0);
-    auto *DataVTy = cast<FixedVectorType>(Data->getType());
+    Data = Builder.CreateExtractVector(VTy, Data, uint64_t(0));
     Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
     Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
-    Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
-                                           VTy->getElementCount());
-
-    CallInst *CI = Builder.CreateIntrinsic(
-        Intrinsic::experimental_vp_strided_store,
-        {Data->getType(), BasePtr->getType(), Stride->getType()},
-        {Data, BasePtr, Stride, Mask, VL});
-    Align Alignment = commonAlignment(SI->getAlign(), Index * ScalarSizeInBytes);
-    CI->addParamAttr(
-        1, Attribute::getWithAlignment(CI->getContext(), Alignment));
+    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+    // Note: same VL as above, but as an i32 rather than XLen, to match the
+    // signature of vp.strided.store.
+    VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+                                    VTy->getElementCount());
+    CallInst *CI =
+        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+                                {VTy, BasePtr->getType(), Stride->getType()},
+                                {Data, BasePtr, Stride, LaneMask, VL});
+    Alignment = commonAlignment(Alignment, Index * ScalarSizeInBytes);
+    CI->addParamAttr(1,
+                     Attribute::getWithAlignment(CI->getContext(), Alignment));
 
     return true;
   }
 
   Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+      Store->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
 
   SmallVector<Value *> Ops;
   SmallVector<int> NewShuffleMask;
@@ -328,13 +334,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
     NewShuffleMask.clear();
   }
 
-  // This VL should be OK (should be executable in one vsseg instruction,
-  // potentially under larger LMULs) because we checked that the fixed vector
-  // type fits in isLegalInterleavedAccessType
-  Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
-  Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
-  Ops.append({SI->getPointerOperand(), StoreMask, VL});
-
+  Ops.append({Ptr, LaneMask, VL});
   Builder.CreateCall(VssegNFunc, Ops);
 
   return true;
@@ -457,91 +457,3 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   Builder.CreateCall(VssegNFunc, Operands);
   return true;
 }
-
-/// Lower an interleaved vp.store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.store (Factor = 2):
-///
-///   %is = tail call <vscale x 64 x i8>
-///             @llvm.vector.interleave2.nxv64i8(
-///                               <vscale x 32 x i8> %load0,
-///                               <vscale x 32 x i8> %load1)
-///   %wide.rvl = shl nuw nsw i32 %rvl, 1
-///   tail call void @llvm.vp.store.nxv64i8.p0(
-///                               <vscale x 64 x i8> %is, ptr %ptr,
-///                               %mask,
-///                               i32 %wide.rvl)
-///
-/// Into:
-///   call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
-///                               <vscale x 32 x i8> %load1,
-///                               <vscale x 32 x i8> %load2, ptr %ptr,
-///                               %mask,
-///                               i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedVPStore(
-    VPIntrinsic *Store, Value *Mask,
-    ArrayRef<Value *> InterleaveOperands) const {
-  assert(Mask && "Expect a valid mask");
-  assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
-         "Unexpected intrinsic");
-
-  const unsigned Factor = InterleaveOperands.size();
-
-  auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
-  if (!VTy)
-    return false;
-
-  const DataLayout &DL = Store->getDataLayout();
-  Align Alignment = Store->getParamAlign(1).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Store);
-  Value *WideEVL = Store->getArgOperand(3);
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Store->getArgOperand(1)->getType();
-  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  Value *EVL =
-      Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
-  if (isa<FixedVectorType>(VTy)) {
-    SmallVector<Value *> Operands(InterleaveOperands);
-    Operands.append({Store->getArgOperand(1), Mask, EVL});
-    Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
-                            {VTy, PtrTy, XLenTy}, Operands);
-    return true;
-  }
-
-  unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-  unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-  Type *VecTupTy = TargetExtType::get(
-      Store->getContext(), "riscv.vector.tuple",
-      ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
-                              NumElts * SEW / 8),
-      Factor);
-
-  Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
-  Value *StoredVal = PoisonValue::get(VecTupTy);
-  for (unsigned i = 0; i < Factor; ++i)
-    StoredVal = Builder.CreateCall(
-        VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
-
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), ScalableVssegIntrIds[Factor - 2],
-      {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-  Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
-                       ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-  Builder.CreateCall(VssegNFunc, Operands);
-  return true;
-}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 26369792db26d..547b2210fdbf0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1668,7 +1668,8 @@ namespace llvm {
 
     /// Lower interleaved store(s) into target specific
     /// instructions/intrinsics.
-    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                               ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 360293bce54e8..636b072837441 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -822,7 +822,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 }
 
-bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
+                                              Value *LaneMask,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
@@ -832,6 +833,11 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
              0 &&
          "Invalid interleaved store");
 
+  auto *SI = dyn_cast<StoreInst>(Store);
+  if (!SI)
+    return false;
+  assert(!LaneMask && "Unexpected mask on store");
+
   // Holds the indices of SVI that correspond to the starting index of each
   // interleaved shuffle.
   auto Mask = SVI->getShuffleMask();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index bdf344d4d16ae..70523f16e0f00 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1757,8 +1757,9 @@ define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
 define void @vpstore_factor4_one_active(ptr %ptr, <4 x i32> %v) {
 ; CHECK-LABEL: vpstore_factor4_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison>
   tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %v0, ptr %ptr, <16 x i1> splat (i1 true), i32 16)
@@ -1782,7 +1783,7 @@ define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
 ; CHECK-LABEL: store_factor4_one_active_fullwidth:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vsse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32>
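
Note on the overall shape of the change: the InterleavedAccess pass now routes both plain stores and vp.stores of an interleaving shufflevector through the single lowerInterleavedStore hook, passing along a per-segment mask (nullptr in the unmasked case). A minimal sketch of the vp.store pattern this covers, assuming a factor-2 interleave of two <8 x i32> values; the value names and %m are made up for illustration and are not taken from the patch's tests:

  ; getMask() must prove that %m is a factor-2 repetition of an <8 x i1>
  ; per-segment mask; the hook is then invoked with that narrower mask.
  %iv = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  call void @llvm.vp.store.v16i32.p0(<16 x i32> %iv, ptr %p, <16 x i1> %m, i32 16)

On RISC-V the hook turns this into a masked segment store; AArch64, ARM and X86 simply return false when handed anything other than a plain StoreInst, preserving their previous behaviour.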
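
The RISC-V test updates show a second effect of the unification: the one-active-lane ("spread") special case in lowerInterleavedStore now fires for vp.stores as well, so vpstore_factor4_one_active becomes a strided store (vsse32.v with a 16-byte stride) rather than a vsseg4, and store_factor4_one_active_fullwidth drops from m4 to m1 because the lowering now extracts the <4 x i32> subvector before storing it. At the IR level the strided path produces roughly the following, assuming factor 4, active field 0, RV64 and an all-true mask; the mangled intrinsic name is inferred from the C++ above rather than quoted from a test:

  ; stride = Factor * store size of i32 = 16 bytes; the VL operand is an i32,
  ; per the vp.strided.store signature.
  call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(<4 x i32> %v, ptr %ptr, i64 16, <4 x i1> splat (i1 true), i32 4)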