From ef5080cb2efdb76c04852fdc7b2e3d8384fb6e50 Mon Sep 17 00:00:00 2001 From: Mateja Marjanovic Date: Mon, 15 Jan 2024 12:47:49 +0100 Subject: [PATCH 01/13] [AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/bf8 instructions Add VOP1, VOP1_DPP8, VOP1_DPP16, VOP3, VOP3_DPP8, VOP3_DPP16 instructions that were supported on GFX940 (MI300): - V_CVT_F32_FP8 - V_CVT_F32_BF8 - V_CVT_PK_F32_FP8 - V_CVT_PK_F32_BF8 - V_CVT_PK_FP8_F32 - V_CVT_PK_BF8_F32 - V_CVT_SR_FP8_F32 - V_CVT_SR_BF8_F32 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 4 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 68 +++- .../Disassembler/AMDGPUDisassembler.cpp | 50 ++- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 4 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 11 + llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 98 ++++- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 53 ++- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 105 +++++ .../AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir | 197 ++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 370 +++++++++++++++--- llvm/test/MC/AMDGPU/gfx12_asm_vop1.s | 45 +++ llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 36 ++ llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 108 +++++ llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 48 +++ .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 138 +++++++ .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s | 12 + .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s | 12 + .../Disassembler/AMDGPU/gfx12_dasm_vop1.txt | 36 ++ .../AMDGPU/gfx12_dasm_vop1_dpp16.txt | 12 + .../AMDGPU/gfx12_dasm_vop1_dpp8.txt | 12 + .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 36 ++ .../AMDGPU/gfx12_dasm_vop3_dpp16.txt | 108 +++++ .../AMDGPU/gfx12_dasm_vop3_dpp8.txt | 48 +++ .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt | 36 ++ .../gfx12_dasm_vop3_from_vop1_dpp16.txt | 12 + .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt | 12 + 29 files changed, 
1623 insertions(+), 75 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index c1c863d885c3a..852a99786efff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1495,6 +1495,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, + FeatureFP8Insts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, @@ -1502,7 +1503,8 @@ def FeatureISAVersion12 : FeatureSet< FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, FeatureMADIntraFwdBug, - FeatureScalarDwordx3Loads]>; + FeatureScalarDwordx3Loads, + FeatureDPPSrc1SGPR]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ba79affe683d6..b2b81446016ec 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -3500,6 +3500,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { return !isInlineConstant(Inst, OpIdx); } else if (MO.isReg()) { auto Reg = MO.getReg(); + if (!Reg) { + return false; + } const MCRegisterInfo *TRI = getContext().getRegisterInfo(); auto PReg = mc2PseudoReg(Reg); return isSGPR(PReg, TRI) && PReg != SGPR_NULL; @@ -8273,6 +8276,16 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } + if (isVOP1Cvt_F32_Fp8_Bf8_e64(Opc) && + Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && + Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I++]); + Op.addRegOrImmWithFPInputModsOperands(Inst, 1); // src0 + // Add dummy src1 + 
Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + } + for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { @@ -8321,12 +8334,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi || - Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) { + Opc == AMDGPU::V_CVT_SR_FP8_F32_vi || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) { Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods Inst.addOperand(Inst.getOperand(0)); } - if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) { + // Adding vdst_in operand is already covered for these DPP instructions in + // cvtVOP3DPP. + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) && + !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) { assert(!IsPacked); Inst.addOperand(Inst.getOperand(0)); } @@ -8765,6 +8786,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, int OldIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::old); int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers); + int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; bool IsMAC = OldIdx != -1 && Src2ModIdx != -1 && Desc.getOperandConstraint(OldIdx, MCOI::TIED_TO) == -1; @@ -8788,6 +8814,20 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const 
OperandVector &Operands, } } + if (VdstInIdx != -1) { + int NumOperands = Inst.getNumOperands(); + if (VdstInIdx == NumOperands) + Inst.addOperand(Inst.getOperand(0)); + } + + if (IsVOP3CvtSrDpp) { + int NumOperands = Inst.getNumOperands(); + if (Src2ModIdx == NumOperands) { + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + } + } + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); if (TiedTo != -1) { @@ -8801,6 +8841,13 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, Fi = Op.getImm(); } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + if (isVOP1Cvt_F32_Fp8_Bf8_e64(Inst.getOpcode()) && + Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && + Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) { + // Add dummy src1 + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + } } else if (Op.isReg()) { Op.addRegOperands(Inst, 1); } else if (Op.isImm() && @@ -8847,6 +8894,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I OptionalImmIndexMap OptionalIdx; unsigned I = 1; + const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); @@ -8874,6 +8922,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I Op.addImmOperands(Inst, 1); } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithFPInputModsOperands(Inst, 2); + if (Opc == AMDGPU::V_CVT_F32_BF8_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_dpp8_gfx12) { + // Add dummy src1 + Inst.addOperand(MCOperand::createImm(0)); + 
Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + } } else if (Op.isDppFI()) { Fi = Op.getImm(); } else if (Op.isReg()) { @@ -8884,6 +8940,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I } else { if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithFPInputModsOperands(Inst, 2); + if (Opc == AMDGPU::V_CVT_F32_BF8_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_dpp8_gfx12) { + // Add dummy src1 + Inst.addOperand(MCOperand::createImm(0)); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + } } else if (Op.isReg()) { Op.addRegOperands(Inst, 1); } else if (Op.isDPPCtrl()) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 9dff3f6c2efd0..75d0511b567bb 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -522,6 +522,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, convertVOPCDPPInst(MI); // Special VOP3 case } else { assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); + + if (AMDGPU::isVOP1Cvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { + // Add omod and clamp modifiers. + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::omod); + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::clamp); + } + convertVOP3DPPInst(MI); // Regular VOP3 case } }; @@ -691,8 +700,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW, Address, CS); - if (Res) + if (Res) { + if (AMDGPU::isVOP1Cvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { + // Add omod and clamp modifiers. 
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::clamp); + } break; + } Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, Address, CS); @@ -708,6 +724,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, AMDGPU::OpName::src2_modifiers); } + if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) { + // Insert dummy unused src2_modifiers. + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src2_modifiers); + } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && !AMDGPU::hasGDS(STI)) { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); @@ -938,6 +961,13 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { // first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); + + if (AMDGPU::isVOP1Cvt_F32_Fp8_Bf8_e64(Opc)) { + // Add omod and clamp modifiers. 
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); + } + if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { convertVOP3PDPPInst(MI); } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || @@ -947,6 +977,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { @@ -973,6 +1012,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { if (isMacDPP(MI)) convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 6c7977e22599c..1fc70f0bbbd2d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1300,7 +1300,9 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const 
MCSubtargetInfo &STI, raw_ostream &O) { unsigned Opc = MI->getOpcode(); - if (isPermlane16(Opc)) { + if (isPermlane16(Opc) || (isVOP1Cvt_F32_Fp8_Bf8_e64(Opc) && + Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && + Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12)) { auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 26ba2575ff34a..ae197ee83acc0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -503,6 +503,17 @@ bool isPermlane16(unsigned Opc) { Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; } +bool isVOP1Cvt_F32_Fp8_Bf8_e64(unsigned Opc) { + return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 || + Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12; +} + bool isGenericAtomic(unsigned Opc) { return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN || Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 50c741760d714..9d0bac084feab 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -542,6 +542,9 @@ bool isPermlane16(unsigned Opc); LLVM_READNONE bool isGenericAtomic(unsigned Opc); +LLVM_READNONE +bool isVOP1Cvt_F32_Fp8_Bf8_e64(unsigned Opc); + namespace VOPD { enum Component : unsigned { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 
d604990dc88c2..48202e2250c85 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -571,6 +571,7 @@ let SubtargetPredicate = isGFX9Only in { } // End SubtargetPredicate = isGFX9Only class VOPProfile_Base_CVT_F32_F8 : VOPProfileI2F { + let HasExtDPP = 1; let HasExtSDWA = 1; let HasExtSDWA9 = 1; let HasExt = 1; @@ -599,6 +600,7 @@ class Cvt_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { let OtherPredicates = [HasCvtFP8VOP1Bug] in { def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)), (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>; @@ -617,6 +619,7 @@ foreach Index = [1, 2, 3] in { def : Cvt_F32_F8_Pat; def : Cvt_F32_F8_Pat; } +} // End SubtargetPredicate = isGFX9Only class Cvt_PK_F32_F8_Pat : GCNPat< @@ -626,11 +629,82 @@ class Cvt_PK_F32_F8_Pat; -foreach Index = [0, -1] in { - def : Cvt_PK_F32_F8_Pat; - def : Cvt_PK_F32_F8_Pat; +let SubtargetPredicate = isGFX9Only in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat; + def : Cvt_PK_F32_F8_Pat; + } +} + + +// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions. 
+def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F { + let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, + clampmod:$clamp, omod:$omod, op_sel0:$op_sel); + + let HasOpSel = 1; + let HasExtVOP3DPP = 0; +} + +def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, i32, untyped]> { + let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, + Src1Mod:$src1_modifiers, Src1RC64:$src1, + clampmod:$clamp, omod:$omod, op_sel0:$op_sel); + let AsmVOP3OpSel = !subst(", $src1_modifiers", "", getAsmVOP3OpSel<2, 0, 0, 1, 1, 0>.ret); + + let HasOpSel = 1; + let HasExtDPP = 1; + let HasExtVOP3DPP = 1; + + let Src1VOP3DPP = Src1RC64; + let AsmVOP3DPP8 = getAsmVOP3DPP8.ret; + let AsmVOP3DPP16 = getAsmVOP3DPP16.ret; +} + +let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, + SchedRW = [WriteFloatCvt] in { + defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; + defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; + defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; +} + +class Cvt_F32_F8_Pat_OpSel index, + VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< + (f32 (node i32:$src, index)), + !if (index, + (inst_e64 !if(index{0}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), $src, + !if(index{1}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), (i32 0), + 0, 0, 0), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, 1, 2, 3] in { + def : Cvt_F32_F8_Pat_OpSel; + def : Cvt_F32_F8_Pat_OpSel; + } +} + +class Cvt_PK_F32_F8_Pat_OpSel : GCNPat< + (v2f32 (node i32:$src, index)), + !if (index, + (inst_e64 SRCMODS.OP_SEL_0, $src, 0, 0, SRCMODS.NONE), + (inst_e32 $src)) +>; + +let SubtargetPredicate = isGFX12Plus in { + foreach Index = [0, -1] in { + def : Cvt_PK_F32_F8_Pat_OpSel; + def : 
Cvt_PK_F32_F8_Pat_OpSel; + } } let SubtargetPredicate = isGFX10Plus in { @@ -854,6 +928,20 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name op, VOP3_Real_with_name; +// Define VOP1 instructions using the pseudo instruction with its old profile and +// VOP3 using the OpSel profile for the pseudo instruction. +defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">; +defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">; +defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name; + +defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name; + +defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name; +defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name; + defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c, "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">; defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index eebd323210f95..cc0d536c6e0b4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -520,8 +520,26 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile { let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, FP32InputMods:$src1_modifiers, Src1RC64:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel); + let InsVOP3DPP = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, 
bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); + let HasClamp = 0; - let HasExtVOP3DPP = 0; + let HasExtVOP3DPP = 1; } def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, @@ -530,14 +548,36 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, FP32InputMods:$src1_modifiers, Src1RC64:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel); + let InsVOP3DPP16 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi); + let InsVOP3DPP8 = (ins VGPR_32:$old, + FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, + FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, + FP32InputMods:$src2_modifiers, VGPR_32:$src2, + op_sel0:$op_sel, dpp8:$dpp8, FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; + let HasExtVOP3DPP = 1; + let HasOpSel = 1; let AsmVOP3OpSel = !subst(", $src2_modifiers", "", getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); - let HasExtVOP3DPP = 0; + let AsmVOP3DPP16 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP16.ret>.ret); + let AsmVOP3DPP8 = !subst(", $src2_modifiers", "", + getAsmVOP3DPP8.ret>.ret); } def IsPow2Plus1: PatLeaf<(i32 imm), [{ @@ -618,13 +658,13 @@ let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0, class Cvt_PK_F8_F32_Pat : GCNPat< (i32 (node f32:$src0, f32:$src1, i32:$old, index)), - (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0)) + (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0) >; class Cvt_SR_F8_F32_Pat index, VOP3_Pseudo inst> : GCNPat< (i32 (node f32:$src0, i32:$src1, i32:$old, 
index)), (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, - !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0)) + !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0) >; foreach Index = [0, -1] in { @@ -998,6 +1038,11 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>; defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>; defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>; +defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>; +defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>; +defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>; +defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>; + //===----------------------------------------------------------------------===// // GFX11, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll new file mode 100644 index 0000000000000..2b8a65cf6eaa9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 0) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 op_sel:[0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to 
shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 op_sel:[1,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3) + ret float %ret +} + +define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true) + store 
i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %tmp1, i32 %r, i32 %old, i32 0) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 2) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) +declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) +declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) +declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) + +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone 
convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir new file mode 100644 index 0000000000000..89e34a779bb96 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s + +--- +name: test_cvt_f32_bf8_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte0 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
+--- +name: test_cvt_f32_bf8_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte2 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 4, 0, 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 4, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_fp8_byte3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_fp8_byte3 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 4, [[COPY]], 4, 0, 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 4, killed %1, 4, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
+--- +name: test_cvt_pk_bf8_f32_word0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_bf8_f32_word0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_BF8_F32_e64 0, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_pk_fp8_f32_word1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_fp8_f32_word1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64_dpp [[DEF]], 8, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_sr_bf8_f32_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_bf8_f32_byte0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_BF8_F32_e64 0, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
+--- +name: test_cvt_sr_fp8_f32_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_fp8_f32_byte2 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64_dpp [[DEF]], 8, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index 26d0d702d99db..0a9dae594c74e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) @@ -9,182 +11,428 @@ declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; 
GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_bf8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_fp8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_fp8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_fp8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail 
call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0: -; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1: -; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0: -; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> 
@llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1: -; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, 
v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) { +; 
GFX940-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: 
test_cvt_sr_fp8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail 
call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 35411ee0ba2a5..c9c4fceffaeb0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -397,6 +397,51 @@ v_ctz_i32_b32 v5, src_scc v_ctz_i32_b32 v255, 0xaf123456 // GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e32 v1, s3 +// GFX12: encoding: [0x03,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, 3 +// GFX12: encoding: [0x83,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, v3 +// GFX12: encoding: [0x03,0xdb,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, s3 +// GFX12: encoding: [0x03,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, 3 +// GFX12: encoding: [0x83,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, v3 +// GFX12: encoding: [0x03,0xd9,0x02,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], s5 +// GFX12: encoding: [0x05,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 
v[3:4], 3 +// GFX12: encoding: [0x83,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], v3 +// GFX12: encoding: [0x03,0xdf,0x06,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdd,0x04,0x7e] + v_cvt_f16_f32 v5, v1 // GFX12: encoding: [0x01,0x15,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index dd6afb28c396a..5e0e1b688bc58 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -337,6 +337,18 @@ v_ctz_i32_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30] +v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] + +v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] + v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 6530de0268456..36c89710ce8f8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -73,6 +73,18 @@ v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: 
encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] + v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index cf3f9c45bdcc8..beb57999b855e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -1099,6 +1099,42 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 // GFX12: encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +v_cvt_pk_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_bf8_f32 
v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + v_cvt_pk_i16_f32 v5, v1, v2 // GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 26f63102df950..df3430f376f69 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -1015,6 +1015,114 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_m v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + 
+v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: 
[0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] 
row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index de294b1ff2a22..09dd6df618c5b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -570,6 +570,54 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX12: encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: 
[0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: 
[0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e35bb63290672..7ee60262a5c1b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -396,6 +396,144 @@ v_ctz_i32_b32_e64 v5, src_scc v_ctz_i32_b32_e64 v255, 0xaf123456 // GFX12: encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: 
[0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + 
+v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + v_cvt_f16_f32_e64 v5, v1 // GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 6b915bd14683a..808f941197c42 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -336,6 +336,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] 
+V_CVT_F32_FP8_e64_dpp v5, v1 quad_perm:[3,1,2,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0x27,0x00,0x2d] + +V_CVT_F32_FP8_e64_dpp v1, v3 quad_perm:[2,1,0,3] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0xc6,0x00,0x5e] + +V_CVT_F32_BF8_e64_dpp v5, v1 quad_perm:[0,3,2,1] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0x6c,0x00,0x2d] + +V_CVT_F32_BF8_e64_dpp v1, v3 quad_perm:[0,1,3,2] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0xb4,0x00,0x5e] + v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 61266f3776c28..f7b51cfb6bda8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -84,6 +84,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt index a839f03c42ba1..39bb7338c8074 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt @@ -397,6 +397,42 @@ # GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xda,0x02,0x7e] +0x03,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xda,0x02,0x7e] +0x83,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xdb,0x02,0x7e] +0x03,0xdb,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xd8,0x02,0x7e] +0x03,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xd8,0x02,0x7e] +0x83,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +0x03,0xd9,0x02,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] +0x03,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] +0x83,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] +0x03,0xdf,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] +0x03,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] +0x83,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] +0x03,0xdd,0x04,0x7e + # GFX12: v_cvt_f16_f32_e32 v5, v1 ; encoding: [0x01,0x15,0x0a,0x7e] 0x01,0x15,0x0a,0x7e diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index bcb9ad9febb96..5848333f41ef7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -337,6 +337,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e + # GFX12: v_cvt_f16_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 928165997dd91..d42e9ae25039b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -49,6 +49,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index db690aa99e4ab..f86903b8de44b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -993,6 +993,42 @@ # GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] 0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] 
+0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20 + +# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20 + # GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 69f61c7eb8030..1be1d6e91ad8a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -825,6 +825,114 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] 
+0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: 
[0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: 
v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: 
[0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index a7f0183016147..44b3f7594029f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -495,6 +495,54 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] +0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 
+0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 4fe4284e8eb4e..9a8368a65f3d3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -396,6 +396,42 @@ # GFX12: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 + +# 
GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 + # GFX12: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index e914d139e240e..8af274e0b4028 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -336,6 +336,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 2a4b677620d38..3d48d58c775b1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -72,6 +72,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 From a91e42a473541f8ca95190fe53f425fe4b055c8d Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Wed, 17 Jan 2024 12:54:11 +0100 Subject: [PATCH 02/13] Update tests --- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll index 2b8a65cf6eaa9..62b374fac5c0c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -11,6 +11,16 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { ret float %ret } +define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1) + ret float %ret +} + define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { ; GFX12-LABEL: test_cvt_f32_bf8_byte2: ; GFX12: ; %bb.0: @@ -76,6 +86,21 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a ret void } +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 1) + store i32 %ret, ptr addrspace(1) %out + ret void +} + define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: ; GFX12: ; %bb.0: From 8e058e0a415334877d5fee66756fab893b7a112d Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 18 Jan 2024 12:43:09 +0100 Subject: [PATCH 03/13] Update feature naming to FP8ConversionInsts --- llvm/lib/Target/AMDGPU/AMDGPU.td | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 1f16e610ab220..975d58171a803 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1502,7 +1502,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, - FeatureFP8Insts, + FeatureFP8ConversionInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, FeatureSALUFloatInsts, From 6149014b2e1779648f0567dac890a58779163bb9 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 18 Jan 2024 13:48:32 +0100 Subject: [PATCH 04/13] Update test --- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 2 +- .../AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 144 +++++++++++++++--- 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll index 62b374fac5c0c..3620806971282 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { ; GFX12-LABEL: test_cvt_f32_bf8_byte0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir index 89e34a779bb96..65ef120c6ed22 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs 
-run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s --- name: test_cvt_f32_bf8_byte0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index 0a9dae594c74e..17b1fcf865e94 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -20,7 +20,11 @@ define float @test_cvt_f32_bf8_byte0(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_bf8_byte0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) @@ -36,7 +40,11 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_bf8_byte1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) @@ -52,7 +60,11 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_bf8_byte2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2) @@ 
-68,7 +80,11 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_bf8_byte3: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3) @@ -84,7 +100,11 @@ define float @test_cvt_f32_fp8_byte0(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_fp8_byte0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) @@ -100,7 +120,11 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_fp8_byte1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) @@ -116,7 +140,11 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_fp8_byte2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, 
i32 2) @@ -132,7 +160,11 @@ define float @test_cvt_f32_fp8_byte3(i32 %a) { ; ; GFX12-LABEL: test_cvt_f32_fp8_byte3: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3) @@ -148,7 +180,11 @@ define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { ; ; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) @@ -164,7 +200,11 @@ define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { ; ; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) @@ -180,7 +220,11 @@ define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { ; ; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 ; 
GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) @@ -196,7 +240,11 @@ define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { ; ; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) @@ -213,7 +261,11 @@ define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) { ; ; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -233,7 +285,11 @@ define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) { ; ; GFX12-LABEL: test_cvt_pk_bf8_f32_word1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -252,7 +308,11 @@ define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) { ; ; GFX12-LABEL: test_cvt_pk_fp8_f32_word0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -272,7 +332,11 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { ; ; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -291,7 +355,11 @@ define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -310,7 +378,11 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -330,7 +402,11 @@ define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: 
s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -350,7 +426,11 @@ define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -369,7 +449,11 @@ define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -388,7 +472,11 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -408,7 +496,11 @@ define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 @@ -428,7 +520,11 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { ; ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v0, v2 From 4996c773910a647fcc4bde4c20ba4bca66636d0b Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 18 Jan 2024 21:14:44 +0100 Subject: [PATCH 05/13] Move instruction closer to place where used --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b2b81446016ec..7f23674dadcff 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8786,11 +8786,6 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, int OldIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::old); int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers); - int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); - bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; bool 
IsMAC = OldIdx != -1 && Src2ModIdx != -1 && Desc.getOperandConstraint(OldIdx, MCOI::TIED_TO) == -1; @@ -8814,15 +8809,17 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, } } - if (VdstInIdx != -1) { - int NumOperands = Inst.getNumOperands(); - if (VdstInIdx == NumOperands) - Inst.addOperand(Inst.getOperand(0)); + int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); + if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) { + Inst.addOperand(Inst.getOperand(0)); } + bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12; if (IsVOP3CvtSrDpp) { - int NumOperands = Inst.getNumOperands(); - if (Src2ModIdx == NumOperands) { + if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) { Inst.addOperand(MCOperand::createImm(0)); Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); } From a36de86751e6ae357a81c1502fa98416a7a71e1b Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 18 Jan 2024 21:18:53 +0100 Subject: [PATCH 06/13] Don't use getMCReg --- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 7f23674dadcff..ba85f53c5369f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8283,7 +8283,7 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, Op.addRegOrImmWithFPInputModsOperands(Inst, 1); // src0 // Add dummy src1 Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + Inst.addOperand(MCOperand::createReg(0)); } for (unsigned E = Operands.size(); I != E; ++I) { @@ -8821,7 +8821,7 @@ void
AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, if (IsVOP3CvtSrDpp) { if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) { Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + Inst.addOperand(MCOperand::createReg(0)); } } @@ -8843,7 +8843,7 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) { // Add dummy src1 Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + Inst.addOperand(MCOperand::createReg(0)); } } else if (Op.isReg()) { Op.addRegOperands(Inst, 1); @@ -8925,7 +8925,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I Opc == AMDGPU::V_CVT_F32_FP8_dpp8_gfx12) { // Add dummy src1 Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + Inst.addOperand(MCOperand::createReg(0)); } } else if (Op.isDppFI()) { Fi = Op.getImm(); @@ -8943,7 +8943,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I Opc == AMDGPU::V_CVT_F32_FP8_dpp8_gfx12) { // Add dummy src1 Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(0, getSTI()))); + Inst.addOperand(MCOperand::createReg(0)); } } else if (Op.isReg()) { Op.addRegOperands(Inst, 1); From afe32a9277b8393a4a8d317134690c0e567d842a Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 18 Jan 2024 21:21:15 +0100 Subject: [PATCH 07/13] Remove VOP1 suffix from function name --- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 5 ++--- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 6 +++--- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 2 +- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ba85f53c5369f..cc4ba220bb2b6 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8276,8 +8276,7 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - if (isVOP1Cvt_F32_Fp8_Bf8_e64(Opc) && - Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && + if (isCvt_F32_Fp8_Bf8_e64(Opc) && Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I++]); Op.addRegOrImmWithFPInputModsOperands(Inst, 1); // src0 @@ -8838,7 +8837,7 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, Fi = Op.getImm(); } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); - if (isVOP1Cvt_F32_Fp8_Bf8_e64(Inst.getOpcode()) && + if (isCvt_F32_Fp8_Bf8_e64(Inst.getOpcode()) && Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) { // Add dummy src1 diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 75d0511b567bb..ecc5efa53dfb7 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -523,7 +523,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } else { assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); - if (AMDGPU::isVOP1Cvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { + if (AMDGPU::isCvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { // Add omod and clamp modifiers. 
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); @@ -701,7 +701,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW, Address, CS); if (Res) { - if (AMDGPU::isVOP1Cvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { + if (AMDGPU::isCvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { // Add omod and clamp modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); insertNamedMCOperand(MI, MCOperand::createImm(0), @@ -962,7 +962,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); - if (AMDGPU::isVOP1Cvt_F32_Fp8_Bf8_e64(Opc)) { + if (AMDGPU::isCvt_F32_Fp8_Bf8_e64(Opc)) { // Add omod and clamp modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 1fc70f0bbbd2d..ee400ba19c7cd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1300,7 +1300,7 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Opc = MI->getOpcode(); - if (isPermlane16(Opc) || (isVOP1Cvt_F32_Fp8_Bf8_e64(Opc) && + if (isPermlane16(Opc) || (isCvt_F32_Fp8_Bf8_e64(Opc) && Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12)) { auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 6910fb998d50d..0834ff4b420fe 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -541,7 +541,7 @@ bool isPermlane16(unsigned Opc) { Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12; } -bool isVOP1Cvt_F32_Fp8_Bf8_e64(unsigned Opc) { +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) { return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 || Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 || Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index b911b742ba0da..b5c382c512a18 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -543,7 +543,7 @@ LLVM_READNONE bool isGenericAtomic(unsigned Opc); LLVM_READNONE -bool isVOP1Cvt_F32_Fp8_Bf8_e64(unsigned Opc); +bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc); namespace VOPD { From 7de5e75390cbe9c84367b53139343324fef33bcb Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 19 Jan 2024 16:46:15 +0100 Subject: [PATCH 08/13] Add fp8-conversion-insts feature to clang --- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 +-- .../test/CodeGenOpenCL/builtins-amdgcn-fp8.cl | 35 ++++++++++--------- llvm/lib/TargetParser/TargetParser.cpp | 1 + 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 7495bca72a9df..e1021241728c6 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index 56d757012a5e7..4e3a56b4201bb 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,59 +1,60 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); -// CHECK-GFX940-LABEL: @test_cvt_f32_bf8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) +// CHECK-LABEL: @test_cvt_f32_bf8 +// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) void test_cvt_f32_bf8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_bf8(a, 0); } -// CHECK-GFX940-LABEL: @test_cvt_f32_fp8 -// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) +// CHECK-LABEL: @test_cvt_f32_fp8 +// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) void test_cvt_f32_fp8(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_fp8(a, 1); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8 -// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) +// CHECK-LABEL: @test_cvt_pk_f32_bf8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) void test_cvt_pk_f32_bf8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8 -// CHECK-GFX940: call <2 x float> 
@llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) +// CHECK-LABEL: @test_cvt_pk_f32_fp8 +// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) void test_cvt_pk_f32_fp8(global v2f* out, int a) { *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true); } -// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) +// CHECK-LABEL: @test_cvt_pk_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false) void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false); } -// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) +// CHECK-LABEL: @test_cvt_pk_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true) void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b) { *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true); } -// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) +// CHECK-LABEL: @test_cvt_sr_bf8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2) void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2); } -// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32 -// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) +// CHECK-LABEL: @test_cvt_sr_fp8_f32 +// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3) void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b) { *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3); diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 2cfe23676d20f..969df7f49aee8 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ 
b/llvm/lib/TargetParser/TargetParser.cpp @@ -296,6 +296,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; Features["gws"] = true; + Features["fp8-conversion-insts"] = true; break; case GK_GFX1151: case GK_GFX1150: From 3211bc3731d9d8e62c377c026a3f3765723a287f Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Mon, 22 Jan 2024 15:12:45 +0100 Subject: [PATCH 09/13] Remove OP_SEL_1 from Cvt_F32_F8_Pat_OpSel pattern --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b1e2000a523b4..9d6b0512c60e2 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -675,8 +675,8 @@ class Cvt_F32_F8_Pat_OpSel index, VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< (f32 (node i32:$src, index)), !if (index, - (inst_e64 !if(index{0}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), $src, - !if(index{1}, SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), (i32 0), + (inst_e64 !if(index{0}, SRCMODS.OP_SEL_0, 0), $src, + !if(index{1}, SRCMODS.OP_SEL_0, 0), (i32 0), 0, 0, 0), (inst_e32 $src)) >; From 55d4d1b6e8f7f3d83637d83bc58e4e33e478f398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Mon, 22 Jan 2024 17:44:46 +0100 Subject: [PATCH 10/13] =?UTF-8?q?Update=20handling=20op=5Fsel=20by=20Mirko?= =?UTF-8?q?=20Brku=C5=A1anin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 16 ---------- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 14 +++++++-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 ++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 21 ++++---------- llvm/lib/Target/AMDGPU/VOPInstructions.td | 29 +++++++++++++------ .../AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir | 8 ++--- 6 files changed, 43 insertions(+), 47 
deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 035f4b612591c..05836448fd8f7 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8257,15 +8257,6 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - if (isCvt_F32_Fp8_Bf8_e64(Opc) && Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && - Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I++]); - Op.addRegOrImmWithFPInputModsOperands(Inst, 1); // src0 - // Add dummy src1 - Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(0)); - } - for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { @@ -8818,13 +8809,6 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, Fi = Op.getImm(); } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); - if (isCvt_F32_Fp8_Bf8_e64(Inst.getOpcode()) && - Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && - Inst.getOpcode() != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12) { - // Add dummy src1 - Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(0)); - } } else if (Op.isReg()) { Op.addRegOperands(Inst, 1); } else if (Op.isImm() && diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index b9b9ccbf77242..9e64e3fd79576 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1305,9 +1305,17 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned, const MCSubtargetInfo &STI, raw_ostream &O) { 
unsigned Opc = MI->getOpcode(); - if (isPermlane16(Opc) || (isCvt_F32_Fp8_Bf8_e64(Opc) && - Opc != AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 && - Opc != AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12)) { + if (isCvt_F32_Fp8_Bf8_e64(Opc)) { + auto SrcMod = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + unsigned Mod = MI->getOperand(SrcMod).getImm(); + unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0); + unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1); + if (Index0 || Index1) + O << " op_sel:[" << Index0 << ',' << Index1 << ']'; + return; + } + if (isPermlane16(Opc)) { auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 3aeed6aec3650..71341b9592359 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2282,6 +2282,8 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit IsSingle = 0; field bit IsWMMA = 0; + field bit IsFP8 = 0; + field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 9d6b0512c60e2..655aafc3684e3 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -641,26 +641,16 @@ let SubtargetPredicate = isGFX9Only in { // Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions. 
def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F { - let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, - clampmod:$clamp, omod:$omod, op_sel0:$op_sel); - let HasOpSel = 1; let HasExtVOP3DPP = 0; } -def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, i32, untyped]> { - let InsVOP3OpSel = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, - Src1Mod:$src1_modifiers, Src1RC64:$src1, - clampmod:$clamp, omod:$omod, op_sel0:$op_sel); - let AsmVOP3OpSel = !subst(", $src1_modifiers", "", getAsmVOP3OpSel<2, 0, 0, 1, 1, 0>.ret); - +def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> { let HasOpSel = 1; let HasExtDPP = 1; let HasExtVOP3DPP = 1; - + let IsFP8 = 1; let Src1VOP3DPP = Src1RC64; - let AsmVOP3DPP8 = getAsmVOP3DPP8.ret; - let AsmVOP3DPP16 = getAsmVOP3DPP16.ret; } let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, @@ -675,9 +665,10 @@ class Cvt_F32_F8_Pat_OpSel index, VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat< (f32 (node i32:$src, index)), !if (index, - (inst_e64 !if(index{0}, SRCMODS.OP_SEL_0, 0), $src, - !if(index{1}, SRCMODS.OP_SEL_0, 0), (i32 0), - 0, 0, 0), + (inst_e64 !if(index{0}, + !if(index{1}, 12 /*SRCMODS.OP_SEL_0 | SRCMODS.OP_SEL_1*/, SRCMODS.OP_SEL_0), + !if(index{1}, SRCMODS.OP_SEL_1, 0)), + $src, 0, 0, 0), (inst_e32 $src)) >; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index df505c3365cbd..8b6d1ddba595e 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -305,6 +305,11 @@ class VOP3OpSel_gfx10 op, VOPProfile p> : VOP3e_gfx10 { class VOP3OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx10; +class VOP3FP8OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10 { + let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0); + let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0); +} + class VOP3DotOpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx11_gfx12{ let Inst{11} = ?; let Inst{12} 
= ?; @@ -738,7 +743,7 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); - let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?); let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); let Inst{15} = !if(P.HasClamp, clamp, 0); @@ -1406,14 +1411,20 @@ multiclass VOP3_Real_with_name op, string opName, defvar ps = !cast(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3OpSel_gfx11_gfx12; - if !not(ps.Pfl.HasOpSel) then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3e_gfx11_gfx12; + if ps.Pfl.IsFP8 then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3FP8OpSel_gfx11_gfx12; + } else { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3OpSel_gfx11_gfx12; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_gfx11_gfx12; + } } def Gen.Suffix#"_VOP3_alias" : MnemonicAlias, Requires<[Gen.AssemblerPredicate]>, LetDummies; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir index 65ef120c6ed22..c197e58875600 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -35,12 +35,12 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = 
V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 4, 0, 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec - %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 4, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, 0, 0, implicit $mode, implicit $exec $vgpr0 = COPY %2 SI_RETURN_TO_EPILOG $vgpr0 @@ -57,12 +57,12 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 4, [[COPY]], 4, 0, 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 12, [[COPY]], 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec - %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 4, killed %1, 4, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, 0, 0, implicit $mode, implicit $exec $vgpr0 = COPY %2 SI_RETURN_TO_EPILOG $vgpr0 From 55d611a2f4b6ff45ed3484e2a04e80b449d03ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Mon, 22 Jan 2024 20:13:46 +0100 Subject: [PATCH 11/13] =?UTF-8?q?Remove=20dummy=20operands=20by=20Mirko=20?= =?UTF-8?q?Brku=C5=A1anin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 17 ------------- .../Disassembler/AMDGPUDisassembler.cpp | 24 +------------------ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 5 ++-- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 8 +++++-- .../AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir | 8 +++---- 5 files changed, 14 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 05836448fd8f7..3f6ad249fe113 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8855,7 +8855,6 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I OptionalImmIndexMap OptionalIdx; unsigned I = 1; - const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); @@ -8883,14 +8882,6 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I Op.addImmOperands(Inst, 1); } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithFPInputModsOperands(Inst, 2); - if (Opc == AMDGPU::V_CVT_F32_BF8_dpp_gfx12 || - Opc == AMDGPU::V_CVT_F32_FP8_dpp_gfx12 || - Opc == AMDGPU::V_CVT_F32_BF8_dpp8_gfx12 || - Opc == AMDGPU::V_CVT_F32_FP8_dpp8_gfx12) { - // Add dummy src1 - Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(0)); - } } else if (Op.isDppFI()) { Fi = Op.getImm(); } else if (Op.isReg()) { @@ -8901,14 +8892,6 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I } else { if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithFPInputModsOperands(Inst, 2); - if (Opc == AMDGPU::V_CVT_F32_BF8_dpp_gfx12 || - Opc == AMDGPU::V_CVT_F32_FP8_dpp_gfx12 || - Opc == AMDGPU::V_CVT_F32_BF8_dpp8_gfx12 || - Opc == 
AMDGPU::V_CVT_F32_FP8_dpp8_gfx12) { - // Add dummy src1 - Inst.addOperand(MCOperand::createImm(0)); - Inst.addOperand(MCOperand::createReg(0)); - } } else if (Op.isReg()) { Op.addRegOperands(Inst, 1); } else if (Op.isDPPCtrl()) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ee1790d2e0a70..2c6aa2ee348a1 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -522,15 +522,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, convertVOPCDPPInst(MI); // Special VOP3 case } else { assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); - - if (AMDGPU::isCvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { - // Add omod and clamp modifiers. - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::omod); - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::clamp); - } - convertVOP3DPPInst(MI); // Regular VOP3 case } }; @@ -704,15 +695,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW, Address, CS); - if (Res) { - if (AMDGPU::isCvt_F32_Fp8_Bf8_e64(MI.getOpcode())) { - // Add omod and clamp modifiers. - insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::clamp); - } + if (Res) break; - } Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, Address, CS); @@ -966,12 +950,6 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); - if (AMDGPU::isCvt_F32_Fp8_Bf8_e64(Opc)) { - // Add omod and clamp modifiers. 
- insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); - insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); - } - if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { convertVOP3PDPPInst(MI); } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 71341b9592359..99c0a17a68f94 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1686,8 +1686,9 @@ class getIns64 let HasExtDPP = 1; let HasExtVOP3DPP = 1; let IsFP8 = 1; + let HasClamp = 0; + let HasOMod = 0; + let HasModifiers = 1; let Src1VOP3DPP = Src1RC64; } @@ -666,9 +669,10 @@ class Cvt_F32_F8_Pat_OpSel index, (f32 (node i32:$src, index)), !if (index, (inst_e64 !if(index{0}, - !if(index{1}, 12 /*SRCMODS.OP_SEL_0 | SRCMODS.OP_SEL_1*/, SRCMODS.OP_SEL_0), + !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1), + SRCMODS.OP_SEL_0), !if(index{1}, SRCMODS.OP_SEL_1, 0)), - $src, 0, 0, 0), + $src, 0), (inst_e32 $src)) >; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir index c197e58875600..40d2923b68a8c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -35,12 +35,12 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_dpp 
%0, %0, 228, 15, 15, -1, implicit $exec - %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec $vgpr0 = COPY %2 SI_RETURN_TO_EPILOG $vgpr0 @@ -57,12 +57,12 @@ body: | ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 12, [[COPY]], 0, 0, 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 12, [[COPY]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec - %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec $vgpr0 = COPY %2 SI_RETURN_TO_EPILOG $vgpr0 From 585e918fa69d6dae6a70ef805de456557abfaad9 Mon Sep 17 00:00:00 2001 From: Mirko Brkusanin Date: Tue, 23 Jan 2024 12:35:30 +0100 Subject: [PATCH 12/13] [AMDGPU] Properly check op_sel in GCNDPPCombine --- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 28 +++++-- .../test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 79 ++++++++++++++++++- 2 files changed, 95 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index a75082268c773..94d28dc0a2c74 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -274,8 +274,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } - if (auto *Mod0 = TII->getNamedOperand(OrigMI, - AMDGPU::OpName::src0_modifiers)) { + auto *Mod0 = 
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers); + if (Mod0) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers)); assert(HasVOP3DPP || @@ -298,8 +298,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst->getOperand(NumOperands).setIsKill(false); ++NumOperands; - if (auto *Mod1 = TII->getNamedOperand(OrigMI, - AMDGPU::OpName::src1_modifiers)) { + auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers); + if (Mod1) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers)); assert(HasVOP3DPP || @@ -330,8 +330,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src1); ++NumOperands; } - if (auto *Mod2 = - TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) { + + auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers); + if (Mod2) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); assert(HasVOP3DPP || @@ -350,6 +351,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src2); ++NumOperands; } + if (HasVOP3DPP) { auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) { @@ -368,7 +370,13 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, // all 1. if (auto *OpSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { - auto OpSel = OpSelOpr->getImm(); + int64_t OpSel = 0; + OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0); + OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0); + OpSel |= (Mod2 ? 
(!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0); + if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI)) + OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3; + if (OpSel != 0) { LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n"); Fail = true; @@ -379,7 +387,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } if (auto *OpSelHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { - auto OpSelHi = OpSelHiOpr->getImm(); + int64_t OpSelHi = 0; + OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0); + OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0); + OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0); + // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check // the bitmask for 3 op_sel_hi bits set assert(Src2 && "Expected vop3p with 3 operands"); diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index 89ea3ed12aea3..3df087256ad6e 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -83,7 +83,8 @@ body: | # Regression test for src_modifiers on base u16 opcode # GCN-label: name: vop3_u16 # GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec -# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 4, %5, 8, %5, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec name: vop3_u16 tracksRegLiveness: true body: | @@ -97,7 +98,9 @@ body: | %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec %6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec - %7:vgpr_32 = V_ADD_NC_U16_e64 4, %6, 8, %5, 0, 0, implicit $exec + %7:vgpr_32 = V_ADD_NC_U16_e64 1, %6, 2, %5, 0, 0, implicit 
$exec + %8:vgpr_32 = V_MOV_B32_dpp %3, %7, 1, 15, 15, 1, implicit $exec + %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec ... name: vop3p @@ -116,7 +119,7 @@ body: | ; GCN: [[V_DOT2_F32_F16_:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp]], 0, [[COPY]], 0, [[COPY2]], 0, 5, 0, 0, 0, implicit $mode, implicit $exec ; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec ; GCN: [[V_DOT2_F32_F16_1:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, 4, 0, 0, implicit $mode, implicit $exec - ; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 13, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 9, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec ; GCN: [[V_FMA_MIX_F32_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec ; GCN: [[V_FMA_MIXLO_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 0, [[COPY2]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec ; GCN: [[V_FMA_MIXHI_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, [[COPY]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec @@ -134,7 +137,7 @@ body: | %7:vgpr_32 = V_DOT2_F32_F16 0, %6, 0, %0, 0, %2, 0, 0, 4, 0, 0, implicit $mode, implicit $exec %8:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec - %9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 13, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec + %9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 9, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec %10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec %11:vgpr_32 = V_FMA_MIX_F32 8, %10, 8, %0, 8, %2, 
1, 0, 7, implicit $mode, implicit $exec @@ -871,3 +874,71 @@ body: | %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec ... + +# Check op_sel is all 0s when combining +# GCN-LABEL: name: opsel_vop3 +# GCN: %4:vgpr_32 = V_ADD_I16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec +# GCN: %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec +# GCN: %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec +name: opsel_vop3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + ; Combine for op_sel:[0,0,0] + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_ADD_I16_e64 0, %3, 0, %1, 0, 0, implicit $exec + + ; Do not combine for op_sel:[1,0,0] + %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec + + ; Do not combine for op_sel:[0,1,0] + %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec + + ; Do not combine for op_sel:[1,1,0] + %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec +... 
+ +# Check op_sel is all 0s and op_sel_hi is all 1s when combining +# GCN-LABEL: name: opsel_vop3p +# GCN: %5:vgpr_32 = V_FMA_MIX_F32 0, %4, 0, %1, 0, %2, 0, 0, 0, implicit $mode, implicit $exec +# GCN: %7:vgpr_32 = V_FMA_MIX_F32 4, %6, 4, %1, 4, %2, 0, 0, 0, implicit $mode, implicit $exec +# GCN: %9:vgpr_32 = V_FMA_MIX_F32_dpp %3, 8, %0, 8, %1, 8, %2, 0, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %11:vgpr_32 = V_FMA_MIX_F32 12, %10, 12, %1, 12, %2, 0, 0, 0, implicit $mode, implicit $exec + +name: opsel_vop3p +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + + ; Do not combine for op_sel:[0,0,0] op_sel_hi:[0,0,0] + %4:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %5:vgpr_32 = V_FMA_MIX_F32 0, %4, 0, %1, 0, %2, 0, 0, 0, implicit $mode, implicit $exec + + ; Do not combine for op_sel:[1,1,1] op_sel_hi:[0,0,0] + %6:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %7:vgpr_32 = V_FMA_MIX_F32 4, %6, 4, %1, 4, %2, 0, 0, 0, implicit $mode, implicit $exec + + ; Combine for op_sel:[0,0,0] op_sel_hi:[1,1,1] + %8:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %9:vgpr_32 = V_FMA_MIX_F32 8, %8, 8, %1, 8, %2, 0, 0, 0, implicit $mode, implicit $exec + + ; Do not combine for op_sel:[1,1,1] op_sel_hi:[1,1,1] + %10:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %11:vgpr_32 = V_FMA_MIX_F32 12, %10, 12, %1, 12, %2, 0, 0, 0, implicit $mode, implicit $exec +... 
From 1db3bf431d1a3d7880985f4638d71dbdc5630172 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Tue, 23 Jan 2024 19:48:06 +0100 Subject: [PATCH 13/13] Update tests --- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 24 ++++++++++++++----- .../AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir | 24 +++++++++---------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll index 3620806971282..f49fec60892cd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -14,7 +14,9 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { ; GFX12-LABEL: test_cvt_f32_bf8_byte1: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] ; GFX12-NEXT: ; return to shader part epilog %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1) @@ -24,7 +26,9 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { ; GFX12-LABEL: test_cvt_f32_bf8_byte2: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 op_sel:[0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] ; GFX12-NEXT: ; return to shader part epilog %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %ret = tail call float 
@llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2) @@ -34,7 +38,9 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) { ; GFX12-LABEL: test_cvt_f32_fp8_byte3: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 op_sel:[1,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] ; GFX12-NEXT: ; return to shader part epilog %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3) @@ -59,7 +65,9 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { ; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_cvt_pk_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] ; GFX12-NEXT: global_store_b32 v[3:4], v2, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -89,7 +97,9 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] ; GFX12-NEXT: global_store_b32 v[3:4], v2, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -104,7 +114,9 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { ; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 op_sel:[0,0,0,1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] ; GFX12-NEXT: global_store_b32 v[3:4], v2, off ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir index 40d2923b68a8c..d11fb27640ee7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -34,9 +34,9 @@ body: | ; GFX12: liveins: $vgpr0 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], 8, [[COPY]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec - ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]] + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 
228, 15, 15, -1, implicit $exec @@ -56,9 +56,9 @@ body: | ; GFX12: liveins: $vgpr0 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], 12, [[COPY]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec - ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]] + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]] ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec @@ -115,9 +115,9 @@ body: | ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64_dpp [[DEF]], 8, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec - ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_]], 0, 0, implicit $exec ; GFX12-NEXT: S_ENDPGM 0 %4:vgpr_32 = COPY $vgpr4 %3:vgpr_32 = COPY $vgpr3 @@ -179,9 +179,9 @@ 
body: | ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64_dpp [[DEF]], 8, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec - ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], 0, [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_]], 0, 0, implicit $exec ; GFX12-NEXT: S_ENDPGM 0 %4:vgpr_32 = COPY $vgpr4 %3:vgpr_32 = COPY $vgpr3