Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit e12cbd8

Browse filesBrowse files
authored
[AMDGPU] Fix scale opsel flags for scaled MFMA operations (#140183)
Fix for src scale opsel flags encoding and ASM parsing for gfx950 scaled MFMA.
1 parent d219a71 commit e12cbd8
Copy full SHA for e12cbd8

File tree

Expand file treeCollapse file tree

7 files changed

+254
-78
lines changed
Filter options
Expand file treeCollapse file tree

7 files changed

+254
-78
lines changed

‎llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Copy file name to clipboardExpand all lines: llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+80-7Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1878,6 +1878,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
18781878

18791879
void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
18801880
OptionalImmIndexMap &OptionalIdx);
1881+
void cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands);
18811882
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
18821883
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
18831884
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
@@ -6784,17 +6785,25 @@ ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) {
67846785
return ParseStatus::Success;
67856786
}
67866787

6787-
static void addOptionalImmOperand(
6788-
MCInst& Inst, const OperandVector& Operands,
6789-
AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
6790-
AMDGPUOperand::ImmTy ImmT,
6791-
int64_t Default = 0) {
6788+
static void
6789+
addOptionalImmOperand(MCInst &Inst, const OperandVector &Operands,
6790+
AMDGPUAsmParser::OptionalImmIndexMap &OptionalIdx,
6791+
AMDGPUOperand::ImmTy ImmT, int64_t Default = 0,
6792+
std::optional<unsigned> InsertAt = std::nullopt) {
67926793
auto i = OptionalIdx.find(ImmT);
67936794
if (i != OptionalIdx.end()) {
67946795
unsigned Idx = i->second;
6795-
((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1);
6796+
const AMDGPUOperand &Op =
6797+
static_cast<const AMDGPUOperand &>(*Operands[Idx]);
6798+
if (InsertAt)
6799+
Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Op.getImm()));
6800+
else
6801+
Op.addImmOperands(Inst, 1);
67966802
} else {
6797-
Inst.addOperand(MCOperand::createImm(Default));
6803+
if (InsertAt.has_value())
6804+
Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Default));
6805+
else
6806+
Inst.addOperand(MCOperand::createImm(Default));
67986807
}
67996808
}
68006809

@@ -8811,6 +8820,70 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
88118820
Inst.getOperand(ModIdx).setImm(ModVal);
88128821
}
88138822
}
8823+
/// Build the MCInst operand list for a scaled MFMA instruction from its
/// parsed operands.
///
/// Steps, in order:
///  1. add the defs as register operands;
///  2. add the remaining sources, recording optional immediate modifiers by
///     kind instead of emitting them;
///  3. insert cbsz and blgp at their named operand positions (default 0);
///  4. append two src_modifiers operands and fill them from op_sel /
///     op_sel_hi.
///
/// \param Inst     Instruction being built; its opcode is already set.
/// \param Operands Parsed operands; conversion starts at index 1.
void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
                                    const OperandVector &Operands) {
  const unsigned Opcode = Inst.getOpcode();
  const MCInstrDesc &InstDesc = MII.get(Opcode);
  OptionalImmIndexMap OptionalModifiers;

  // Operands[0] is not converted (presumably the mnemonic token — confirm).
  unsigned ParsedIdx = 1;

  const unsigned NumDefs = InstDesc.getNumDefs();
  for (unsigned Def = 0; Def != NumDefs; ++Def, ++ParsedIdx)
    static_cast<AMDGPUOperand &>(*Operands[ParsedIdx]).addRegOperands(Inst, 1);

  // NOTE(review): the input-modifier check below is keyed on the MCInst's
  // current operand count, while cbsz/blgp are only inserted after this
  // loop. Confirm the scale operands are classified against the intended
  // descriptor slots.
  for (const unsigned NumParsed = Operands.size(); ParsedIdx != NumParsed;
       ++ParsedIdx) {
    AMDGPUOperand &Parsed = static_cast<AMDGPUOperand &>(*Operands[ParsedIdx]);

    if (isRegOrImmWithInputMods(InstDesc, Inst.getNumOperands())) {
      Parsed.addRegOrImmWithFPInputModsOperands(Inst, 2);
      continue;
    }
    if (Parsed.isImmModifier()) {
      // Remember where the modifier was parsed; it is materialised later.
      OptionalModifiers[Parsed.getImmTy()] = ParsedIdx;
      continue;
    }
    Parsed.addRegOrImmOperands(Inst, 1);
  }

  // Insert CBSZ and BLGP operands for F8F6F4 variants
  const int CbszPos = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::cbsz);
  addOptionalImmOperand(Inst, Operands, OptionalModifiers,
                        AMDGPUOperand::ImmTyCBSZ, 0, CbszPos);
  const int BlgpPos = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::blgp);
  addOptionalImmOperand(Inst, Operands, OptionalModifiers,
                        AMDGPUOperand::ImmTyBLGP, 0, BlgpPos);

  // Add dummy src_modifiers; their values are overwritten below.
  Inst.addOperand(MCOperand::createImm(0));
  Inst.addOperand(MCOperand::createImm(0));

  // Handle op_sel fields: a modifier that was not parsed reads as 0.
  auto ReadModifierMask = [&](AMDGPUOperand::ImmTy Ty) -> unsigned {
    auto It = OptionalModifiers.find(Ty);
    if (It == OptionalModifiers.end())
      return 0;
    return static_cast<const AMDGPUOperand &>(*Operands[It->second]).getImm();
  };
  const unsigned OpSelMask = ReadModifierMask(AMDGPUOperand::ImmTyOpSel);
  const unsigned OpSelHiMask = ReadModifierMask(AMDGPUOperand::ImmTyOpSelHi);

  const AMDGPU::OpName ModifierNames[] = {AMDGPU::OpName::src0_modifiers,
                                          AMDGPU::OpName::src1_modifiers};

  // Bit N of op_sel / op_sel_hi selects OP_SEL_0 / OP_SEL_1 in srcN_modifiers.
  unsigned Src = 0;
  for (const AMDGPU::OpName Name : ModifierNames) {
    const unsigned Bit = 1u << Src;
    unsigned Mods = 0;
    if (OpSelMask & Bit)
      Mods |= SISrcMods::OP_SEL_0;
    if (OpSelHiMask & Bit)
      Mods |= SISrcMods::OP_SEL_1;

    const int ModsPos = AMDGPU::getNamedOperandIdx(Opcode, Name);
    Inst.getOperand(ModsPos).setImm(Mods);
    ++Src;
  }
}
88148887

88158888
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
88168889
OptionalImmIndexMap &OptionalIdx) {

‎llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Copy file name to clipboardExpand all lines: llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+7-8Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -829,12 +829,12 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
829829
// Currently assumes scaled instructions never have abid
830830
class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
831831
!if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
832-
node:$scale_src0_opsel, node:$scale_src0,
833-
node:$scale_src1_opsel, node:$scale_src1),
832+
node:$src0_modifiers, node:$scale_src0,
833+
node:$src1_modifiers, node:$scale_src1),
834834
!con((ops node:$src0, node:$src1, node:$src2, node:$cbsz),
835835
!if(HasAbid, (ops node:$abid), (ops)),
836836
(ops node:$blgp))),
837-
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1),
837+
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1),
838838
!if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
839839
(Op $src0, $src1, $src2, $cbsz, $blgp))),
840840
pred
@@ -895,12 +895,12 @@ class ScaledMAIInst<string OpName, MAIInst BaseInst, SDPatternOperator node> :
895895
let InOperandList = !con(BaseInst.InOperandList,
896896
(ins VSrc_b32:$scale_src0,
897897
VSrc_b32:$scale_src1,
898-
op_sel0:$scale_src0_opsel,
899-
op_sel_hi0:$scale_src1_opsel));
898+
op_sel0:$src0_modifiers,
899+
op_sel_hi0:$src1_modifiers));
900900
let AsmOperands =
901901
"$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1"
902-
"$scale_src0_opsel$scale_src1_opsel$cbsz$blgp";
903-
902+
"$src0_modifiers$src1_modifiers$cbsz$blgp";
903+
let AsmMatchConverter = "cvtScaledMFMA";
904904
let FixedSize = 1;
905905
let Size = 16;
906906
}
@@ -2041,7 +2041,6 @@ multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
20412041
defvar PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64");
20422042
defvar Name = PS_ACD.Mnemonic;
20432043
defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8";
2044-
20452044
let SubtargetPredicate = HasGFX950Insts,
20462045
DecoderNamespace = "GFX940",
20472046
AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {

‎llvm/lib/Target/AMDGPU/VOPInstructions.td

Copy file name to clipboardExpand all lines: llvm/lib/Target/AMDGPU/VOPInstructions.td
+12-10Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -526,14 +526,16 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
526526
bits<9> scale_src0;
527527
bits<9> scale_src1;
528528

529-
bits<2> scale_src0_opsel;
530-
bits<2> scale_src1_opsel;
529+
//MFMALdScaleModifierOp transforms 2 bit opsel input to 4 bit value
530+
//where opsel and opselHi are in 3rd and 4th bit.
531+
bits<4> src0_modifiers;
532+
bits<4> src1_modifiers;
531533

532534
// Inst{7-0} = unused
533535
// Inst{10-8} = neg_hi;
534536
// Inst{13-11} = op_sel
535-
let Inst{11} = scale_src0_opsel{0};
536-
let Inst{12} = scale_src1_opsel{0};
537+
let Inst{11} = src0_modifiers{2}; //opsel[0]
538+
let Inst{12} = src1_modifiers{2}; //opsel[1]
537539
// Inst{13} = unused op_sel
538540
// Inst{14} = unused op_sel_hi2
539541

@@ -542,8 +544,8 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
542544
let Inst{49-41} = scale_src1;
543545
// Inst{58-50} = unused
544546
// Inst{60-59} = op_sel_hi;
545-
let Inst{59} = scale_src0_opsel{1};
546-
let Inst{60} = scale_src1_opsel{1};
547+
let Inst{59} = src0_modifiers{3}; //opsel_hi[0]
548+
let Inst{60} = src1_modifiers{3}; //opsel_hi[1]
547549
// Inst{63-61} = neg;
548550

549551
// The high half of the encoding is the unscaled mfma op.
@@ -1437,17 +1439,17 @@ class getVOP3MAIScaledPat<VOPProfile P, SDPatternOperator node> {
14371439
// mfma
14381440
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
14391441
timm:$cbsz, timm:$blgp,
1440-
MFMALdScaleModifierOp:$scale_src0_opsel,
1442+
MFMALdScaleModifierOp:$src0_modifiers,
14411443
i32:$scale_src0,
1442-
MFMALdScaleModifierOp:$scale_src1_opsel,
1444+
MFMALdScaleModifierOp:$src1_modifiers,
14431445
i32:$scale_src1
14441446
))],
14451447
// smfmac
14461448
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx,
14471449
timm:$cbsz, timm:$abid,
1448-
MFMALdScaleModifierOp:$scale_src0_opsel,
1450+
MFMALdScaleModifierOp:$src0_modifiers,
14491451
i32:$scale_src0,
1450-
MFMALdScaleModifierOp:$scale_src1_opsel,
1452+
MFMALdScaleModifierOp:$src1_modifiers,
14511453
i32:$scale_src1))]);
14521454
}
14531455

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll

Copy file name to clipboardExpand all lines: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+16-16Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
4646
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
4747
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
4848
; GCN-NEXT: s_nop 1
49-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
49+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
5050
; GCN-NEXT: s_nop 7
5151
; GCN-NEXT: s_nop 3
5252
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -70,7 +70,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
7070
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
7171
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
7272
; GCN-NEXT: s_nop 1
73-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
73+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
7474
; GCN-NEXT: s_nop 7
7575
; GCN-NEXT: s_nop 3
7676
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -94,7 +94,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
9494
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
9595
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
9696
; GCN-NEXT: s_nop 1
97-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
97+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
9898
; GCN-NEXT: s_nop 7
9999
; GCN-NEXT: s_nop 3
100100
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -118,7 +118,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
118118
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
119119
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
120120
; GCN-NEXT: s_nop 1
121-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
121+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
122122
; GCN-NEXT: s_nop 7
123123
; GCN-NEXT: s_nop 3
124124
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -142,7 +142,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
142142
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
143143
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
144144
; GCN-NEXT: s_nop 1
145-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
145+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
146146
; GCN-NEXT: s_nop 7
147147
; GCN-NEXT: s_nop 3
148148
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -166,7 +166,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
166166
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
167167
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
168168
; GCN-NEXT: s_nop 1
169-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
169+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
170170
; GCN-NEXT: s_nop 7
171171
; GCN-NEXT: s_nop 3
172172
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -190,7 +190,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
190190
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
191191
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
192192
; GCN-NEXT: s_nop 1
193-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
193+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
194194
; GCN-NEXT: s_nop 7
195195
; GCN-NEXT: s_nop 3
196196
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1797,7 +1797,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
17971797
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
17981798
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
17991799
; GCN-NEXT: s_nop 1
1800-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
1800+
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0]
18011801
; GCN-NEXT: s_nop 7
18021802
; GCN-NEXT: s_nop 3
18031803
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1819,7 +1819,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
18191819
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
18201820
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
18211821
; SDAG-NEXT: s_nop 1
1822-
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
1822+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0]
18231823
; SDAG-NEXT: s_nop 7
18241824
; SDAG-NEXT: s_nop 3
18251825
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1837,7 +1837,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
18371837
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
18381838
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
18391839
; GISEL-NEXT: s_nop 1
1840-
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
1840+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0]
18411841
; GISEL-NEXT: s_nop 7
18421842
; GISEL-NEXT: s_nop 3
18431843
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1860,7 +1860,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
18601860
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
18611861
; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
18621862
; SDAG-NEXT: s_nop 1
1863-
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
1863+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0]
18641864
; SDAG-NEXT: s_nop 7
18651865
; SDAG-NEXT: s_nop 3
18661866
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1879,7 +1879,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
18791879
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
18801880
; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
18811881
; GISEL-NEXT: s_nop 1
1882-
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
1882+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
18831883
; GISEL-NEXT: s_nop 7
18841884
; GISEL-NEXT: s_nop 3
18851885
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1921,7 +1921,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
19211921
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
19221922
; SDAG-NEXT: v_mov_b32_e32 v17, s13
19231923
; SDAG-NEXT: s_nop 1
1924-
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2
1924+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
19251925
; SDAG-NEXT: s_nop 7
19261926
; SDAG-NEXT: s_nop 3
19271927
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
@@ -1946,7 +1946,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
19461946
; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
19471947
; GISEL-NEXT: v_mov_b32_e32 v16, s29
19481948
; GISEL-NEXT: s_nop 1
1949-
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2
1949+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
19501950
; GISEL-NEXT: v_mov_b32_e32 v0, 0
19511951
; GISEL-NEXT: s_nop 7
19521952
; GISEL-NEXT: s_nop 2
@@ -1987,7 +1987,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
19871987
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
19881988
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
19891989
; SDAG-NEXT: s_nop 1
1990-
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0]
1990+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
19911991
; SDAG-NEXT: s_nop 7
19921992
; SDAG-NEXT: s_nop 3
19931993
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
@@ -2013,7 +2013,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
20132013
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
20142014
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
20152015
; GISEL-NEXT: s_nop 1
2016-
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
2016+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
20172017
; GISEL-NEXT: v_mov_b32_e32 v0, 0
20182018
; GISEL-NEXT: s_nop 7
20192019
; GISEL-NEXT: s_nop 2

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.