From 2b79bbe52acef964b57a47c8aa1a52df0431874b Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Tue, 13 May 2025 16:04:10 +0530
Subject: [PATCH 1/6] Handle the case for gather where index is SHL

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ac4fb157a6026..4bb23ced2bc42 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56710,12 +56710,23 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
 
   if (DCI.isBeforeLegalize()) {
     unsigned IndexWidth = Index.getScalarValueSizeInBits();
-
+    // If the index is a left shift, \ComputeNumSignBits we are recomputing
+    // the number of sign bits from the shifted value. We are trying to enable
+    // the optimization in which we can shrink indices if they are larger than
+    // 32-bits. Using the existing fold techniques implemented below.
+    unsigned ComputeNumSignBits = DAG.ComputeNumSignBits(Index);
+    if (Index.getOpcode() == ISD::SHL) {
+      if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
+        if (DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
+          ComputeNumSignBits += *MinShAmt;
+        }
+      }
+    }
     // Shrink indices if they are larger than 32-bits.
     // Only do this before legalize types since v2i64 could become v2i32.
     // FIXME: We could check that the type is legal if we're after legalize
     // types, but then we would need to construct test cases where that happens.
-    if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+    if (IndexWidth > 32 && ComputeNumSignBits > (IndexWidth - 32)) {
       EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
 
       // FIXME: We could support more than just constant fold, but we need to
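Context for the first patch: the existing fold it feeds shrinks a gather/scatter index to i32 when IndexWidth > 32 and ComputeNumSignBits(Index) > IndexWidth - 32, i.e. when every lane fits in a sign-extended 32-bit value. When the index is X << ShAmt, the shift consumes sign bits, so the generic count can fall below the threshold even though X itself is narrow; the patch adds the minimum shift amount back on top of the generic count. A minimal standalone sketch of that arithmetic (plain C++, not the LLVM API; numSignBits here mimics the documented meaning of SelectionDAG::ComputeNumSignBits):

#include <cstdint>
#include <cstdio>

// Number of leading bits that equal the sign bit, sign bit included --
// the per-lane quantity that SelectionDAG::ComputeNumSignBits reports.
static unsigned numSignBits(int64_t V) {
  unsigned N = 1;
  for (int B = 62; B >= 0 && ((V >> B) & 1) == ((V >> 63) & 1); --B)
    ++N;
  return N;
}

int main() {
  int64_t X = INT32_MIN; // an i32 index sign-extended to i64: 33 sign bits
  unsigned Sh = 4;       // e.g. scaling by a 16-byte struct stride
  // The SHL node the combine sees; written as a multiply so the example
  // stays well-defined before C++20.
  int64_t Index = X * (int64_t{1} << Sh);
  printf("signbits(X)=%u signbits(X<<4)=%u\n", numSignBits(X),
         numSignBits(Index)); // prints 33 and 29
  // 29 fails the "> IndexWidth - 32 = 32" test, so the i32 shrink is
  // blocked; re-adding Sh (29 + 4 = 33) passes it again, which is what
  // "ComputeNumSignBits += *MinShAmt" restores.
  return 0;
}

One caveat worth noting: getValidMinimumShiftAmount is only the minimum across lanes, and the per-lane count clamps at 1 for heavily shifted lanes, so the recomputed sum is an estimate rather than a per-lane guarantee; patch 4 below retires this recomputation in favour of folding the shift into the gather scale.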
From 9606a43317cec4f3ca95f5e0cff3946ac38acaf1 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Wed, 14 May 2025 00:22:01 +0530
Subject: [PATCH 2/6] fix the test cases

---
 .../test/CodeGen/X86/masked_gather_scatter.ll | 211 ++++--------------
 1 file changed, 40 insertions(+), 171 deletions(-)

diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index a5c727e8df9d6..4aa906b1ae557 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4806,18 +4806,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
-; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
+; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT: retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index:
@@ -4832,44 +4823,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT: retl
 ;
-; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index:
-; X64-SKX-SMALL: # %bb.0:
-; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
-; X64-SKX-SMALL-NEXT: retq
-;
-; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index:
-; X64-SKX-LARGE: # %bb.0:
-; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
-; X64-SKX-LARGE-NEXT: retq
+; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index:
+; X64-SKX: # %bb.0:
+; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
+; X64-SKX-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
+; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-NEXT: retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index:
 ; X86-SKX: # %bb.0:
@@ -4896,18 +4858,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
-; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
+; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT: retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_offset:
@@ -4922,44 +4875,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT: retl
 ;
-; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset:
-; X64-SKX-SMALL: # %bb.0:
-; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
-; X64-SKX-SMALL-NEXT: retq
-;
-; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
-; X64-SKX-LARGE: # %bb.0:
-; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
-; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
-; X64-SKX-LARGE-NEXT: retq
+; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; X64-SKX: # %bb.0:
+; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
+; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
+; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-NEXT: retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
 ; X86-SKX: # %bb.0:
@@ -4986,25 +4910,11 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm3
-; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
-; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
-; X64-KNL-NEXT: kmovw %k2, %k3
-; X64-KNL-NEXT: vmovaps %ymm4, %ymm0
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
-; X64-KNL-NEXT: vmovaps %ymm1, %ymm5
-; X64-KNL-NEXT: kmovw %k1, %k3
-; X64-KNL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
-; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
-; X64-KNL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm2
+; X64-KNL-NEXT: kmovw %k1, %k2
+; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
 ; X64-KNL-NEXT: retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_pair:
@@ -5021,58 +4931,17 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT: retl
 ;
-; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_pair:
-; X64-SKX-SMALL: # %bb.0:
-; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm3
-; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
-; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-SMALL-NEXT: kmovw %k2, %k3
-; X64-SKX-SMALL-NEXT: vmovaps %ymm4, %ymm0
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
-; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm5
-; X64-SKX-SMALL-NEXT: kmovw %k1, %k3
-; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
-; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
-; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
-; X64-SKX-SMALL-NEXT: retq
-;
-; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_pair:
-; X64-SKX-LARGE: # %bb.0:
-; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm3
-; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
-; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
-; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
-; X64-SKX-LARGE-NEXT: vmovaps %ymm4, %ymm0
-; X64-SKX-LARGE-NEXT: kmovw %k2, %k3
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
-; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm5
-; X64-SKX-LARGE-NEXT: kmovw %k1, %k3
-; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
-; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
-; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
-; X64-SKX-LARGE-NEXT: retq
+; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:
+; X64-SKX: # %bb.0:
+; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm2
+; X64-SKX-NEXT: kmovw %k1, %k2
+; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
+; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
+; X64-SKX-NEXT: retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:
 ; X86-SKX: # %bb.0:

From c8ec5f80214b0bd47a4d1347615e05bcdf6c328d Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Wed, 14 May 2025 00:32:59 +0530
Subject: [PATCH 3/6] fix formatting issue

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 758f543a9fa3a..f6d9524a157b0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56740,7 +56740,6 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
         }
       }
     }
-
    // If the index is a left shift, \ComputeNumSignBits we are recomputing
    // the number of sign bits from the shifted value. We are trying to enable
    // the optimization in which we can shrink indices if they are larger than
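Patch 4 below changes tack: instead of inflating the sign-bit count, it relaxes the guard on the existing fold that moves a uniform left shift of the index into the instruction's scale operand (Log2ScaleAmt in the diff), and then lets the unchanged i32-shrink fold fire on the smaller index. x86 addressing caps the scale at 8, which bounds how much shift the scale can absorb. A rough standalone sketch of the net effect on the tests in this series (plain C++ with hypothetical variable names; the DAG combine itself moves one bit per iteration, this just computes the fixed point):

#include <algorithm>
#include <cstdio>

int main() {
  // test_gather_structpt_16f32_mask_index: the index is X << 4 (16-byte
  // structs) and the gather starts out with scale 1 (Log2Scale = 0).
  unsigned ShAmt = 4, Log2Scale = 0;
  // Move shift bits into the scale while the scale stays <= 8, the
  // largest scale x86 addressing supports.
  unsigned Moved = std::min(ShAmt, 3u - Log2Scale);
  printf("index shift: %u -> %u, scale: %u -> %u\n", ShAmt, ShAmt - Moved,
         1u << Log2Scale, 1u << (Log2Scale + Moved));
  // index shift: 4 -> 1, scale: 1 -> 8 -- matching the updated checks,
  // where vpslld $4 disappears and the gather uses vpaddd %zmm0,%zmm0
  // (a shift by 1 done as X + X) with ",8" in the memory operand.
  return 0;
}

The widened guard added below, (1 + Log2ScaleAmt) < 4, admits exactly this one-bit step whenever the scale can still double.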
From 91e3edffa72af4bc77819220bfcabbd9458a973b Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Wed, 14 May 2025 15:11:59 +0530
Subject: [PATCH 4/6] Remove the compute number of sign bits and allow scale
 update

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 17 ++--------
 .../test/CodeGen/X86/masked_gather_scatter.ll | 34 +++++++++++--------
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f6d9524a157b0..f1ad164e693d1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56726,7 +56726,8 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
         return SDValue(N, 0);
       }
       if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
-        if (*MinShAmt >= 1 && (*MinShAmt + Log2ScaleAmt) < 4 &&
+        if (*MinShAmt >= 1 &&
+            (((*MinShAmt + Log2ScaleAmt) < 4) || (1 + Log2ScaleAmt < 4)) &&
             DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
           SDValue ShAmt = Index.getOperand(1);
           SDValue NewShAmt =
@@ -56740,24 +56741,12 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
         }
       }
     }
-    // If the index is a left shift, \ComputeNumSignBits we are recomputing
-    // the number of sign bits from the shifted value. We are trying to enable
-    // the optimization in which we can shrink indices if they are larger than
-    // 32-bits. Using the existing fold techniques implemented below.
-    unsigned ComputeNumSignBits = DAG.ComputeNumSignBits(Index);
-    if (Index.getOpcode() == ISD::SHL) {
-      if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
-        if (DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
-          ComputeNumSignBits += *MinShAmt;
-        }
-      }
-    }
     // Shrink indices if they are larger than 32-bits.
     // Only do this before legalize types since v2i64 could become v2i32.
     // FIXME: We could check that the type is legal if we're after legalize
     // types, but then we would need to construct test cases where that happens.
-    if (IndexWidth > 32 && ComputeNumSignBits > (IndexWidth - 32)) {
+    if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
       EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
 
       // FIXME: We could support more than just constant fold, but we need to

diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 4aa906b1ae557..edafccdd98525 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4806,8 +4806,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
-; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
+; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT: retq
 ;
@@ -4828,8 +4829,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
-; X64-SKX-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
+; X64-SKX-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-SKX-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
 ; X64-SKX-NEXT: retq
 ;
@@ -4858,8 +4860,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
-; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
+; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X64-KNL-NEXT: retq
 ;
@@ -4880,8 +4883,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
-; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
+; X64-SKX-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
 ; X64-SKX-NEXT: retq
 ;
@@ -4910,11 +4914,12 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm2
+; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
 ; X64-KNL-NEXT: kmovw %k1, %k2
 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
-; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
-; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
+; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
 ; X64-KNL-NEXT: retq
 ;
 ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_pair:
@@ -4936,11 +4941,12 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
 ; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm2
+; X64-SKX-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm2
 ; X64-SKX-NEXT: kmovw %k1, %k2
 ; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
-; X64-SKX-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
-; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
+; X64-SKX-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
 ; X64-SKX-NEXT: retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:

From e0626d2f73b4b414ee730cb4312fdddc42caa038 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Sat, 17 May 2025 00:42:07 +0530
Subject: [PATCH 5/6] Fix the test cases

---
 .../test/CodeGen/X86/masked_gather_scatter.ll | 111 +++++++++++++-----
 1 file changed, 79 insertions(+), 32 deletions(-)

diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index edafccdd98525..af018d83d520e 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4807,6 +4807,7 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
@@ -4824,16 +4825,30 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT: retl
 ;
-; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index:
-; X64-SKX: # %bb.0:
-; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0
-; X64-SKX-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
-; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
-; X64-SKX-NEXT: retq
+; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index:
+; X64-SKX-SMALL: # %bb.0:
+; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT: retq
+;
+; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index:
+; X64-SKX-LARGE: # %bb.0:
+; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT: retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index:
 ; X86-SKX: # %bb.0:
@@ -4861,6 +4876,7 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
 ; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
@@ -4878,16 +4894,30 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT: retl
 ;
-; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
-; X64-SKX: # %bb.0:
-; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0
-; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
-; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
-; X64-SKX-NEXT: retq
+; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; X64-SKX-SMALL: # %bb.0:
+; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT: retq
+;
+; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; X64-SKX-LARGE: # %bb.0:
+; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
+; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT: retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
 ; X86-SKX: # %bb.0:
@@ -4915,6 +4945,7 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
 ; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
 ; X64-KNL-NEXT: kmovw %k1, %k2
 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
@@ -4936,18 +4967,34 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT: retl
 ;
-; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:
-; X64-SKX: # %bb.0:
-; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
-; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
-; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm2
-; X64-SKX-NEXT: kmovw %k1, %k2
-; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
-; X64-SKX-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
-; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
-; X64-SKX-NEXT: retq
+; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_pair:
+; X64-SKX-SMALL: # %bb.0:
+; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
+; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
+; X64-SKX-SMALL-NEXT: retq
+;
+; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_pair:
+; X64-SKX-LARGE: # %bb.0:
+; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
+; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
+; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
+; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
+; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
+; X64-SKX-LARGE-NEXT: retq
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:
 ; X86-SKX: # %bb.0:

From e87eb2d78ad5d3416c275b2d11867de74ccdd58b Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal
Date: Mon, 19 May 2025 18:20:14 +0530
Subject: [PATCH 6/6] Remove the redundant condition

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7ea3a215f3e91..003591b50a9ab 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56717,8 +56717,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
         return SDValue(N, 0);
       }
       if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
-        if (*MinShAmt >= 1 &&
-            (((*MinShAmt + Log2ScaleAmt) < 4) || (1 + Log2ScaleAmt < 4)) &&
+        if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
             DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
           SDValue ShAmt = Index.getOperand(1);
           SDValue NewShAmt =
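The final cleanup is pure boolean algebra: once *MinShAmt >= 1 holds, (*MinShAmt + Log2ScaleAmt) < 4 implies (1 + Log2ScaleAmt) < 4, so the disjunction collapses to the second test, which is just Log2ScaleAmt < 3. A throwaway exhaustive check of that equivalence over the relevant range (plain C++):

#include <cassert>

int main() {
  // The guard requires *MinShAmt >= 1; x86 scales 1,2,4,8 give log2 0..3.
  for (unsigned MinShAmt = 1; MinShAmt < 64; ++MinShAmt)
    for (unsigned L = 0; L <= 3; ++L)
      assert((((MinShAmt + L) < 4) || ((1 + L) < 4)) == (L < 3));
  return 0;
}

Log2ScaleAmt < 3 is also the natural reading of the condition: the scale can only absorb another bit of the shift while doubling it stays within x86's maximum addressing scale of 8.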