[GISel] Funnel shift combiner port from SelectionDAG ISel to GlobalISel #135132

Status: Open. Wants to merge 7 commits into base branch main.
funnel shift combiner port from SelectionDAG ISel to GlobalISel
axelcool1234 committed Apr 12, 2025
commit 733135ea6efc0cb336a4f06bd8d07262a25590ef
llvm/include/llvm/Target/GlobalISel/Combine.td (21 additions, 1 deletion)
@@ -1033,6 +1033,24 @@ def funnel_shift_overshift: GICombineRule<
(apply [{ Helper.applyFunnelShiftConstantModulo(*${root}); }])
>;

// Transform: fshl x, z, y | shl x, y -> fshl x, z, y
def funnel_shift_or_shift_to_funnel_shift_left: GICombineRule<
(defs root:$root),
(match (G_FSHL $out1, $x, $z, $y),
(G_SHL $out2, $x, $y),
(G_OR $root, $out1, $out2)),
(apply (G_FSHL $root, $x, $z, $y))
>;

// Transform: fshr z, x, y | srl x, y -> fshr z, x, y
def funnel_shift_or_shift_to_funnel_shift_right: GICombineRule<
(defs root:$root),
(match (G_FSHR $out1, $z, $x, $y),
(G_LSHR $out2, $x, $y),
(G_OR $root, $out1, $out2)),
(apply (G_FSHR $root, $z, $x, $y))
>;

def rotate_out_of_range : GICombineRule<
(defs root:$root),
(match (wip_match_opcode G_ROTR, G_ROTL):$root,
@@ -1105,7 +1123,9 @@ def funnel_shift_combines : GICombineGroup<[funnel_shift_from_or_shift,
funnel_shift_to_rotate,
funnel_shift_right_zero,
funnel_shift_left_zero,
funnel_shift_overshift]>;
funnel_shift_overshift,
funnel_shift_or_shift_to_funnel_shift_left,
funnel_shift_or_shift_to_funnel_shift_right]>;

def bitfield_extract_from_sext_inreg : GICombineRule<
(defs root:$root, build_fn_matchinfo:$info),
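The two new rules rely on an absorption identity: for shift amounts below the bit width, every bit produced by `shl x, y` (respectively `lshr x, y`) is already present in `fshl x, z, y` (respectively `fshr z, x, y`), so the OR contributes nothing. A small Python model of the i32 semantics (illustrative only, not part of the patch) can sanity-check this:

```python
import random

N = 32
MASK = (1 << N) - 1

def fshl(x, z, s):
    # LLVM fshl semantics: high N bits of the 2N-bit concat (x:z) shifted left by s % N
    s %= N
    return ((x << s) | (z >> (N - s)) if s else x) & MASK

def fshr(z, x, s):
    # LLVM fshr semantics: low N bits of the 2N-bit concat (z:x) shifted right by s % N
    s %= N
    return ((z << (N - s)) | (x >> s) if s else x) & MASK

random.seed(0)
for _ in range(1000):
    x, z = random.getrandbits(N), random.getrandbits(N)
    y = random.randrange(N)  # keep shl/lshr amounts in range (out-of-range is poison in LLVM IR)
    # fshl x, z, y | shl x, y  ==  fshl x, z, y
    assert fshl(x, z, y) | ((x << y) & MASK) == fshl(x, z, y)
    # fshr z, x, y | lshr x, y  ==  fshr z, x, y
    assert fshr(z, x, y) | (x >> y) == fshr(z, x, y)
print("absorption identities hold for 1000 random i32 cases")
```

This only checks the scalar i32 case; the TableGen rules above are what actually perform the rewrite in GlobalISel.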
llvm/test/CodeGen/AArch64/funnel-shift.ll (5 additions, 7 deletions)
@@ -674,14 +674,12 @@ define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) {
; CHECK-GI-LABEL: or_shl_fshl_simplify:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #31 // =0x1f
; CHECK-GI-NEXT: and w9, w2, #0x1f
; CHECK-GI-NEXT: lsr w10, w0, #1
; CHECK-GI-NEXT: lsl w11, w1, w2
; CHECK-GI-NEXT: lsr w9, w0, #1
; CHECK-GI-NEXT: and w10, w2, #0x1f
; CHECK-GI-NEXT: bic w8, w8, w2
; CHECK-GI-NEXT: lsl w9, w1, w9
; CHECK-GI-NEXT: lsr w8, w10, w8
; CHECK-GI-NEXT: orr w9, w9, w11
; CHECK-GI-NEXT: orr w0, w9, w8
; CHECK-GI-NEXT: lsl w10, w1, w10
; CHECK-GI-NEXT: lsr w8, w9, w8
; CHECK-GI-NEXT: orr w0, w10, w8
; CHECK-GI-NEXT: ret
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s)
llvm/test/CodeGen/RISCV/GlobalISel/shift.ll (50 additions)
@@ -105,3 +105,53 @@ define i16 @test_shl_i48_2(i48 %x, i48 %y) {
%trunc = trunc i48 %shl to i16
ret i16 %trunc
}

define i16 @test_fshl_i32(i32 %x, i32 %_, i32 %y) {
Member commented:

Could you create two commits for this PR: the first containing only the new test functions you added (e.g. test_fshl_i32 here), but with CHECK lines showing the unoptimized assembly; and the second being your changes, in which the CHECK lines are updated with the optimized code.
That way, it's easier to see the changes made by your patch by comparing the differences in the CHECK lines between the two commits. The first commit is also known as a "pre-commit test".

axelcool1234 (Contributor, Author) commented on May 3, 2025:

> Please refrain from force pushing unless it's necessary: https://www.llvm.org/docs/GitHub.html#rebasing-pull-requests-and-force-pushes. Please append new commits instead, as they're all going to be squashed into one at the end.

Woops! I apologize, I can't believe I forgot this.

> Could you create two commits for this PR: the first one contains only the new test functions you added (e.g. test_fshl_i32 here) but with CHECK lines showing the unoptimized assembly code, and the second commit being your changes, in which the CHECK lines are updated with optimized code. That way, it's easier to see the changes made by your patch by comparing the differences of the CHECK lines between the two commits. The first commit is also known as a "pre-commit test".

I have pushed two commits following this style, although I'll probably have to do it again once I introduce vector tests, provide better names for the combiner and tests, and (hopefully) figure out why the AArch64 fshr test doesn't trigger the combiner (I noted in another review thread on this PR that it's likely due to the OR instruction not matching when its operands are flipped, which is strange since they should commute).
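As a side note on the suspected flipped-operand issue: OR commutes, so a matcher that only tries one operand order will miss `(G_OR shl, fshl)` even though it would match `(G_OR fshl, shl)`. A toy Python sketch of an order-insensitive match (purely illustrative; this is not the GlobalISel matcher API):

```python
def match_commutative_or(or_operands, is_fsh, is_shift):
    """Match (G_OR a, b) where one operand satisfies is_fsh and the other
    is_shift, trying both operand orders since OR is commutative.
    Illustrative only; real GISel patterns handle commutativity differently."""
    a, b = or_operands
    if is_fsh(a) and is_shift(b):
        return a, b
    if is_fsh(b) and is_shift(a):
        return b, a
    return None  # neither order matches
```

If the real matcher only covers the first order, a G_OR built with the shift as the left operand would slip through, which would be consistent with the behavior described above.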

Member commented:

(Not for this patch.) Next time, you could make the first commit the pre-commit test, followed by your actual patch, so that you don't have to comment and un-comment like you did here.

; RV32-LABEL: test_fshl_i32:
; RV32: # %bb.0:
; RV32-NEXT: not a3, a2
; RV32-NEXT: sll a0, a0, a2
; RV32-NEXT: srli a1, a1, 1
; RV32-NEXT: srl a1, a1, a3
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: test_fshl_i32:
; RV64: # %bb.0:
; RV64-NEXT: not a3, a2
; RV64-NEXT: sllw a0, a0, a2
; RV64-NEXT: srliw a1, a1, 1
; RV64-NEXT: srlw a1, a1, a3
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: ret
%fshl = call i32 @llvm.fshl.i32(i32 %x, i32 %_, i32 %y)
%shl = shl i32 %x, %y
%or = or i32 %fshl, %shl
%trunc = trunc i32 %or to i16
ret i16 %trunc
}

define i16 @test_fshr_i32(i32 %_, i32 %x, i32 %y) {
; RV32-LABEL: test_fshr_i32:
; RV32: # %bb.0:
; RV32-NEXT: not a3, a2
; RV32-NEXT: slli a0, a0, 1
; RV32-NEXT: sll a0, a0, a3
; RV32-NEXT: srl a1, a1, a2
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: test_fshr_i32:
; RV64: # %bb.0:
; RV64-NEXT: not a3, a2
; RV64-NEXT: slli a0, a0, 1
; RV64-NEXT: sllw a0, a0, a3
; RV64-NEXT: srlw a1, a1, a2
; RV64-NEXT: or a0, a0, a1
; RV64-NEXT: ret
%fshr = call i32 @llvm.fshr.i32(i32 %_, i32 %x, i32 %y)
%lshr = lshr i32 %x, %y
%or = or i32 %fshr, %lshr
%trunc = trunc i32 %or to i16
ret i16 %trunc
}
Contributor commented:

Vector version of tests? Also negative multi-use tests. Ideally would share the existing DAG combine tests here instead of creating new copies

axelcool1234 (Contributor, Author) replied:

> Ideally would share the existing DAG combine tests here instead of creating new copies

The issue I ran into has to do with the i48 type leading to a lot of code being generated to emulate @llvm.fshl.i48 since it isn't natively supported. The resulting expected test output would be really large. Should I still go for it?

> Also negative multi-use tests.

Is this referring to the fact that I'm truncating, or something else? I apologize, I'm still getting the hang of contributing here (and thanks for reviewing my patch!)

> Vector version of tests?

Is there a common vector size that many contributors use for vector versions of tests? I'll make sure to implement that :)

arsenm (Contributor) commented on Apr 13, 2025:

> The issue I ran into has to do with the i48 type leading to a lot of code being generated to emulate @llvm.fshl.i48 since it isn't natively supported. The resulting expected test output would be really large. Should I still go for it?

That's going to be true in either case. As long as it actually compiles, it's not a problem and gives a point of reference for whenever the code improves for illegal types.

> Also negative multi-use tests.
> Is this referring to the fact that I'm truncating or something else? I apologize, I'm still getting the hang of contributing here (and thanks for reviewing my patch!)

I mean where the intermediate instructions have unrelated uses and need to be emitted anyway

> Is there a common vector size that many contributors use for vector versions of tests? I'll make sure to implement that :)

No, whatever is relevant for the target (you also wouldn't need to think too hard about that if reusing the original test, which I hope covers some vectors)

axelcool1234 (Contributor, Author) commented on Apr 16, 2025:

> you also wouldn't need to think too hard about that if reusing the original test, which I hope covers some vectors

There aren't any vector tests here, but I'll gladly add some myself!

> That's going to be true in either case. As long as it actually compiles, it's not a problem and gives a point of reference for whenever the code improves for illegal types.

Before I go forward and introduce i48 funnel shift calls to the original tests (in order to also test multi-use, i.e. whether the combiner correctly ensures the intermediate instructions are still emitted), I want to further clarify what I meant. Below is one test that uses an i32 fshl and another that uses an i48 fshl. The difference in generated code (via update_llc_test_checks) is quite large; is this something I should be concerned about? Once I introduce these i48 funnel shift calls to the original tests, their emitted code checks will also explode in size. One way to avoid this (if that's desirable, of course) would be to change the original tests to use i32 instead of i48. What do you think?

define i32 @test_fshl_i32(i32 %x, i32 %y) {
; RV32-LABEL: test_fshl_i32:
; RV32:       # %bb.0:
; RV32-NEXT:    not a2, a1
; RV32-NEXT:    sll a0, a0, a1
; RV32-NEXT:    li a1, 8
; RV32-NEXT:    srl a1, a1, a2
; RV32-NEXT:    or a0, a0, a1
; RV32-NEXT:    ret
;
; RV64-LABEL: test_fshl_i32:
; RV64:       # %bb.0:
; RV64-NEXT:    not a2, a1
; RV64-NEXT:    sllw a0, a0, a1
; RV64-NEXT:    li a1, 8
; RV64-NEXT:    srlw a1, a1, a2
; RV64-NEXT:    or a0, a0, a1
; RV64-NEXT:    ret
  %fshl = call i32 @llvm.fshl.i32(i32 %x, i32 16, i32 %y)
  ret i32 %fshl
}

define i48 @test_fshl_i48(i48 %x, i48 %y) {
; RV32-LABEL: test_fshl_i48:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s2, 0(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    .cfi_offset s1, -12
; RV32-NEXT:    .cfi_offset s2, -16
; RV32-NEXT:    mv s1, a0
; RV32-NEXT:    mv s0, a1
; RV32-NEXT:    mv a0, a2
; RV32-NEXT:    li s2, 47
; RV32-NEXT:    slli a1, a3, 16
; RV32-NEXT:    srli a1, a1, 16
; RV32-NEXT:    li a2, 48
; RV32-NEXT:    li a3, 0
; RV32-NEXT:    call __umoddi3
; RV32-NEXT:    li a2, 32
; RV32-NEXT:    bltu a0, a2, .LBB2_2
; RV32-NEXT:  # %bb.1:
; RV32-NEXT:    li a1, 0
; RV32-NEXT:    sll a4, s1, a0
; RV32-NEXT:    sub a3, s2, a0
; RV32-NEXT:    bnez a0, .LBB2_3
; RV32-NEXT:    j .LBB2_4
; RV32-NEXT:  .LBB2_2:
; RV32-NEXT:    sll a1, s1, a0
; RV32-NEXT:    neg a3, a0
; RV32-NEXT:    srl a3, s1, a3
; RV32-NEXT:    sll a4, s0, a0
; RV32-NEXT:    or a4, a3, a4
; RV32-NEXT:    sub a3, s2, a0
; RV32-NEXT:    beqz a0, .LBB2_4
; RV32-NEXT:  .LBB2_3:
; RV32-NEXT:    mv s0, a4
; RV32-NEXT:  .LBB2_4:
; RV32-NEXT:    li a0, 8
; RV32-NEXT:    bltu a3, a2, .LBB2_6
; RV32-NEXT:  # %bb.5:
; RV32-NEXT:    li a2, 0
; RV32-NEXT:    bnez a3, .LBB2_7
; RV32-NEXT:    j .LBB2_8
; RV32-NEXT:  .LBB2_6:
; RV32-NEXT:    srl a2, a0, a3
; RV32-NEXT:    beqz a3, .LBB2_8
; RV32-NEXT:  .LBB2_7:
; RV32-NEXT:    mv a0, a2
; RV32-NEXT:  .LBB2_8:
; RV32-NEXT:    or a0, a1, a0
; RV32-NEXT:    mv a1, s0
; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s2, 0(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore ra
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    .cfi_restore s1
; RV32-NEXT:    .cfi_restore s2
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: test_fshl_i48:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -32
; RV64-NEXT:    .cfi_def_cfa_offset 32
; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    .cfi_offset s1, -24
; RV64-NEXT:    mv s0, a0
; RV64-NEXT:    li s1, 47
; RV64-NEXT:    slli a0, a1, 16
; RV64-NEXT:    srli a0, a0, 16
; RV64-NEXT:    li a1, 48
; RV64-NEXT:    call __umoddi3
; RV64-NEXT:    subw s1, s1, a0
; RV64-NEXT:    sll a0, s0, a0
; RV64-NEXT:    li a1, 8
; RV64-NEXT:    srl a1, a1, s1
; RV64-NEXT:    or a0, a0, a1
; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore ra
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    .cfi_restore s1
; RV64-NEXT:    addi sp, sp, 32
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %fshl = call i48 @llvm.fshl.i48(i48 %x, i48 16, i48 %y)
  ret i48 %fshl
}

mshockwave (Member) commented on Apr 20, 2025:

Personally I don't think you have to test i48, which is an illegal type for RISC-V, because the majority of the instructions you saw in the i48 case are generated by the legalizer, which is unrelated to your patch. So for the purpose of testing your pattern, it'll be clearer and more succinct to include only legal types (i.e. i32 and i64).

> Also negative multi-use tests.

I think this relates to a check that is potentially missing from your pattern: ideally, the pattern would replace three instructions

(G_FSHR $out1, $z, $x, $y),
(G_LSHR $out2, $x, $y),
(G_OR $root, $out1, $out2)

into just one:

(G_FSHR $root, $z, $x, $y)

But if $out2 or $out1 has more than one user, then you cannot remove the intermediate G_LSHR or G_FSHR, respectively. In that case you have to emit the following sequence to ensure correctness:

(G_FSHR $out1, $z, $x, $y),
(G_LSHR $out2, $x, $y),
(G_FSHR $root, $z, $x, $y)

But as you might already have noticed, this "new" sequence of 3 instructions is no better than the original sequence, which also has 3 instructions (on most architectures, G_FSHR will not be faster than G_OR). So when $out2 or $out1 has more than one user, it makes more sense not to do any optimization at all. I don't think your current pattern conditionally triggers based on the number of users of intermediate values. You can check out how other patterns handle it (hint: you don't necessarily need to check the number of $out1's users; if $out2 only has a single use, you could just replace all uses of $root with $out1). Guarding an optimization pattern with checks on whether the intermediate values have one use is a pretty common thing among many optimizations. If you want to learn more, you can search for hasOneUse or m_OneUse in DAGCombiner to see some examples.

Finally, a negative test is a test that makes sure your optimization does not trigger under certain conditions. In your case, it would be an input sequence with a G_FSHR or G_LSHR that has more than one user, and the CHECK lines should confirm that it is not optimized.
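The one-use guard described above can be modeled in a few lines. This toy Python sketch (hypothetical names, not the actual CombinerHelper API) captures the decision: fold only when the intermediate shift's sole user is the G_OR, and otherwise bail out:

```python
def combine_or_of_fshr_lshr(root_or, use_counts):
    """root_or models ('G_OR', fshr_val, lshr_val); use_counts maps a value
    to its number of users. Returns the value that should replace the G_OR,
    or None when the combine must not fire. Toy model for illustration only."""
    _, fshr_val, lshr_val = root_or
    if use_counts[lshr_val] != 1:
        # Multi-use: the G_LSHR must be emitted anyway, so replacing the
        # G_OR with another G_FSHR would not shrink the sequence.
        return None
    # Single use: the OR is redundant; replace all uses of $root with $out1.
    return fshr_val
```

The same shape applies to the fshl/shl rule with G_SHL in place of G_LSHR.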

axelcool1234 (Contributor, Author) commented on May 3, 2025:

I apologize for my two-week delay; I got severely sick in that time. I'm alright now though :)

I have changed the combiner to check hasOneUse on $out2, and I now replace uses of $root with $out1 when $out2 only has one use. Additionally, I have changed the original tests test_lshr_i48 and test_shl_i48 to test_lshr_i32 and test_shl_i32; they now serve as negative multi-use tests for the combiner. I have yet to write vector tests; that's the next thing on my list.

I have left two of the original tests untouched and in their i48 form because they both include FIXME messages for the code that's generated.
