Could you add testcases for this patch?

+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+       (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+   (use (and (match_dup 1) (const_int 127)))
+   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
+   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
+)
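For illustration, with a known length of 16 bytes and <sew> = 8 this pattern
would emit roughly the following straight-line sequence (the scalar and vector
register choices here are illustrative, not fixed by the pattern):

        vsetivli zero,16,e8,m8,ta,ma
        vle8.v   v8,(a1)
        vse8.v   v8,(a0)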
+
+(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+       (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+                  "vle<sew>.v %3,(%1)\;"
+                  "sub %2,%2,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("add %1,%1,%4\;"
+                  "vse<sew>.v %3,(%0)\;"
+                  "add %0,%0,%4\;"
+                  "bnez %2,0b", operands);
+  return "";
+})
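For reference, with <sew> = 8 (so the slli scaling of the element count in %4
to a byte offset is skipped) the loop emitted by this pattern would look
roughly like the following; the register names a0/a1/a2/a4 and v8 just mirror
the operand order and are illustrative:

0:      vsetvli a4,a2,e8,m8,ta,ma
        vle8.v  v8,(a1)
        sub     a2,a2,a4
        add     a1,a1,a4
        vse8.v  v8,(a0)
        add     a0,a0,a4
        bnez    a2,0b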
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000.
+(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+       (mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_scratch:P 5 "=&r"))
+   (clobber (match_scratch:P 6 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{
+  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+                  "beq %4,%2,1f\;"
+                  "add %5,%0,%2\;"
+                  "sub %6,%5,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
+                  "add %1,%1,%4\;"
+                  "vse<sew>.v %3,(%0)\;"
+                  "add %0,%0,%4\;"
+                  "bltu %0,%6,0b\;"
+                  "sub %5,%5,%0", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("srli %0,%0,%1", xop);
+    }
+  output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
+         "1:\t" "vle<sew>.v %3,(%1)\;"
+                "vse<sew>.v %3,(%0)", operands);
+  return "";
+})
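Again for <sew> = 8 (both shift adjustments skipped), the sequence emitted by
this pattern would look roughly like this, with illustrative register names;
note how the single-full-vector case branches straight to label 1:

        vsetvli a4,a2,e8,m8,ta,ma
        beq     a4,a2,1f
        add     a5,a0,a2
        sub     a6,a5,a4
0:      vle8.v  v8,(a1)
        add     a1,a1,a4
        vse8.v  v8,(a0)
        add     a0,a0,a4
        bltu    a0,a6,0b
        sub     a5,a5,a0
        vsetvli a4,a5,e8,m8,ta,ma
1:      vle8.v  v8,(a1)
        vse8.v  v8,(a0)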
I don't think testcases are necessary.

>>      considering that this code is usually memory-constrained, limit this
>>      to -O3.  ??? It would make sense to differentiate here between in-order
>>     and OOO microarchitectures.  */
>>     else if (!size_p && optimize >= 3)
>>       emit_insn (gen_cpymem_loop_fast (Pmode, vmode, dst, src, end));
>>      else
>>       emit_insn (gen_cpymem_loop (Pmode, vmode, dst, src, end));
Why not just emit the RVV pattern?
>> Just post the update for archival purposes and consider
>> it pre-approved for the trunk.

I am sorry, but I disagree; this patch was approved too fast.  It should be
well tested.
We should at least test these 2 following situations:

1. An unknown number of bytes to memcpy; the codegen should be as follows:

   vsetvli a5,a2,e8,m8,ta,ma
   vle
   vse
   bump counter
   branch

2. A known number of bytes to memcpy, where the number of bytes allows us to
   find a VLS mode to hold it.  For example, a memcpy of 16 bytes in QImode.
   Then we can use V16QImode directly, and the codegen should be:

   vsetvli zero,16,...
   vle
   vse

   Three simple instructions are enough.

This patch should be well tested with these 2 situations before being
approved, since LLVM does the same thing.  We should be able to have the same
behavior as LLVM.
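A minimal sketch of a testcase covering these two situations; the target
options, function names and scan-assembler patterns below are my assumptions,
not something taken from the patch:

/* { dg-do compile } */
/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d" } */

#include <string.h>

/* Situation 1: unknown size, expect the vsetvli/vle/vse loop with a
   counter bump and a branch.  */
void
copy_var (char *dst, const char *src, __SIZE_TYPE__ n)
{
  memcpy (dst, src, n);
}

/* Situation 2: 16 known bytes, expect a straight-line
   vsetvli/vsetivli + vle8.v + vse8.v sequence.  */
void
copy_16 (char *dst, const char *src)
{
  memcpy (dst, src, 16);
}

/* The scan patterns are guesses at the expected codegen.  */
/* { dg-final { scan-assembler {vset[i]?vli\s+zero,16} } } */
/* { dg-final { scan-assembler {vle8\.v} } } */
/* { dg-final { scan-assembler {vse8\.v} } } */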


juzhe.zh...@rivai.ai
