yaxunl added a comment.

I did an experiment regarding the ICF issue and it seems not to affect kernel 
stub.

  #include "hip/hip_runtime.h"
  
  template<typename T>
  void bar(T x) { }
  
  template<typename T>
  __global__ void foo(T x) {}
  
  int main() {
    foo<<<1,1>>>(1);
    printf("%p\n", foo<int>);
    printf("%p\n", foo<float>);
    printf("%p\n", bar<int>);
    printf("%p\n", bar<float>);
  }

If I pass `-Wl,/opt:noicf`, I got

  00007FF622A01100
  00007FF622A01170
  00007FF622A01360
  00007FF622A01370

By default, I got

  00007FF693521100
  00007FF693521170
  00007FF693521360
  00007FF693521360

This indicates bar<int> and bar<float> are folded but kernel stubs are not 
folded.

I also tried `-Wl,/opt:icf=10`, and kernel stubs are still not folded.

For HIP, since the kernel stub passes a unique kernel symbol to the internal 
kernel launching API, you may think the kernel stubs are not folded because 
they are not identical.

To imitate the CUDA case, where the address of kernel stub function itself is 
passed to the internal kernel launching API, I used the original patch of this 
review, where the kernel stub function passes the address of itself to the 
internal kernel launching API, therefore in a sense, the kernel stubs are all 
the same. Still, the kernel stubs are not folded.

Looking at the assembly of the kernel stub function:

  ; foo<int>
  .seh_proc "??$foo@H@@YAXH@Z"
  # %bb.0:
          pushq   %rsi
          .seh_pushreg %rsi
          pushq   %rdi
          .seh_pushreg %rdi
          subq    $120, %rsp
          .seh_stackalloc 120
          .seh_endprologue
          movl    %ecx, 60(%rsp)
          leaq    60(%rsp), %rax
          movq    %rax, 64(%rsp)
          leaq    104(%rsp), %rsi
          leaq    88(%rsp), %rdi
          leaq    80(%rsp), %r8
          leaq    72(%rsp), %r9
          movq    %rsi, %rcx
          movq    %rdi, %rdx
          callq   __hipPopCallConfiguration
          movq    80(%rsp), %rax
          movq    72(%rsp), %rcx
          movq    %rcx, 40(%rsp)
          movq    %rax, 32(%rsp)
          leaq    "??$foo@H@@YAXH@Z"(%rip), %rcx
          leaq    64(%rsp), %r9
          movq    %rsi, %rdx
          movq    %rdi, %r8
          callq   hipLaunchKernel
          nop
          addq    $120, %rsp
          popq    %rdi
          popq    %rsi
          retq
          .seh_endproc
  
  ; foo<float>
  .seh_proc "??$foo@M@@YAXM@Z"
  # %bb.0:
          pushq   %rsi
          .seh_pushreg %rsi
          pushq   %rdi
          .seh_pushreg %rdi
          subq    $120, %rsp
          .seh_stackalloc 120
          .seh_endprologue
          movss   %xmm0, 60(%rsp)
          leaq    60(%rsp), %rax
          movq    %rax, 64(%rsp)
          leaq    104(%rsp), %rsi
          leaq    88(%rsp), %rdi
          leaq    80(%rsp), %r8
          leaq    72(%rsp), %r9
          movq    %rsi, %rcx
          movq    %rdi, %rdx
          callq   __hipPopCallConfiguration
          movq    80(%rsp), %rax
          movq    72(%rsp), %rcx
          movq    %rcx, 40(%rsp)
          movq    %rax, 32(%rsp)
          leaq    "??$foo@M@@YAXM@Z"(%rip), %rcx
          leaq    64(%rsp), %r9
          movq    %rsi, %rdx
          movq    %rdi, %r8
          callq   hipLaunchKernel
          nop
          addq    $120, %rsp
          popq    %rdi
          popq    %rsi
          retq
          .seh_endproc

I think they are not folded because link.exe is smart enough to treat them as 
not identical comdat functions. I think we may stop worrying about the ICF 
foading kernel stubs.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D112492/new/

https://reviews.llvm.org/D112492

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to