[llvm-bugs] [Bug 90421] [NVPTX] LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32 on targets between SM30 and SM70

LLVM Bugs via llvm-bugs Sun, 28 Apr 2024 15:30:18 -0700

Issue	90421
Summary	[NVPTX] LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32 on targets between SM30 and SM70
Labels	new issue
Assignees
Reporter	sasha0552

```
LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.      Program arguments: llvm-5e5a22ca-ubuntu-x64/bin/llc sum.ll -march=nvptx64 -mcpu sm_60 -mtriple=nvptx64-nvidia-cuda -o sum60.ptx
1.      Running pass 'Function Pass Manager' on module 'sum.ll'.
2.      Running pass 'NVPTX DAG->DAG Pattern Instruction Selection' on function '@test_sum_kernel_'
 #0 0x000000000201c9c7 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x201c9c7)
 #1 0x000000000201a47e llvm::sys::RunSignalHandlers() (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x201a47e)
 #2 0x000000000201d09f SignalHandler(int) Signals.cpp:0:0
 #3 0x00007fc72aa94420 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x14420)
 #4 0x00007fc72a55700b raise /build/glibc-wuryBv/glibc-2.31/signal/../sysdeps/unix/sysv/linux/raise.c:51:1
 #5 0x00007fc72a536859 abort /build/glibc-wuryBv/glibc-2.31/stdlib/abort.c:81:7
 #6 0x0000000001f9d190 llvm::report_fatal_error(llvm::Twine const&, bool) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1f9d190)
 #7 0x0000000001e360dd llvm::SelectionDAGISel::CannotYetSelect(llvm::SDNode*) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e360dd)
 #8 0x0000000001e351ab llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e351ab)
 #9 0x0000000001e29b15 llvm::SelectionDAGISel::DoInstructionSelection() (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e29b15)
#10 0x0000000001e28a05 llvm::SelectionDAGISel::CodeGenAndEmitDAG() (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e28a05)
#11 0x0000000001e26446 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e26446)
#12 0x0000000001e22cff llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e22cff)
#13 0x0000000001497ebf llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1497ebf)
#14 0x000000000198e5af llvm::FPPassManager::runOnFunction(llvm::Function&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x198e5af)
#15 0x00000000019950f1 llvm::FPPassManager::runOnModule(llvm::Module&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x19950f1)
#16 0x000000000198ecb1 llvm::legacy::PassManagerImpl::run(llvm::Module&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x198ecb1)
#17 0x00000000006eac2e compileModule(char**, llvm::LLVMContext&) llc.cpp:0:0
#18 0x00000000006e8a6d main (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x6e8a6d)
#19 0x00007fc72a538083 __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:342:3
#20 0x00000000006e7b2e _start (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x6e7b2e)
Aborted (core dumped)
```


<details>
<summary>sum.ll</summary>

```
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

@printfFormat_0 = internal constant [32 x i8] c"pid (%u, %u, %u) idx (%1u)%s%f\0A\00"
@printfPrefix_0 = internal constant [6 x i8] c" sum: "
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]

; Function Attrs: nofree nounwind
declare noundef i32 @vprintf(ptr nocapture noundef readonly, ptr noundef) local_unnamed_addr #0

define void @test_sum_kernel_() local_unnamed_addr !dbg !7 {
  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
  %2 = lshr i32 %1, 1, !dbg !10
  %3 = and i32 %2, 1, !dbg !10
  %4 = and i32 %1, 1, !dbg !10
 %5 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 1, i32 31), !dbg !11
  %6 = bitcast i32 %5 to float, !dbg !11
  %7 = fadd float %6, 0.000000e+00, !dbg !15
  %8 = zext nneg i32 %3 to i64, !dbg !10
  %9 = getelementptr float, ptr addrspace(3) @global_smem, i64 %8, !dbg !10
  %10 = insertelement <1 x float> undef, float %7, i64 0, !dbg !10
  store <1 x float> %10, ptr addrspace(3) %9, align 4, !dbg !10
 %11 = zext nneg i32 %4 to i64, !dbg !10
  %12 = getelementptr float, ptr addrspace(3) @global_smem, i64 %11, !dbg !10
  %13 = load float, ptr addrspace(3) %12, align 4, !dbg !10
  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !10
  %15 = tail call i32 asm "mov.u32 $0, %ctaid.y;", "=r"() #4, !dbg !10
  %16 = tail call i32 asm "mov.u32 $0, %ctaid.z;", "=r"() #4, !dbg !10
  %17 = fpext float %13 to double
  %18 = alloca { i32, i32, i32, i32, ptr, double }, align 8
 store i32 %14, ptr %18, align 8
  %19 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 1
  store i32 %15, ptr %19, align 4
  %20 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 2
  store i32 %16, ptr %20, align 8
  %21 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 3
  store i32 %4, ptr %21, align 4
  %22 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 4
  store ptr @printfPrefix_0, ptr %22, align 8
  %23 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 5
  store double %17, ptr %23, align 8
  %24 = call i32 @vprintf(ptr nonnull @printfFormat_0, ptr nonnull %18)
  ret void, !dbg !19
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1

; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2

attributes #0 = { nofree nounwind }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #3 = { convergent nocallback nounwind }
attributes #4 = { nounwind }

!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
!nvvm.annotations = !{!4, !5, !5, !4}
!llvm.ident = !{!6}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!3 = !DIFile(filename: "trittest.py", directory: "/tmp")
!4 = !{ptr @test_sum_kernel_, !"kernel", i32 1}
!5 = !{ptr @test_sum_kernel_, !"maxntidx", i32 128}
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!7 = distinct !DISubprogram(name: "test_sum_kernel_", linkageName: "test_sum_kernel_", scope: !3, file: !3, line: 13, type: !8, scopeLine: 13, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
!9 = !{}
!10 = !DILocation(line: 16, column: 28, scope: !7)
!11 = !DILocation(line: 243, column: 36, scope: !12, inlinedAt: !14)
!12 = distinct !DILexicalBlockFile(scope: !7, file: !13, discriminator: 0)
!13 = !DIFile(filename: "standard.py", directory: "/mnt/ml/vllm/venv/lib/python3.11/site-packages/triton/language")
!14 = !DILocation(line: 15, column: 18, scope: !12)
!15 = !DILocation(line: 233, column: 15, scope: !16, inlinedAt: !17)
!16 = distinct !DILexicalBlockFile(scope: !12, file: !13, discriminator: 0)
!17 = !DILocation(line: 243, column: 36, scope: !16, inlinedAt: !18)
!18 = !DILocation(line: 15, column: 18, scope: !16)
!19 = !DILocation(line: 16, column: 4, scope: !7)
```
</details>

[LLVM build](https://tritonlang.blob.core.windows.net/llvm-builds/llvm-5e5a22ca-ubuntu-x64.tar.gz) @ https://github.com/llvm/llvm-project/tree/5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372

<details>
<summary>LLVM version</summary>

```
LLVM (http://llvm.org/):
  LLVM version 18.0.0git
  Optimized build with assertions.
  Default target: x86_64-unknown-linux-gnu
  Host CPU: znver3

  Registered Targets:
    amdgcn  - AMD GCN GPUs
    nvptx   - NVIDIA PTX 32-bit
    nvptx64 - NVIDIA PTX 64-bit
    r600    - AMD GPUs HD2XXX-HD6XXX
    x86     - 32-bit X86: Pentium-Pro and above
    x86-64  - 64-bit X86: EM64T and AMD64
```
</details>

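The same input compiles successfully when I specify either of the following targets instead; it fails only for targets in between (e.g. `sm_60`, as in the command above):
1. SM30
2. SM70+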

https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync
According to the PTX documentation linked above, `shfl.sync` should be supported on SM30 and newer.

Related: vllm-project/vllm#4438

_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

[llvm-bugs] [Bug 90421] [NVPTX] LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32 on targets between SM30 and SM70

Reply via email to