Issue 171969
Summary Popcount loop doesn't get folded on some targets
Labels missed-optimization
Assignees
Reporter koachan
    Using the example adapted from [this article](https://xania.org/202512/11-pop-goes-the-weasel-er-count), it seems that the population-count loop is not folded into a single popcount instruction on some targets, even when I tell clang that the target has a hardware instruction for it:

```c
// SPARC64: clang -O2 -mpopc
// MIPS64:  clang -O2 -march=octeon
// X86-64:  clang -O2 -mpopcnt
unsigned population_count_loop(long long value) {
  unsigned result = 0;
  while (value) {
    value &= value - 1;
    ++result;
  }
 return result;
}
```

At least sparc64 and mips64 are affected (I haven't checked other targets, so more could be affected):
```
! SPARC64
population_count_loop:
        brz %o0, .LBB1_2
        mov %g0, %o1
.LBB1_1:
        add %o0, -1, %o2
        and %o2, %o0, %o0
 brnz %o0, .LBB1_1
        add %o1, 1, %o1
.LBB1_2:
        retl
 srl %o1, 0, %o0
```
```
! MIPS64
population_count_loop:
.Lfunc_begin1 = .Ltmp3
        daddiu  $sp, $sp, -16
        sd      $ra, 8($sp)
 sd      $fp, 0($sp)
        move    $fp, $sp
        beqz    $4, .LBB1_2
 addiu   $2, $zero, 0
.LBB1_1:
        daddiu  $1, $4, -1
        and $4, $1, $4
        bnez    $4, .LBB1_1
        addiu   $2, $2, 1
.LBB1_2:
        sll     $2, $2, 0
        move    $sp, $fp
        ld $fp, 0($sp)
        ld      $ra, 8($sp)
        jr      $ra
 daddiu  $sp, $sp, 16
```

On the other hand, the builtin is compiled down to the hardware instruction as expected:
```c
unsigned population_count_builtin(long long value) {
  return __builtin_popcountll(value);
}
```
```
! SPARC64
population_count_builtin:
        retl
        popc %o0, %o0
```
```
! MIPS64
population_count_builtin:
.Lfunc_begin0 = .Ltmp0
 daddiu  $sp, $sp, -16
        sd      $ra, 8($sp)
        sd $fp, 0($sp)
        move    $fp, $sp
        dpop    $2, $4
        move $sp, $fp
        ld      $fp, 0($sp)
        ld      $ra, 8($sp)
 jr      $ra
        daddiu  $sp, $sp, 16
```

For reference, when targeting x86-64, both functions compile down to the popcount instruction:
```
! X86-64
population_count_builtin:
        popcnt  rax, rdi
        ret

population_count_loop:
        popcnt  rax, rdi
 ret
```

Generally, I'd expect such optimizations to be performed in a target-independent manner.
[Godbolt link](https://godbolt.org/z/d66zdzoMz).
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to