Issue 92211
Summary Spurious optimization triggered by a `zext i16 %0 to i64` but not `and i64 %0, 65535`
Labels new issue
Assignees
Reporter Validark
I define the following two Zig functions (https://zig.godbolt.org/z/3xfc5bjEc):

```zig
export fn foo(x: u64) u64 {
    var y: u64 = @as(u16, @truncate(x));
    y = (y | (y << 24));
    y = (y | (y << 12));
    return y;
}

export fn bar(x: u16) u64 {
    var y: u64 = x;
    y = (y | (y << 24));
    y = (y | (y << 12));
    return y;
}
```
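
Both functions are intended to compute exactly the same value; `foo` just masks its 64-bit argument down to the low 16 bits first. As a sanity check, here is a minimal brute-force sketch (assuming it lives in the same file as the functions above, and using the newer single-argument `@intCast` syntax) confirming that they agree on every 16-bit input:

```zig
const std = @import("std");

// Sketch: exhaustively check that foo and bar agree on every 16-bit input,
// i.e. the only difference between them is how the low 16 bits arrive.
test "foo and bar agree for all 16-bit inputs" {
    var x: u32 = 0;
    while (x <= 0xffff) : (x += 1) {
        const v: u16 = @intCast(x);
        try std.testing.expectEqual(foo(v), bar(v));
    }
}
```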

Compiling for Neoverse N2, I get:

```asm
foo:
        and     x8, x0, #0xffff
        orr     x8, x8, x8, lsl #24
        orr     x0, x8, x8, lsl #12
        ret

bar:
        mov     w8, w0
        ubfiz   x10, x0, #12, #32
        orr     x9, x8, x8, lsl #24
        orr     x8, x10, x8, lsl #36
        orr     x0, x8, x9
        ret
```

Here is the LLVM IR emitted by the Zig compiler:

```llvm
define dso_local i64 @foo(i64 %0) local_unnamed_addr {
Entry:
  %1 = and i64 %0, 65535
  %2 = mul nuw nsw i64 %1, 16777217
  %3 = mul nuw nsw i64 %1, 68719480832
  %4 = or i64 %3, %2
  ret i64 %4
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

define dso_local i64 @bar(i16 zeroext %0) local_unnamed_addr {
Entry:
  %1 = zext i16 %0 to i64
  %2 = mul nuw nsw i64 %1, 16777217
  %3 = mul nuw nsw i64 %1, 68719480832
  %4 = or i64 %3, %2
  ret i64 %4
}
```
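
In the IR above, each pair of disjoint shifted copies has already been folded into a multiply: 16777217 is `1 + (1 << 24)` and 68719480832 is `(1 << 12) + (1 << 36)`, so `%3` is simply `%2` shifted left by 12. A small sketch of that arithmetic (same caveats as the test above):

```zig
const std = @import("std");

// Sketch: check the constants in the IR. For a 16-bit y the shifted copies
// within each pair never overlap, so `or` behaves like `add`:
//   y | (y << 24)          == y * (1 + (1 << 24))         == y * 16777217
//   (y << 12) | (y << 36)  == y * ((1 << 12) + (1 << 36)) == y * 68719480832
// and the second product is just the first one shifted left by 12.
test "or-of-disjoint-shifts matches the multiplies in the IR" {
    var x: u32 = 0;
    while (x <= 0xffff) : (x += 1) {
        const y: u64 = x;
        const p1 = y * 16777217;
        const p2 = y * 68719480832;
        try std.testing.expectEqual(y | (y << 24), p1);
        try std.testing.expectEqual((y << 12) | (y << 36), p2);
        try std.testing.expectEqual(p1 << 12, p2);
    }
}
```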

Here is the LLVM IR produced by Clang for "equivalent" C code:

```llvm
define dso_local range(i64 0, 4503599627370496) i64 @foo(i64 noundef %x) local_unnamed_addr {
entry:
  %and = and i64 %x, 65535
  %or = mul nuw nsw i64 %and, 16777217
  %shl1 = mul nuw nsw i64 %and, 68719480832
  %or2 = or i64 %shl1, %or
  ret i64 %or2
}

define dso_local range(i64 0, 4503599627370496) i64 @bar(i16 noundef %x) local_unnamed_addr {
entry:
  %conv = zext i16 %x to i64
  %or = mul nuw nsw i64 %conv, 16777217
  %shl1 = mul nuw nsw i64 %conv, 68719480832
  %or2 = or i64 %shl1, %or
  ret i64 %or2
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1
```

And the assembly:

```asm
foo: // @foo
        and     x8, x0, #0xffff
        orr     x8, x8, x8, lsl #24
        orr     x0, x8, x8, lsl #12
        ret
bar:                                    // @bar
        and     x8, x0, #0xffff
        orr     x9, x8, x8, lsl #24
        lsl     x8, x8, #12
        bfi     x8, x0, #36, #16
        orr     x0, x8, x9
        ret
```

On x86, compiling for Zen 4, I get:

```asm
foo:
        movzx   ecx, di
        movabs  rax, 68719480832
        imul    rax, rcx
        mov     rdx, rcx
        shl     rdx, 24
        or      rdx, rcx
        or      rax, rdx
        ret

bar:
        movabs  rax, 68719480832
        mov     ecx, edi
        mov     rdx, rcx
        shl     rdx, 24
        imul    rax, rcx
        or      rdx, rcx
        or      rax, rdx
        ret
```

Is LLVM deciding that a multiply is less expensive than a `shl`, even though the second product is just the first one shifted left by 12? Also, why can't we use `shlx` here? I would think we could do something along these lines (with the caveat that `shlx` takes its shift count in a register, not an immediate):

```asm
        shlx    rax, rdi, 24
        or      rdi, rax
        shlx    rax, rdi, 12
        or      rax, rdi
```