Issue 170883
Summary ARM vbfdotq_laneq_f32 not generating indexed bfdot
Labels new issue
Assignees
Reporter dsharlet
Reproducer: https://godbolt.org/z/vdzs9nYWb

It's short enough to reproduce here, input:
```
#include <arm_neon.h>

// Reproducer for the `laneq` variant: the lane operand `b` is a full
// bfloat16x8_t. Starting from a zero accumulator, it chains four
// vbfdotq_laneq_f32 calls over the same `a`, selecting lanes 0..3 of `b`.
// The report expects each call to lower to one indexed bfdot; the quoted
// output instead shows a dup of the lane followed by a non-indexed bfdot.
float32x4_t dot_a_few(bfloat16x8_t a, bfloat16x8_t b) {
    // Zero-initialized accumulator that every call below adds into.
    float32x4_t result = vdupq_n_f32(0.0f);
    // Same vector operand `a` each time; only the constant lane index changes.
    result = vbfdotq_laneq_f32(result, a, b, 0);
    result = vbfdotq_laneq_f32(result, a, b, 1);
    result = vbfdotq_laneq_f32(result, a, b, 2);
    result = vbfdotq_laneq_f32(result, a, b, 3);
    return result;
}

// Control case for the `lane` variant: the lane operand `a` is a
// bfloat16x4_t. The quoted output shows this overload lowering to indexed
// bfdot instructions with no dup, which is the behavior the report expects
// from the `laneq` variant as well.
float32x4_t dot_a_few(bfloat16x4_t a, bfloat16x8_t b) {
    // Zero-initialized accumulator that every call below adds into.
    float32x4_t result = vdupq_n_f32(0.0f);
    // Note the operand order: `b` is the vector operand and `a` supplies the
    // lane, so the arguments are swapped relative to the parameter list.
    result = vbfdotq_lane_f32(result, b, a, 0);
 result = vbfdotq_lane_f32(result, b, a, 1);
    return result;
}
```

Output:
```
dot_a_few(__Bfloat16x8_t, __Bfloat16x8_t):
        movi    v2.2d, #0000000000000000
        dup     v3.4s, v1.s[0]
        bfdot   v2.4s, v0.8h, v3.8h
        dup     v3.4s, v1.s[1]
        bfdot   v2.4s, v0.8h, v3.8h
        dup     v3.4s, v1.s[2]
        dup     v1.4s, v1.s[3]
        bfdot   v2.4s, v0.8h, v3.8h
        bfdot   v2.4s, v0.8h, v1.8h
        mov     v0.16b, v2.16b
        ret

dot_a_few(__Bfloat16x4_t, __Bfloat16x8_t):
        movi    v2.2d, #0000000000000000
        bfdot   v2.4s, v1.8h, v0.2h[0]
        bfdot   v2.4s, v1.8h, v0.2h[1]
        mov     v0.16b, v2.16b
        ret
```
Note that `vbfdotq_lane_f32` works as expected and generates an indexed `bfdot`, but `vbfdotq_laneq_f32` does not: it generates an explicit `dup` instruction for each lane, followed by a non-indexed `bfdot`.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to