| Issue |
170883
|
| Summary |
ARM vbfdotq_laneq_f32 not generating indexed bfdot
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
dsharlet
|
Reproducer: https://godbolt.org/z/vdzs9nYWb
It's short enough to reproduce here. Input:
```
#include <arm_neon.h>
// Failing case: uses the laneq intrinsic, whose indexed operand `b` is a
// full bfloat16x8_t. Starting from a zeroed accumulator, it accumulates four
// vbfdotq_laneq_f32 results, selecting lanes 0 through 3 of `b` in turn.
float32x4_t dot_a_few(bfloat16x8_t a, bfloat16x8_t b) {
float32x4_t result = vdupq_n_f32(0.0f);
// Each call below is expected to become one indexed bfdot; the output quoted
// in this report instead shows a dup of the selected lane followed by a
// non-indexed bfdot.
result = vbfdotq_laneq_f32(result, a, b, 0);
result = vbfdotq_laneq_f32(result, a, b, 1);
result = vbfdotq_laneq_f32(result, a, b, 2);
result = vbfdotq_laneq_f32(result, a, b, 3);
return result;
}
// Control case: uses the lane intrinsic, whose indexed operand is a
// bfloat16x4_t. Note the operands are swapped relative to the overload above:
// `b` (bfloat16x8_t) is the full-width vector operand and `a` (bfloat16x4_t)
// is the operand indexed by the lane number.
float32x4_t dot_a_few(bfloat16x4_t a, bfloat16x8_t b) {
float32x4_t result = vdupq_n_f32(0.0f);
// Both calls compile to an indexed bfdot in the output quoted in this report.
result = vbfdotq_lane_f32(result, b, a, 0);
result = vbfdotq_lane_f32(result, b, a, 1);
return result;
}
```
Output:
```
// Compiler output for the vbfdotq_laneq_f32 version (the problematic case).
// v0 = a, v1 = b, v2 = accumulator. Every lane of b is first broadcast with
// dup and then consumed by a non-indexed (vector) bfdot.
dot_a_few(__Bfloat16x8_t, __Bfloat16x8_t):
// Zero the accumulator (the vdupq_n_f32(0.0f) in the source).
movi v2.2d, #0000000000000000
// Lane 0: broadcast v1.s[0] into v3, then vector bfdot into v2.
dup v3.4s, v1.s[0]
bfdot v2.4s, v0.8h, v3.8h
// Lane 1: same dup + vector bfdot pattern.
dup v3.4s, v1.s[1]
bfdot v2.4s, v0.8h, v3.8h
// Lanes 2 and 3: both broadcasts are issued first (lane 3 overwrites v1
// itself), followed by the two vector bfdots.
dup v3.4s, v1.s[2]
dup v1.4s, v1.s[3]
bfdot v2.4s, v0.8h, v3.8h
bfdot v2.4s, v0.8h, v1.8h
// Copy the accumulator into v0 before returning.
mov v0.16b, v2.16b
ret
// Compiler output for the vbfdotq_lane_f32 version (the expected codegen).
// v0 = a (indexed operand), v1 = b, v2 = accumulator. The lane is selected
// directly by the indexed form of bfdot; no dup is emitted.
dot_a_few(__Bfloat16x4_t, __Bfloat16x8_t):
// Zero the accumulator (the vdupq_n_f32(0.0f) in the source).
movi v2.2d, #0000000000000000
// Indexed bfdot using element pairs 0 and 1 of v0.
bfdot v2.4s, v1.8h, v0.2h[0]
bfdot v2.4s, v1.8h, v0.2h[1]
// Copy the accumulator into v0 before returning.
mov v0.16b, v2.16b
ret
```
Note that vbfdotq_lane_f32 works and generates an indexed bfdot, but vbfdotq_laneq_f32 does not: it generates an explicit dup instruction for each lane, followed by a non-indexed bfdot.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs