[llvm-bugs] [Bug 48078] New: __m256i (a+b*2)+b (vpmullw; vpaddw) is slower than (b+a*2)+a (3 vpaddw)

via llvm-bugs Wed, 04 Nov 2020 15:18:25 -0800

https://bugs.llvm.org/show_bug.cgi?id=48078


            Bug ID: 48078
           Summary: __m256i (a+b*2)+b (vpmullw;vpaddw) is slower than
                    (b+a*2)+a (3 vpaddw)
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: unassignedb...@nondot.org
          Reporter: i...@maskray.me
                CC: craig.top...@gmail.com, llvm-bugs@lists.llvm.org,
                    llvm-...@redking.me.uk, pengfei.w...@intel.com,
                    spatel+l...@rotateright.com

Discovered by JP Maaninen

#include <immintrin.h>

__m256i Slow(__m256i a, __m256i b) {
  __m256i c = _mm256_add_epi16(a, _mm256_slli_epi16(a, 1)); 
  return _mm256_add_epi16(c, b);
}

       vpmullw .LCPI0_0(%rip), %ymm0, %ymm0
        vpaddw  %ymm1, %ymm0, %ymm0
        retq

__m256i Fast(__m256i a, __m256i b) {
  __m256i c = _mm256_add_epi16(b, _mm256_slli_epi16(a, 1)); 
  return _mm256_add_epi16(c, a);
}

        vpaddw  %ymm0, %ymm0, %ymm2
        vpaddw  %ymm0, %ymm1, %ymm0
        vpaddw  %ymm2, %ymm0, %ymm0
retq

----

This is either an instcombine problem or a missing optimization in the backend.

define dso_local <4 x i64> @_Z4SlowDv4_xS_(<4 x i64> %a, <4 x i64> %b)
local_unnamed_addr #0 {
entry:
  %0 = bitcast <4 x i64> %a to <16 x i16>
  %1 = shl <16 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16
1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add.i5 = add <16 x i16> %1, %0
  %2 = bitcast <16 x i16> %add.i5 to <4 x i64>
  %3 = bitcast <4 x i64> %b to <16 x i16>
  %add.i = add <16 x i16> %3, %add.i5
  %4 = bitcast <16 x i16> %add.i to <4 x i64>
  ret <4 x i64> %4
}

attributes #0 = { norecurse nounwind readnone uwtable
"disable-tail-calls"="false" "frame-pointer"="none"
"less-precise-fpmad"="false" "min-legal-vector-width"="256"
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
"no-signed-zeros-fp-math"="false" "no-trapping-math"="true"
"stack-protector-buffer-size"="8" "target-cpu"="haswell"
"target-features"="+avx,+avx2,+bmi,+bmi2,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
"unsafe-fp-math"="false" "use-soft-float"="false" }

opt -passes=instcombine -S generates

  %add.i5 = mul <16 x i16> %0, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16
3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>

which is lowered to VPMULLWYrm in X86ISelDAGToDAG and then stays unchanged
through every subsequent codegen pass.

-- 
You are receiving this mail because:
You are on the CC list for the bug.

_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

[llvm-bugs] [Bug 48078] New: __m256i (a+b*2)+b (vpmullw; vpaddw) is slower than (b+a*2)+a (3 vpaddw)

Reply via email to

[llvm-bugs] [Bug 48078] New: __m256i (a+b*2)+b (vpmullw; vpaddw) is slower than (b+a*2)+a (3 vpaddw)