------- Comment #4 from ajrobb at bigfoot dot com  2008-08-31 07:48 -------
I should point out that for operations (x * y / z) where y > z, there is
nothing to be gained on an Intel Core 2 Duo, as extra adds and a shrd are
required for the scaling to work over the whole domain 0..(1<<32)/y. However,
there are still gains to be had where y < z and a simple 32-bit scale factor
can be used.

In the following example, fast is at least 20% faster than std with gcc 3.4
(Cygwin) when compiled without inlining (gcc -O2 -fomit-frame-pointer).

/*
 * Reference implementation: scale x by 40/47 using plain integer arithmetic.
 *
 * The multiplication is carried out in 32 bits, so x * 40 wraps modulo 2^32
 * once x exceeds (1 << 32) / 40; the result is the true quotient only for
 * inputs at or below that bound.
 */
uint32_t std(uint32_t x) {
  return x * 40 / 47;
}

/*
 * Scale x by 40/47 with one 32x32->64 multiply instead of a multiply
 * followed by a divide.
 *
 * fact is ceil(40 * 2^32 / 47), i.e. the ratio 40/47 as a 0.32 fixed-point
 * number rounded up.  The high 32 bits of x * fact are then floor(x * 40 / 47).
 * Rounding fact up introduces an error of 12 * x / (47 * 2^32), so the result
 * is exact for every x below 2^32 / 12, which covers the whole range
 * 0..(1 << 32) / 40 in which the plain 32-bit x * 40 / 47 does not wrap.
 * Above 2^32 / 12 the result may be one too large.
 */
uint32_t fast(uint32_t x) {
  static const uint64_t fact = ((((uint64_t)40) << 32) + 46) / 47;
  /* x is promoted to 64 bits; keep only the integer part of the product. */
  return (uint32_t)((x * fact) >> 32);
}

        # Compiler output for std(): x * 40 / 47.  The division by 47 is done
        # as a multiply by ceil(2^37 / 47) followed by a total right shift of 37.
        # NOTE(review): the _std: label and the final ret do not appear in this
        # listing - presumably dropped when the message was archived.
        .def    _std;   .scl    2;      .type   32;     .endef
        movl    4(%esp), %edx           # edx = x (loaded from 4(%esp))
        movl    $-1370734243, %eax      # eax = 2924233053 = ceil(2^37 / 47)
        leal    (%edx,%edx,4), %edx     # edx = x * 5
        sall    $3, %edx                # edx = x * 40 (32-bit, wraps)
        mull    %edx                    # edx:eax = (x * 40) * magic
        shrl    $5, %edx                # high word >> 5, i.e. product >> 37
        movl    %edx, %eax              # copy the quotient into eax
        # Compiler output for fast(): a single multiply by the 0.32 fixed-point
        # factor, keeping the high 32 bits of the 64-bit product.
        # NOTE(review): the _fast: label and the final ret do not appear in this
        # listing - presumably dropped when the message was archived.
        .p2align 4,,15
.globl _fast
        .def    _fast;  .scl    2;      .type   32;     .endef
        movl    $-639675980, %eax       # eax = 3655291316 = ceil(40 * 2^32 / 47)
        mull    4(%esp)                 # edx:eax = fact * x
        movl    %edx, %eax              # eax = high 32 bits, i.e. product >> 32
        xorl    %edx, %edx              # clear edx (presumably the upper half of the shifted 64-bit value)



Reply via email to