--- Comment #6 from Oleg Endo <olegendo at gcc dot> ---
Created attachment 31144
stitching addc insns

The attached patch is an example that shows how widening additions can be
stitched together.  One application would be arithmetic on user-defined integer
types with an arbitrary number of bits.
For example (requires C++11):

// Fixed-width unsigned integer made of 32-bit-style machine words
// (least significant word first).  'Bits' is rounded up to a whole number
// of words; the unused high bits of the top word are not masked.
template <unsigned int Bits> class integer
{
public:
  typedef unsigned int word_type;
  // Type wide enough to hold the sum of two words plus a carry.
  typedef unsigned long long ext_word_type;

  static constexpr unsigned int bit_count = Bits;
  static constexpr unsigned int word_bit_count = sizeof (word_type) * 8;
  // Number of words needed to hold bit_count bits (rounded up).
  static constexpr unsigned int word_count = (bit_count + word_bit_count - 1)
                                              / word_bit_count;

  word_type word[word_count];

  // Word-wise addition with carry propagation from the least significant
  // word upwards.  The carry out of the most significant word is discarded.
  friend integer
  operator + (const integer& a, const integer& b)
  {
    integer result;

    word_type carry = 0;
    for (unsigned int i = 0; i < word_count; ++i)
    {
      // Widening addition: the low half is the result word, the high half
      // is the carry into the next word.
      auto sum = (ext_word_type)a.word[i] + (ext_word_type)b.word[i] + carry;
      result.word[i] = (word_type)sum;
      // Normalize the high half to 0 or 1.
      carry = (sum >> word_bit_count) == 0 ? 0 : 1;
    }

    return result;
  }
};

With this patch, the following examples, compiled with -funroll-all-loops -O2,
demonstrate the effect:


// Adds two integer<64> values (two words each) with the friend operator +.
integer<64> test_02 (const integer<64>& a, const integer<64>& b)
{
  return a + b;
}

        mov.l   @r5,r1
        mov.l   @r4,r0
        mov.l   @(4,r5),r2
        addc    r1,r0
        mov.l   @(4,r4),r1
        addc    r2,r1

This is the same as a 'native' 64-bit addition.


// Adds two integer<80> values (three words each) with the friend operator +.
integer<80> test_03 (const integer<80>& a, const integer<80>& b)
{
  return a + b;
}

        mov.l   @r5,r3
        mov.l   @r4,r1
        mov.l   @(4,r5),r0
        mov.l   @(4,r4),r6
        addc    r3,r1
        mov.l   @(8,r5),r5
        mov.l   @(8,r4),r4
        addc    r0,r6
        mov.l   r1,@r2
        mov     r2,r0
        addc    r5,r4
        mov.l   r6,@(4,r2)
        mov.l   r4,@(8,r2)

80 bits are rounded up to 96 in the template 'integer', so 3 addc insns are
required to do the 96-bit addition.

However, when compiling without loop unrolling, it doesn't work because there
is no mechanism to feed back the carry variable inside a loop.  The test_03
function becomes:

        mov.l   r8,@-r15
        mov.l   r9,@-r15
        mov.l   r10,@-r15
        mov.l   r11,@-r15
        mov     #0,r0
        mov     #0,r10
        mov.l   @(r0,r4),r3
        mov.l   @(r0,r5),r1
        mov     r10,r7     // r7 = carry from previous iteration
        mov     #0,r6
        mov     r1,r9
        addc    r3,r9      // r9 = a[i] + b[i] (lsw of 64 bit result)
        movt    r10        // r10 = carry (msw of 64 bit result)
        mov     r9,r11
        addc    r7,r11     // r11 = r9 + previous carry (lsw of 64 bit result)
        addc    r6,r10     // r10 = carry for next iteration
        mov.l   r11,@(r0,r2)
        add     #4,r0
        cmp/eq  #12,r0
        bf      .L4

        mov.l   @r15+,r11
        mov     r2,r0
        mov.l   @r15+,r10
        mov.l   @r15+,r9
        mov.l   @r15+,r8

Instead, the loop could be something like this:

        mov     #0,r0
        mov     #0,r6
        mov.l   @(r0,r4),r3
        mov.l   @(r0,r5),r1
        cmp/pl  r6         // T = r6 > 0 (get carry into T bit)
        addc    r3,r1      // r1 = a[i] + b[i] + T (lsw of 64 bit result)
        movt    r6         // r6 = new carry (msw of 64 bit result)
        mov.l   r1,@(r0,r2)
        add     #4,r0
        cmp/eq  #12,r0
        bf      .L4

        mov     r2,r0

However, that would require some loop analysis in order to discover the T bit
feedback opportunity.

Reply via email to