This optimizes the very core streaming routines: streamer_write_char_stream, streamer_write_uhwi_stream and streamer_write_hwi_stream.
In streamer_write_char_stream you can notice that writing the char possibly clobbers the pointer (and everything else) because it uses alias set zero and may point to an arbitrary location. Thus we have to CSE current_pointer manually here. This also leads to very inefficient loops in streamer_write_uhwi_stream and streamer_write_hwi_stream. To optimize them we have to manually inline streamer_write_char_stream and apply loop invariant motion and unswitching. In streamer_write_hwi_stream we do the same but also note that the ! more = !((work == 0 && (byte & 0x40) == 0) ! || (work == -1 && (byte & 0x40) != 0)); test is very inefficient. We can optimize that if we split the shifting of work into two pieces like ! /* If the lower 7-bits are sign-extended 0 or -1 we are finished. */ ! work >>= 6; ! more = !(work == 0 || work == -1); if (more) ! { ! /* More bits to follow. */ ! work >>= 1; ! byte |= 0x80; which results in a very nice core loop .L21: movq %rbp, %rsi movl %ebp, %ecx sarq $6, %rsi andl $127, %ecx addq $1, %rsi cmpq $1, %rsi jbe .L26 orb $-128, %cl sarq $7, %rbp addl $1, %r12d movb %cl, (%rax) addq $1, %rax subl $1, %edx jne .L21 and threaded tail for the !more case: .L26: movb %cl, (%rax) subl $1, %edx addq $1, %rax addl $1, %r12d movq %rax, 16(%rbx) addl %r12d, 32(%rbx) movl %edx, 24(%rbx) ... ret (above produced by g++ 4.6 with -O2). Compared with what is there before the patch: ... 
movl 24(%rdi), %eax jmp .L48 .p2align 4,,10 .p2align 3 .L52: xorl %r13d, %r13d testb $64, %r12b je .L46 .L45: orb $-128, %r12b movl $1, %r13d .L46: testl %eax, %eax jne .L47 movq %rbx, %rdi call _Z16lto_append_blockP17lto_output_stream .L47: movq 16(%rbx), %rax movb %r12b, (%rax) movl 24(%rbx), %eax addq $1, 16(%rbx) addl $1, 32(%rbx) subl $1, %eax testl %r13d, %r13d movl %eax, 24(%rbx) je .L51 .L48: movl %ebp, %r12d sarq $7, %rbp andl $127, %r12d testq %rbp, %rbp je .L52 cmpq $-1, %rbp jne .L45 xorl %r13d, %r13d testb $64, %r12b jne .L46 jmp .L45 .p2align 4,,10 .p2align 3 .L51: addq $8, %rsp ... ret that's a _lot_ better. And hopefully gets streaming down in the profile somewhat. LTO bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk. Richard. 2013-06-12 Richard Biener <rguent...@suse.de> * data-streamer.h (streamer_write_char_stream): CSE obs->current_pointer. * data-streamer-out.c (streamer_write_uhwi_stream): Inline streamer_write_char_stream manually and optimize the resulting loop. (streamer_write_hwi_stream): Likewise. Index: gcc/data-streamer.h =================================================================== *** gcc/data-streamer.h (revision 199935) --- gcc/data-streamer.h (working copy) *************** streamer_write_char_stream (struct lto_o *** 183,190 **** lto_append_block (obs); /* Write the actual character. */ ! *obs->current_pointer = c; ! obs->current_pointer++; obs->total_size++; obs->left_in_block--; } --- 183,191 ---- lto_append_block (obs); /* Write the actual character. */ ! char *current_pointer = obs->current_pointer; ! *(current_pointer++) = c; ! 
obs->current_pointer = current_pointer; obs->total_size++; obs->left_in_block--; } Index: gcc/data-streamer-out.c =================================================================== *** gcc/data-streamer-out.c (revision 199935) --- gcc/data-streamer-out.c (working copy) *************** void *** 187,192 **** --- 187,197 ---- streamer_write_uhwi_stream (struct lto_output_stream *obs, unsigned HOST_WIDE_INT work) { + if (obs->left_in_block == 0) + lto_append_block (obs); + char *current_pointer = obs->current_pointer; + unsigned int left_in_block = obs->left_in_block; + unsigned int size = 0; do { unsigned int byte = (work & 0x7f); *************** streamer_write_uhwi_stream (struct lto_o *** 195,203 **** /* More bytes to follow. */ byte |= 0x80; ! streamer_write_char_stream (obs, byte); } ! while (work != 0); } --- 200,233 ---- /* More bytes to follow. */ byte |= 0x80; ! *(current_pointer++) = byte; ! left_in_block--; ! size++; } ! while (work != 0 && left_in_block > 0); ! if (work != 0) ! { ! obs->left_in_block = 0; ! lto_append_block (obs); ! current_pointer = obs->current_pointer; ! left_in_block = obs->left_in_block; ! do ! { ! unsigned int byte = (work & 0x7f); ! work >>= 7; ! if (work != 0) ! /* More bytes to follow. */ ! byte |= 0x80; ! ! *(current_pointer++) = byte; ! left_in_block--; ! size++; ! } ! while (work != 0); ! } ! obs->current_pointer = current_pointer; ! obs->left_in_block = left_in_block; ! obs->total_size += size; } *************** streamer_write_uhwi_stream (struct lto_o *** 206,226 **** void streamer_write_hwi_stream (struct lto_output_stream *obs, HOST_WIDE_INT work) { ! int more, byte; ! do { ! byte = (work & 0x7f); ! /* arithmetic shift */ ! work >>= 7; ! more = !((work == 0 && (byte & 0x40) == 0) ! || (work == -1 && (byte & 0x40) != 0)); if (more) ! byte |= 0x80; ! ! streamer_write_char_stream (obs, byte); } ! while (more); } /* Write a GCOV counter value WORK to OBS. 
*/ --- 236,291 ---- void streamer_write_hwi_stream (struct lto_output_stream *obs, HOST_WIDE_INT work) { ! if (obs->left_in_block == 0) ! lto_append_block (obs); ! char *current_pointer = obs->current_pointer; ! unsigned int left_in_block = obs->left_in_block; ! unsigned int size = 0; ! bool more; do { ! unsigned int byte = (work & 0x7f); ! /* If the lower 7-bits are sign-extended 0 or -1 we are finished. */ ! work >>= 6; ! more = !(work == 0 || work == -1); if (more) ! { ! /* More bits to follow. */ ! work >>= 1; ! byte |= 0x80; ! } ! ! *(current_pointer++) = byte; ! left_in_block--; ! size++; ! } ! while (more && left_in_block > 0); ! if (more) ! { ! obs->left_in_block = 0; ! lto_append_block (obs); ! current_pointer = obs->current_pointer; ! left_in_block = obs->left_in_block; ! do ! { ! unsigned int byte = (work & 0x7f); ! work >>= 6; ! more = !(work == 0 || work == -1); ! if (more) ! { ! work >>= 1; ! byte |= 0x80; ! } ! ! *(current_pointer++) = byte; ! left_in_block--; ! size++; ! } ! while (more); } ! obs->current_pointer = current_pointer; ! obs->left_in_block = left_in_block; ! obs->total_size += size; } /* Write a GCOV counter value WORK to OBS. */