https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66741
--- Comment #1 from Bernhard Reutner-Fischer <aldot at gcc dot gnu.org> ---

i.e. maybe something more along the lines of:

$ cat <<EOF | gcc-5 -xc -S - -o - -Ofast -fomit-frame-pointer -minline-all-stringops -mstringop-strategy=unrolled_loop -fdump-tree-all-all -fdump-rtl-all-all -fdump-ipa-all-all -msse4
#include <smmintrin.h>
#include <assert.h>
#include <stdint.h>
void sse_tolower_strcpy (const char *d, const char *s) {
  __m128i ranges = _mm_setr_epi8 ('A', 'Z', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  __m128i *src = (__m128i *) s;
  __m128i *dst = (__m128i *) d;
  const __m128i diff = _mm_set1_epi8 (0x20);
  const uint8_t mode = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK;
  for (;; src++, dst++) {
    const __m128i chunk = _mm_loadu_si128 (src);
    if (_mm_cmpistrc (ranges, chunk, mode)) {
      const __m128i tmp1 = _mm_cmpistrm (ranges, chunk, mode);
      const __m128i mask = _mm_and_si128 (tmp1, diff);
      _mm_storeu_si128 (dst, _mm_xor_si128 (chunk, mask));
    }
    if (_mm_cmpistrz (ranges, chunk, mode))
      break;
  }
}
#ifdef MAIN
#include <unistd.h>
#include <string.h>
int main(void) {
  char src[128], dest[128];
  int n = read(0, &src, sizeof(src));
  if (n < 1) 1;
  src[n] = 0;
  sse_tolower_strcpy(dest, src);
  write(2, dest, strlen(dest));
  return 0;
}
#endif
EOF
	.file ""
	.section .text.unlikely,"ax",@progbits
.LCOLDB2:
	.text
.LHOTB2:
	.p2align 4,,15
	.globl sse_tolower_strcpy
	.type sse_tolower_strcpy, @function
sse_tolower_strcpy:
.LFB641:
	.cfi_startproc
	movdqa .LC0(%rip), %xmm2
	movdqa .LC1(%rip), %xmm3
	jmp .L4
	.p2align 4,,10
	.p2align 3
.L2:
	pcmpistrm $68, %xmm1, %xmm2
	je .L1
.L9:
	addq $16, %rsi
	addq $16, %rdi
.L4:
	movdqu (%rsi), %xmm1
	pcmpistrm $68, %xmm1, %xmm2
	jnc .L2
	pand %xmm3, %xmm0
	pxor %xmm1, %xmm0
	movups %xmm0, (%rdi)
	pcmpistrm $68, %xmm1, %xmm2
	jne .L9
.L1:
	rep ret
	.cfi_endproc
.LFE641:
	.size sse_tolower_strcpy, .-sse_tolower_strcpy
	.section .text.unlikely
.LCOLDE2:
	.text
.LHOTE2:
	.section .rodata.cst16,"aM",@progbits,16
	.align 16
.LC0:
	.byte 65
	.byte 90
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.align 16
.LC1:
	.quad 2314885530818453536
	.quad 2314885530818453536
	.ident "GCC: (Debian 5.1.1-12) 5.1.1 20150622"
	.section .note.GNU-stack,"",@progbits

This would be *much* smaller and is supposedly also faster:

   text    data     bss     dec     hex filename
    228       0       0     228      e4 comment0.o
    153       0       0     153      99 comment1.o