clearing many bytes variables (could use one machine instruction)?

Basile Starynkevitch Tue, 09 Mar 2010 01:58:23 -0800

Hello All,

With a recently compiled gcc-trunk on x86-64/Linux, I am compiling the 
following example:


#################

/* file testmanychar.c */
extern void g (int, char *, char *, char *);

void
f (void)
{
  char x0, x1, x2, x3, x4, x5, x6, x7;
  /* assuming  x0 is word aligned on a x86_64, and variables are bytes in 
memory, we could clear all the variables in one machine instruction */
  x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = (char) 0;
  g (10, &x0, &x1, &x2);
  g (20, &x2, &x3, &x4);
  g (30, &x4, &x5, &x6);
  g (40, &x6, &x7, &x0);
}

#################

My intuition was that GCC could store x0 at a 64-bit-aligned address, x1 
immediately after it, and so on, and then clear all eight bytes at once using a 
single machine instruction [clearing a 64-bit word].

But this is not the case, since 
   gcc-trunk -S -O3 -fverbose-asm testmanychar.c
gives the following code

#################
        .type   f, @function
f:
.LFB0:
        .cfi_startproc
        movq    %rbx, -24(%rsp) #,
        movq    %rbp, -16(%rsp) #,
        movl    $10, %edi       #,
        movq    %r12, -8(%rsp)  #,
        subq    $40, %rsp       #,
        .cfi_def_cfa_offset 48
        leaq    13(%rsp), %rbx  #, tmp58
        .cfi_offset 12, -16
        .cfi_offset 6, -24
        .cfi_offset 3, -32
        leaq    15(%rsp), %rbp  #, tmp60
        leaq    14(%rsp), %rdx  #, tmp59
        leaq    11(%rsp), %r12  #, tmp61
        movb    $0, 8(%rsp)     #, x7
        movb    $0, 9(%rsp)     #, x6
        movq    %rbx, %rcx      # tmp58,
        movq    %rbp, %rsi      # tmp60,
        movb    $0, 10(%rsp)    #, x5
        movb    $0, 11(%rsp)    #, x4
        movb    $0, 12(%rsp)    #, x3
        movb    $0, 13(%rsp)    #, x2
        movb    $0, 14(%rsp)    #, x1
        movb    $0, 15(%rsp)    #, x0
        call    g       #
        leaq    12(%rsp), %rdx  #, tmp62
        movq    %r12, %rcx      # tmp61,
        movq    %rbx, %rsi      # tmp58,
        movl    $20, %edi       #,
        leaq    9(%rsp), %rbx   #, tmp64
        call    g       #
        leaq    10(%rsp), %rdx  #, tmp65
        movq    %rbx, %rcx      # tmp64,
        movq    %r12, %rsi      # tmp61,
        movl    $30, %edi       #,
        call    g       #
        leaq    8(%rsp), %rdx   #, tmp68
        movq    %rbp, %rcx      # tmp60,
        movq    %rbx, %rsi      # tmp64,
        movl    $40, %edi       #,
        call    g       #
        movq    16(%rsp), %rbx  #,
        movq    24(%rsp), %rbp  #,
        movq    32(%rsp), %r12  #,
        addq    $40, %rsp       #,
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE0:
        .size   f, .-f
        .ident  "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision 
157303]"

#####################


With  
  gcc-trunk -S -O3 -fverbose-asm -march=core2 -mtune=core2 testmanychar.c
I still get

##################

# options passed:  testmanychar.c -march=core2 -mtune=core2 -O3

.globl f
        .type   f, @function
f:
.LFB0:
        .cfi_startproc
        movq    %rbx, -24(%rsp) #,
        movq    %rbp, -16(%rsp) #,
        movq    %r12, -8(%rsp)  #,
        movl    $10, %edi       #,
        subq    $40, %rsp       #,
        .cfi_def_cfa_offset 48
        leaq    13(%rsp), %rbx  #, tmp58
        .cfi_offset 12, -16
        .cfi_offset 6, -24
        .cfi_offset 3, -32
        leaq    15(%rsp), %rbp  #, tmp60
        leaq    11(%rsp), %r12  #, tmp61
        leaq    14(%rsp), %rdx  #, tmp59
        movq    %rbx, %rcx      # tmp58,
        movq    %rbp, %rsi      # tmp60,
        movb    $0, 8(%rsp)     #, x7
        movb    $0, 9(%rsp)     #, x6
        movb    $0, 10(%rsp)    #, x5
        movb    $0, 11(%rsp)    #, x4
        movb    $0, 12(%rsp)    #, x3
        movb    $0, 13(%rsp)    #, x2
        movb    $0, 14(%rsp)    #, x1
        movb    $0, 15(%rsp)    #, x0
        call    g       #
        leaq    12(%rsp), %rdx  #, tmp62
        movq    %r12, %rcx      # tmp61,
        movq    %rbx, %rsi      # tmp58,
        movl    $20, %edi       #,
        leaq    9(%rsp), %rbx   #, tmp64
        call    g       #
        leaq    10(%rsp), %rdx  #, tmp65
        movq    %rbx, %rcx      # tmp64,
        movq    %r12, %rsi      # tmp61,
        movl    $30, %edi       #,
        call    g       #
        leaq    8(%rsp), %rdx   #, tmp68
        movq    %rbp, %rcx      # tmp60,
        movq    %rbx, %rsi      # tmp64,
        movl    $40, %edi       #,
        call    g       #
        movq    16(%rsp), %rbx  #,
        movq    24(%rsp), %rbp  #,
        movq    32(%rsp), %r12  #,
        addq    $40, %rsp       #,
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE0:
        .size   f, .-f
        .ident  "GCC: (GNU) 4.5.0 20100309 (experimental) [trunk revision 
157303]"

####
I was hoping that 
        movb    $0, 8(%rsp)     #, x7
        movb    $0, 9(%rsp)     #, x6
        movb    $0, 10(%rsp)    #, x5
        movb    $0, 11(%rsp)    #, x4
        movb    $0, 12(%rsp)    #, x3
        movb    $0, 13(%rsp)    #, x2
        movb    $0, 14(%rsp)    #, x1
        movb    $0, 15(%rsp)    #, x0
could be just something like
        movq    $0, 8(%rsp)
or something similar.

I do realize that such an optimization is difficult to implement
(it would probably mess with the register allocator, etc.). Or is the Core2
processor sufficiently smart to execute a sequence of 8 consecutive byte moves
exactly as fast as a single 8-byte word move?


Regards.
-- 
Basile STARYNKEVITCH         http://starynkevitch.net/Basile/
email: basile<at>starynkevitch<dot>net mobile: +33 6 8501 2359
8, rue de la Faiencerie, 92340 Bourg La Reine, France
*** opinions {are only mines, sont seulement les miennes} ***

clearing many bytes variables (could use one machine instruction)?

Reply via email to