Michel Lanners writes:

> However, there's something wrong with the IDCT code; the output is
> essentially garbage. Makes for some interesting visual effects, but
> that's about it....

Here is my altivec-enabled IDCT, in assembler.  It does everything
internally in floating point so there is no need for scaling.  It
exports two procedures:

void idct_block_copy_altivec(int16_t *dct_block, uint8_t *dest, int stride);
void idct_block_add_altivec(int16_t *dct_block, uint8_t *dest, int stride);

stride is the offset in bytes between successive rows of dest.  Each
routine does an IDCT of the 8x8 block of 16-bit coefficients at
*dct_block, and either stores the result into the 8x8 block at *dest
(the _copy variant) or adds it to that block (the _add variant).
dct_block has to be 16-byte aligned.  And no, it hasn't been
_deliberately_ obfuscated. :)
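
For reference, here is a minimal caller sketch in C.  The surrounding
names (render_block, plane, intra) are hypothetical; a real decoder
takes dct_block from its coefficient buffer and dest/stride from the
frame plane:

/* Hypothetical caller sketch; not part of idct_vec.S. */
#include <stdint.h>

void idct_block_copy_altivec(int16_t *dct_block, uint8_t *dest, int stride);
void idct_block_add_altivec(int16_t *dct_block, uint8_t *dest, int stride);

/* The coefficient block must be 16-byte aligned. */
static int16_t dct_block[64] __attribute__ ((aligned (16)));

static void render_block(uint8_t *plane, int stride, int x, int y, int intra)
{
        uint8_t *dest = plane + y * stride + x;

        if (intra)
                idct_block_copy_altivec(dct_block, dest, stride); /* overwrite */
        else
                idct_block_add_altivec(dct_block, dest, stride);  /* add to prediction */
}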

I use this in mpeg2dec (actually a hacked version that I use to play
videos off my TiVo), and Anton Blanchard has hacked it into xine.  I
also have altivec-enabled motion compensation routines for libmpeg2.

Hope this is useful...

Paul.

#  idct_vec.S
#
#  Copyright (C) Aaron Holtzman <[EMAIL PROTECTED]> - Nov 1999
#  Copyright (C) Paul Mackerras <[EMAIL PROTECTED]> - Jan 2001
#
#  Adapted from idct.c by Paul Mackerras.
#
#  Portions of this code are from the MPEG software simulation group
#  idct implementation. This code will be replaced with a new
#  implementation soon.
#
#  This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
#        
#  mpeg2dec is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2, or (at your option)
#  any later version.
#   
#  mpeg2dec is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#   
#  You should have received a copy of the GNU General Public License
#  along with mpeg2dec; see the file COPYING.  If not, write to
#  the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
#  MA  02111-1307  USA.

        .data
        .align  4
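# The constants below are IEEE single-precision bit patterns for the usual
# IDCT weights, Wk ~= sqrt(2) * cos(k*pi/16), laid out as the three vectors
# loaded below (wvec2/wvec3/wvec4 in the comments):
#   wvec+0:  W6, W7, sqrt(0.5), W3
#   wvec+16: W1-W7, W1+W7, W3-W5, W3+W5
#   wvec+32: W2-W6, W2+W6, 0, 0
# d (further down) is a 16-byte aligned scratch buffer used to shuttle one
# 8-byte row of dest between the FPU and the vector unit, since lvx/stvx
# only do aligned 16-byte accesses.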
wvec:   .long   0x3f0a8bd4
        .long   0x3e8d42af
        .long   0x3f3504f3
        .long   0x3f968317
        .long   0x3f8e39da
        .long   0x3fd4db31
        .long   0x3ec7c5c2
        .long   0x3ffb14be
        .long   0x3f43ef15
        .long   0x3fec835e
        .long   0
        .long   0

d:      .long   0,0,0,0

        .text
        .globl  idct_block_copy_altivec
idct_block_copy_altivec:
        li      6,0
        b       idct_asm_altivec

        .globl  idct_block_add_altivec
idct_block_add_altivec:
        li      6,1

        .globl  idct_asm_altivec
idct_asm_altivec:
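                                #       r3 = dct_block, r4 = dest, r5 = stride,
                                #       r6 = 0 for copy, 1 for add.
                                #       The x*0 temporaries live in vr0-vr9 and
                                #       the x*1 temporaries in vr10-vr19;
                                #       vr20-vr31 hold weights, scratch values
                                #       and the zero vector.
                                #       Flow: load the 8 rows and convert to
                                #       float, transpose, 1-D IDCT, transpose,
                                #       second 1-D IDCT, then convert back,
                                #       saturate and store.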
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        addi    3,3,16          #        p += 8;
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   0,20,3          #        x00 = vec_ctf(i0, 3);
        vcfsx   10,21,3         #        x01 = vec_ctf(i1, 3);
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        addi    3,3,16          #        p += 8;
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   1,20,3          #        x10 = vec_ctf(i0, 3);
        vcfsx   11,21,3         #        x11 = vec_ctf(i1, 3);
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        addi    3,3,16          #        p += 8;
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   2,20,3          #        x20 = vec_ctf(i0, 3);
        vcfsx   12,21,3         #        x21 = vec_ctf(i1, 3);
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        addi    3,3,16          #        p += 8;
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   3,20,3          #        x30 = vec_ctf(i0, 3);
        vcfsx   13,21,3         #        x31 = vec_ctf(i1, 3);
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        addi    3,3,16          #        p += 8;
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   4,20,3          #        x40 = vec_ctf(i0, 3);
        vcfsx   14,21,3         #        x41 = vec_ctf(i1, 3);
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        addi    3,3,16          #        p += 8;
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   5,20,3          #        x50 = vec_ctf(i0, 3);
        vcfsx   15,21,3         #        x51 = vec_ctf(i1, 3);
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        addi    3,3,16          #        p += 8;
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   6,20,3          #        x60 = vec_ctf(i0, 3);
        vcfsx   16,21,3         #        x61 = vec_ctf(i1, 3);
        lvx     22,0,3          #        ih = *(vector signed short *)(p);
        vupkhsh 20,22           #        i0 = vec_unpackh(ih);
        vupklsh 21,22           #        i1 = vec_unpackl(ih);
        vcfsx   7,20,3          #        x70 = vec_ctf(i0, 3);
        vcfsx   17,21,3         #        x71 = vec_ctf(i1, 3);
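
                                #       Transpose the 8x8 block with merges so
                                #       that each x_k0/x_k1 pair holds
                                #       coefficient k of all eight rows; the
                                #       1-D IDCT below then processes the rows
                                #       in parallel, one row per vector lane.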

        vmrghw  8,0,2           #       x80 = vec_mergeh(x00, x20);
        vmrghw  9,1,3           #       x90 = vec_mergeh(x10, x30);
        vmrglw  18,0,2          #       x81 = vec_mergel(x00, x20);
        vmrglw  19,1,3          #       x91 = vec_mergel(x10, x30);
        vmrghw  0,8,9           #       x00 = vec_mergeh(x80, x90);
        vmrglw  1,8,9           #       x10 = vec_mergel(x80, x90);
        vmrghw  2,18,19         #       x20 = vec_mergeh(x81, x91);
        vmrglw  3,18,19         #       x30 = vec_mergel(x81, x91);

        vmrghw  8,10,12         #       x80 = vec_mergeh(x01, x21);
        vmrghw  9,11,13         #       x90 = vec_mergeh(x11, x31);
        vmrglw  18,10,12        #       x81 = vec_mergel(x01, x21);
        vmrglw  19,11,13        #       x91 = vec_mergel(x11, x31);
        vmrghw  20,4,6          #       y80 = vec_mergeh(x40, x60);
        vmrghw  22,5,7          #       y90 = vec_mergeh(x50, x70);
        vmrglw  21,4,6          #       y81 = vec_mergel(x40, x60);
        vmrglw  23,5,7          #       y91 = vec_mergel(x50, x70);
        vmrghw  4,8,9           #       x40 = vec_mergeh(x80, x90);
        vmrglw  5,8,9           #       x50 = vec_mergel(x80, x90);
        vmrghw  6,18,19         #       x60 = vec_mergeh(x81, x91);
        vmrglw  7,18,19         #       x70 = vec_mergel(x81, x91);
        vmrghw  10,20,22        #       x01 = vec_mergeh(y80, y90);
        vmrglw  11,20,22        #       x11 = vec_mergel(y80, y90);
        vmrghw  12,21,23        #       x21 = vec_mergeh(y81, y91);
        vmrglw  13,21,23        #       x31 = vec_mergel(y81, y91);

        vmrghw  20,14,16        #       y80 = vec_mergeh(x41, x61);
        vmrghw  22,15,17        #       y90 = vec_mergeh(x51, x71);
        vmrglw  21,14,16        #       y81 = vec_mergel(x41, x61);
        vmrglw  23,15,17        #       y91 = vec_mergel(x51, x71);
        vmrghw  14,20,22        #       x41 = vec_mergeh(y80, y90);
        vmrglw  15,20,22        #       x51 = vec_mergel(y80, y90);
        vmrghw  16,21,23        #       x61 = vec_mergeh(y81, y91);
        vmrglw  17,21,23        #       x71 = vec_mergel(y81, y91);

        lis     7,wvec@ha
        addi    7,7,wvec@l
        addi    8,7,16
        addi    9,7,32
        lvx     28,0,7          #       *(vector float *)wvec2;
        lvx     29,0,8          #       *(vector float *)wvec3;
        lvx     30,0,9          #       *(vector float *)wvec4;

        vspltw  20,28,3         #       W3 = vec_splat(wvec2, 3);
        vspltw  21,28,1         #       W7 = vec_splat(wvec2, 1);
        vspltw  22,29,0         #       W1_W7 = vec_splat(wvec3, 0);
        vspltw  23,29,1         #       W1pW7 = vec_splat(wvec3, 1);
        vspltw  24,29,2         #       W3_W5 = vec_splat(wvec3, 2);
        vspltw  25,29,3         #       W3pW5 = vec_splat(wvec3, 3);
        vspltisw 31,0           #       z = (vector float)(0);
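                                #       vr31 is all zero bits, which is +0.0 as
                                #       a float, so it serves both as the
                                #       vec_madd addend z here and as the zero
                                #       byte/halfword vector (zb/zh) used when
                                #       unpacking and packing pixels below.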

                                #       /* first stage */
        vaddfp  26,1,7
        vmaddfp 8,21,26,31      #       x80 = vec_madd(W7, vec_add(x10, x70), z);
        vaddfp  27,11,17
        vmaddfp 18,21,27,31     #       x81 = vec_madd(W7, vec_add(x11, x71), z);
        vmaddfp 1,22,1,8        #       x10 = vec_madd(W1_W7, x10, x80);
        vmaddfp 11,22,11,18     #       x11 = vec_madd(W1_W7, x11, x81);
        vnmsubfp 7,23,7,8       #       x70 = vec_nmsub(W1pW7, x70, x80);
        vnmsubfp 17,23,17,18    #       x71 = vec_nmsub(W1pW7, x71, x81);
        vaddfp  26,5,3
        vmaddfp 8,20,26,31      #       x80 = vec_madd(W3, vec_add(x50, x30), z);
        vaddfp  27,15,13
        vmaddfp 18,20,27,31     #       x81 = vec_madd(W3, vec_add(x51, x31), z);
        vnmsubfp 5,24,5,8       #       x50 = vec_nmsub(W3_W5, x50, x80);
        vnmsubfp 15,24,15,18    #       x51 = vec_nmsub(W3_W5, x51, x81);
        vnmsubfp 3,25,3,8       #       x30 = vec_nmsub(W3pW5, x30, x80);
        vnmsubfp 13,25,13,18    #       x31 = vec_nmsub(W3pW5, x31, x81);
 
        vspltw  20,28,0         #       W6 = vec_splat(wvec2, 0);
        vspltw  21,30,0         #       W2_W6 = vec_splat(wvec4, 0);
        vspltw  22,30,1         #       W2pW6 = vec_splat(wvec4, 1);
        vspltw  23,28,2         #       SQRT0_5 = vec_splat(wvec2, 2);

                                #       /* second stage */
        vaddfp  8,0,4           #       x80 = vec_add(x00, x40);
        vaddfp  18,10,14        #       x81 = vec_add(x01, x41);
        vsubfp  0,0,4           #       x00 = vec_sub(x00, x40);
        vsubfp  10,10,14        #       x01 = vec_sub(x01, x41);
        vaddfp  26,2,6
        vmaddfp 4,20,26,31      #       x40 = vec_madd(W6, vec_add(x20, x60), z);
        vaddfp  27,12,16
        vmaddfp 14,20,27,31     #       x41 = vec_madd(W6, vec_add(x21, x61), z);
        vnmsubfp 6,22,6,4       #       x60 = vec_nmsub(W2pW6, x60, x40);
        vnmsubfp 16,22,16,14    #       x61 = vec_nmsub(W2pW6, x61, x41);
        vmaddfp 2,21,2,4        #       x20 = vec_madd(W2_W6, x20, x40);
        vmaddfp 12,21,12,14     #       x21 = vec_madd(W2_W6, x21, x41);
        vaddfp  4,1,5           #       x40 = vec_add(x10, x50);
        vaddfp  14,11,15        #       x41 = vec_add(x11, x51);
        vsubfp  1,1,5           #       x10 = vec_sub(x10, x50);
        vsubfp  11,11,15        #       x11 = vec_sub(x11, x51);
        vaddfp  5,7,3           #       x50 = vec_add(x70, x30);
        vaddfp  15,17,13        #       x51 = vec_add(x71, x31);
        vsubfp  7,7,3           #       x70 = vec_sub(x70, x30);
        vsubfp  17,17,13        #       x71 = vec_sub(x71, x31);
 
                                #       /* third stage */
        vaddfp  3,8,2           #       x30 = vec_add(x80, x20);
        vaddfp  13,18,12        #       x31 = vec_add(x81, x21);
        vsubfp  8,8,2           #       x80 = vec_sub(x80, x20);
        vsubfp  18,18,12        #       x81 = vec_sub(x81, x21);
        vaddfp  2,0,6           #       x20 = vec_add(x00, x60);
        vaddfp  12,10,16        #       x21 = vec_add(x01, x61);
        vsubfp  0,0,6           #       x00 = vec_sub(x00, x60);
        vsubfp  10,10,16        #       x01 = vec_sub(x01, x61);
        vaddfp  24,1,7
        vmaddfp 6,23,24,31      #       x60 = vec_madd(SQRT0_5, vec_add(x10, x70), z);
        vaddfp  25,11,17
        vmaddfp 16,23,25,31     #       x61 = vec_madd(SQRT0_5, vec_add(x11, x71), z);
        vsubfp  26,1,7
        vmaddfp 1,23,26,31      #       x10 = vec_madd(SQRT0_5, vec_sub(x10, x70), z);
        vsubfp  27,11,17
        vmaddfp 11,23,27,31     #       x11 = vec_madd(SQRT0_5, vec_sub(x11, x71), z);

                                #       /* fourth stage */
        vsubfp  7,3,4           #       x70 = vec_sub(x30, x40);
        vsubfp  17,13,14        #       x71 = vec_sub(x31, x41);
        vaddfp  9,3,4           #       x90 = vec_add(x30, x40);
        vaddfp  19,13,14        #       x91 = vec_add(x31, x41);
        vaddfp  3,8,5           #       x30 = vec_add(x80, x50);
        vaddfp  13,18,15        #       x31 = vec_add(x81, x51);
        vsubfp  4,8,5           #       x40 = vec_sub(x80, x50);
        vsubfp  14,18,15        #       x41 = vec_sub(x81, x51);
        vsubfp  5,0,1           #       x50 = vec_sub(x00, x10);
        vsubfp  15,10,11        #       x51 = vec_sub(x01, x11);
        vaddfp  8,0,1           #       x80 = vec_add(x00, x10);
        vaddfp  18,10,11        #       x81 = vec_add(x01, x11);
        vaddfp  1,2,6           #       x10 = vec_add(x20, x60);
        vaddfp  11,12,16        #       x11 = vec_add(x21, x61);
        vsubfp  6,2,6           #       x60 = vec_sub(x20, x60);
        vsubfp  16,12,16        #       x61 = vec_sub(x21, x61);
                                #       /* x0* is now in x9*, x2* is in x8* */

        vmrghw  20,9,8          #        y80 = vec_mergeh(x90, x80);
        vmrghw  22,1,3          #        y90 = vec_mergeh(x10, x30);
        vmrglw  21,9,8          #        y81 = vec_mergel(x90, x80);
        vmrglw  23,1,3          #        y91 = vec_mergel(x10, x30);
        vmrghw  0,20,22         #        x00 = vec_mergeh(y80, y90);
        vmrglw  1,20,22         #        x10 = vec_mergel(y80, y90);
        vmrghw  2,21,23         #        x20 = vec_mergeh(y81, y91);
        vmrglw  3,21,23         #        x30 = vec_mergel(y81, y91);

        vmrghw  8,19,18         #        x80 = vec_mergeh(x91, x81);
        vmrghw  9,11,13         #        x90 = vec_mergeh(x11, x31);
        vmrglw  18,19,18        #        x81 = vec_mergel(x91, x81);
        vmrglw  19,11,13        #        x91 = vec_mergel(x11, x31);
        vmrghw  20,4,6          #        y80 = vec_mergeh(x40, x60);
        vmrghw  22,5,7          #        y90 = vec_mergeh(x50, x70);
        vmrglw  21,4,6          #        y81 = vec_mergel(x40, x60);
        vmrglw  23,5,7          #        y91 = vec_mergel(x50, x70);
        vmrghw  4,8,9           #        x40 = vec_mergeh(x80, x90);
        vmrglw  5,8,9           #        x50 = vec_mergel(x80, x90);
        vmrghw  6,18,19         #        x60 = vec_mergeh(x81, x91);
        vmrglw  7,18,19         #        x70 = vec_mergel(x81, x91);
        vmrghw  10,20,22        #        x01 = vec_mergeh(y80, y90);
        vmrglw  11,20,22        #        x11 = vec_mergel(y80, y90);
        vmrghw  12,21,23        #        x21 = vec_mergeh(y81, y91);
        vmrglw  13,21,23        #        x31 = vec_mergel(y81, y91);

        vmrghw  20,14,16        #        y80 = vec_mergeh(x41, x61);
        vmrghw  22,15,17        #        y90 = vec_mergeh(x51, x71);
        vmrglw  21,14,16        #        y81 = vec_mergel(x41, x61);
        vmrglw  23,15,17        #        y91 = vec_mergel(x51, x71);
        vmrghw  14,20,22        #        x41 = vec_mergeh(y80, y90);
        vmrglw  15,20,22        #        x51 = vec_mergel(y80, y90);
        vmrghw  16,21,23        #        x61 = vec_mergeh(y81, y91);
        vmrglw  17,21,23        #        x71 = vec_mergel(y81, y91);
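
                                #       Second transpose is done; re-splat the
                                #       weights (vr20-vr25 were clobbered as
                                #       scratch above) and run the same
                                #       four-stage 1-D IDCT along the other
                                #       dimension of the block.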

        vspltw  20,28,3         #       W3 = vec_splat(wvec2, 3);
        vspltw  21,28,1         #       W7 = vec_splat(wvec2, 1);
        vspltw  22,29,0         #       W1_W7 = vec_splat(wvec3, 0);
        vspltw  23,29,1         #       W1pW7 = vec_splat(wvec3, 1);
        vspltw  24,29,2         #       W3_W5 = vec_splat(wvec3, 2);
        vspltw  25,29,3         #       W3pW5 = vec_splat(wvec3, 3);

                                #       /* first stage */
        vaddfp  26,1,7
        vmaddfp 8,21,26,31      #       x80 = vec_madd(W7, vec_add(x10, x70), z);
        vaddfp  27,11,17
        vmaddfp 18,21,27,31     #       x81 = vec_madd(W7, vec_add(x11, x71), z);
        vmaddfp 1,22,1,8        #       x10 = vec_madd(W1_W7, x10, x80);
        vmaddfp 11,22,11,18     #       x11 = vec_madd(W1_W7, x11, x81);
        vnmsubfp 7,23,7,8       #       x70 = vec_nmsub(W1pW7, x70, x80);
        vnmsubfp 17,23,17,18    #       x71 = vec_nmsub(W1pW7, x71, x81);
        vaddfp  26,5,3
        vmaddfp 8,20,26,31      #       x80 = vec_madd(W3, vec_add(x50, x30), z);
        vaddfp  27,15,13
        vmaddfp 18,20,27,31     #       x81 = vec_madd(W3, vec_add(x51, x31), z);
        vnmsubfp 5,24,5,8       #       x50 = vec_nmsub(W3_W5, x50, x80);
        vnmsubfp 15,24,15,18    #       x51 = vec_nmsub(W3_W5, x51, x81);
        vnmsubfp 3,25,3,8       #       x30 = vec_nmsub(W3pW5, x30, x80);
        vnmsubfp 13,25,13,18    #       x31 = vec_nmsub(W3pW5, x31, x81);
 
        vspltw  20,28,0         #       W6 = vec_splat(wvec2, 0);
        vspltw  21,30,0         #       W2_W6 = vec_splat(wvec4, 0);
        vspltw  22,30,1         #       W2pW6 = vec_splat(wvec4, 1);
        vspltw  23,28,2         #       SQRT0_5 = vec_splat(wvec2, 2);

                                #       /* second stage */
        vaddfp  8,0,4           #       x80 = vec_add(x00, x40);
        vaddfp  18,10,14        #       x81 = vec_add(x01, x41);
        vsubfp  0,0,4           #       x00 = vec_sub(x00, x40);
        vsubfp  10,10,14        #       x01 = vec_sub(x01, x41);
        vaddfp  26,2,6
        vmaddfp 4,20,26,31      #       x40 = vec_madd(W6, vec_add(x20, x60), z);
        vaddfp  27,12,16
        vmaddfp 14,20,27,31     #       x41 = vec_madd(W6, vec_add(x21, x61), z);
        vnmsubfp 6,22,6,4       #       x60 = vec_nmsub(W2pW6, x60, x40);
        vnmsubfp 16,22,16,14    #       x61 = vec_nmsub(W2pW6, x61, x41);
        vmaddfp 2,21,2,4        #       x20 = vec_madd(W2_W6, x20, x40);
        vmaddfp 12,21,12,14     #       x21 = vec_madd(W2_W6, x21, x41);
        vaddfp  4,1,5           #       x40 = vec_add(x10, x50);
        vaddfp  14,11,15        #       x41 = vec_add(x11, x51);
        vsubfp  1,1,5           #       x10 = vec_sub(x10, x50);
        vsubfp  11,11,15        #       x11 = vec_sub(x11, x51);
        vaddfp  5,7,3           #       x50 = vec_add(x70, x30);
        vaddfp  15,17,13        #       x51 = vec_add(x71, x31);
        vsubfp  7,7,3           #       x70 = vec_sub(x70, x30);
        vsubfp  17,17,13        #       x71 = vec_sub(x71, x31);
 
                                #       /* third stage */
        vaddfp  3,8,2           #       x30 = vec_add(x80, x20);
        vaddfp  13,18,12        #       x31 = vec_add(x81, x21);
        vsubfp  8,8,2           #       x80 = vec_sub(x80, x20);
        vsubfp  18,18,12        #       x81 = vec_sub(x81, x21);
        vaddfp  2,0,6           #       x20 = vec_add(x00, x60);
        vaddfp  12,10,16        #       x21 = vec_add(x01, x61);
        vsubfp  0,0,6           #       x00 = vec_sub(x00, x60);
        vsubfp  10,10,16        #       x01 = vec_sub(x01, x61);
        vaddfp  24,1,7
        vmaddfp 6,23,24,31      #       x60 = vec_madd(SQRT0_5, vec_add(x10, x70), z);
        vaddfp  25,11,17
        vmaddfp 16,23,25,31     #       x61 = vec_madd(SQRT0_5, vec_add(x11, x71), z);
        vsubfp  26,1,7
        vmaddfp 1,23,26,31      #       x10 = vec_madd(SQRT0_5, vec_sub(x10, x70), z);
        vsubfp  27,11,17
        vmaddfp 11,23,27,31     #       x11 = vec_madd(SQRT0_5, vec_sub(x11, x71), z);

                                #       /* fourth stage */
        vsubfp  7,3,4           #       x70 = vec_sub(x30, x40);
        vsubfp  17,13,14        #       x71 = vec_sub(x31, x41);
        vaddfp  9,3,4           #       x90 = vec_add(x30, x40);
        vaddfp  19,13,14        #       x91 = vec_add(x31, x41);
        vaddfp  3,8,5           #       x30 = vec_add(x80, x50);
        vaddfp  13,18,15        #       x31 = vec_add(x81, x51);
        vsubfp  4,8,5           #       x40 = vec_sub(x80, x50);
        vsubfp  14,18,15        #       x41 = vec_sub(x81, x51);
        vsubfp  5,0,1           #       x50 = vec_sub(x00, x10);
        vsubfp  15,10,11        #       x51 = vec_sub(x01, x11);
        vaddfp  8,0,1           #       x80 = vec_add(x00, x10);
        vaddfp  18,10,11        #       x81 = vec_add(x01, x11);
        vaddfp  1,2,6           #       x10 = vec_add(x20, x60);
        vaddfp  11,12,16        #       x11 = vec_add(x21, x61);
        vsubfp  6,2,6           #       x60 = vec_sub(x20, x60);
        vsubfp  16,12,16        #       x61 = vec_sub(x21, x61);
                                #       /* x0* is now in x9*, x2* is in x8* */
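
                                #       Output stage: cmpwi latches the copy/add
                                #       flag into cr0 before r6 is reused as the
                                #       address of d.  Each row is converted
                                #       back to 32-bit ints, packed to signed
                                #       shorts with saturation, optionally has
                                #       the existing dest row added (fetched via
                                #       the aligned buffer d), then is packed to
                                #       unsigned bytes and 8 bytes are copied
                                #       back to dest.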

        cmpwi   6,0
        lis     6,d@ha
        addi    6,6,d@l
        vctsxs  20,9,0          #       i0 = vec_cts(x90, 0);
        vctsxs  21,19,0         #       i1 = vec_cts(x91, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;
        add     4,4,5           #       dest += stride;
        vctsxs  20,1,0          #       i0 = vec_cts(x10, 0);
        vctsxs  21,11,0         #       i1 = vec_cts(x11, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;
        add     4,4,5           #       dest += stride;
        vctsxs  20,8,0          #       i0 = vec_cts(x80, 0);
        vctsxs  21,18,0         #       i1 = vec_cts(x81, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;
        add     4,4,5           #       dest += stride;
        vctsxs  20,3,0          #       i0 = vec_cts(x30, 0);
        vctsxs  21,13,0         #       i1 = vec_cts(x31, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;
        add     4,4,5           #       dest += stride;
        vctsxs  20,4,0          #       i0 = vec_cts(x40, 0);
        vctsxs  21,14,0         #       i1 = vec_cts(x41, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;
        add     4,4,5           #       dest += stride;
        vctsxs  20,5,0          #       i0 = vec_cts(x50, 0);
        vctsxs  21,15,0         #       i1 = vec_cts(x51, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;
        add     4,4,5           #       dest += stride;
        vctsxs  20,6,0          #       i0 = vec_cts(x60, 0);
        vctsxs  21,16,0         #       i1 = vec_cts(x61, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;
        add     4,4,5           #       dest += stride;
        vctsxs  20,7,0          #       i0 = vec_cts(x70, 0);
        vctsxs  21,17,0         #       i1 = vec_cts(x71, 0);
        vpkswss 22,20,21        #       ih = vec_packs(i0, i1);
        beq     1f              #       if (accum) {
        lfd     0,0(4)
        stfd    0,0(6)          #               *(long long *)&d = *(long long *)dest;
        lvx     24,0,6
        vmrghb  23,31,24        #               dh = (vector signed short) vec_mergeh(zb, d);
        vaddshs 22,23,22        #               ih = vec_adds(dh, ih);
1:                              #       }
        vpkshus 24,22,31
        stvx    24,0,6          #       d = vec_packsu(ih, zh);
        lfd     0,0(6)  
        stfd    0,0(4)          #       *(long long *)dest = *(long long *)&d;

        blr
