Finish providing SIMD versions for POWER8 VSX of the functions in libswscale/input.c. This should allow trac ticket #5570 to be closed. The speedups obtained for the functions are:
abgrToA_c 1.19 bgr24ToUV_c 1.23 bgr24ToUV_half_c 1.37 bgr24ToY_c_vsx 1.43 nv12ToUV_c 1.05 nv21ToUV_c 1.06 planar_rgb_to_uv 1.25 planar_rgb_to_y 1.26 rgb24ToUV_c 1.11 rgb24ToUV_half_c 1.10 rgb24ToY_c 0.92 rgbaToA_c 0.88 uyvyToUV_c 1.05 uyvyToY_c 1.15 yuy2ToUV_c 1.07 yuy2ToY_c 1.17 yvy2ToUV_c 1.05 --- libswscale/ppc/input_vsx.c | 1021 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 1017 insertions(+), 4 deletions(-) diff --git a/libswscale/ppc/input_vsx.c b/libswscale/ppc/input_vsx.c index d977a32..35edd5e 100644 --- a/libswscale/ppc/input_vsx.c +++ b/libswscale/ppc/input_vsx.c @@ -30,6 +30,7 @@ #include "libavutil/mathematics.h" #include "libavutil/pixdesc.h" #include "libavutil/avassert.h" +#include "libavutil/timer.h" #include "config.h" #include "libswscale/rgb2rgb.h" #include "libswscale/swscale.h" @@ -54,6 +55,7 @@ static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus for ( i = 0; i < width_adj; i += 8) { vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr); vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16)); + vector int v_dst; v_rd0 = vec_and(v_rd0, vec_splats(0x0ff)); v_rd1 = vec_and(v_rd1, vec_splats(0x0ff)); @@ -61,8 +63,8 @@ static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus v_rd0 = vec_sl(v_rd0, vec_splats((unsigned)6)); v_rd1 = vec_sl(v_rd1, vec_splats((unsigned)6)); - vector int v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char) - {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29})); + v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29})); vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); src_addr += 32; @@ -91,6 +93,7 @@ static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus for ( i = 0; i < width_adj; i += 8) { vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr); vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16)); + vector int v_dst; v_rd0 = 
vec_sld(v_rd0, v_rd0, 13); v_rd1 = vec_sld(v_rd1, v_rd1, 13); @@ -101,8 +104,8 @@ static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus v_rd0 = vec_sl(v_rd0, vec_splats((unsigned)6)); v_rd1 = vec_sl(v_rd1, vec_splats((unsigned)6)); - vector int v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char) - {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29})); + v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29})); vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); src_addr += 32; @@ -114,6 +117,175 @@ static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus } } +static void monoblack2Y_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, + int width, uint32_t *unused) +{ + int16_t *dst = (int16_t *)_dst; + int i, j, width_adj, frag_len; + + vector unsigned char v_rd; + vector signed short v_din, v_d, v_dst; + vector unsigned short v_opr; + + uintptr_t src_addr = (uintptr_t)src; + uintptr_t dst_addr = (uintptr_t)dst; + + width = (width + 7) >> 3; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + v_opr = (vector unsigned short) {7, 6, 5, 4, 3, 2, 1, 0}; + + for (i = 0; i < width_adj; i += 8) { + if (i & 0x0f) { + v_rd = vec_sld(v_rd, v_rd, 8); + } else { + v_rd = vec_vsx_ld(0, (unsigned char *)src_addr); + src_addr += 16; + } + + v_din = vec_unpackh((vector signed char)v_rd); + v_din = vec_and(v_din, vec_splats((short)0x00ff)); + + for (j = 0; j < 8; j++) { + switch(j) { + case 0: + v_d = vec_splat(v_din, 0); + break; + case 1: + v_d = vec_splat(v_din, 1); + break; + case 2: + v_d = vec_splat(v_din, 2); + break; + case 3: + v_d = vec_splat(v_din, 3); + break; + case 4: + v_d = vec_splat(v_din, 4); + break; + case 5: + v_d = vec_splat(v_din, 5); + break; + case 6: + v_d = 
vec_splat(v_din, 6); + break; + case 7: + v_d = vec_splat(v_din, 7); + break; + } + + v_dst = vec_sr(v_d, v_opr); + v_dst = vec_and(v_dst, vec_splats((short)1)); + v_dst = v_dst * vec_splats((short)16383); + + vec_vsx_st(v_dst, 0, (short *)dst_addr); + dst_addr += 16; + } + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int d = src[i]; + for (j = 0; j < 8; j++) + dst[8*i+j]= ((d>>(7-j))&1) * 16383; + } + + i = width; + if(width&7){ + int d= src[i]; + for (j = 0; j < (width&7); j++) + dst[8*i+j]= ((d>>(7-j))&1) * 16383; + } +} + +static void monowhite2Y_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, + int width, uint32_t *unused) +{ + int16_t *dst = (int16_t *)_dst; + int i, j, width_adj, frag_len; + + vector unsigned char v_rd; + vector signed short v_din, v_d, v_dst; + vector unsigned short v_opr; + + uintptr_t src_addr = (uintptr_t)src; + uintptr_t dst_addr = (uintptr_t)dst; + + width = (width + 7) >> 3; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + v_opr = (vector unsigned short) {7, 6, 5, 4, 3, 2, 1, 0}; + + for (i = 0; i < width_adj; i += 8) { + if (i & 0x0f) { + v_rd = vec_sld(v_rd, v_rd, 8); + } else { + v_rd = vec_vsx_ld(0, (unsigned char *)src_addr); + src_addr += 16; + } + + v_din = vec_unpackh((vector signed char)v_rd); + v_din = vec_and(v_din, vec_splats((short)0x00ff)); + v_din = vec_xor(v_din, vec_splats((short)0xffff)); + + for (j = 0; j < 8; j++) { + switch(j) { + case 0: + v_d = vec_splat(v_din, 0); + break; + case 1: + v_d = vec_splat(v_din, 1); + break; + case 2: + v_d = vec_splat(v_din, 2); + break; + case 3: + v_d = vec_splat(v_din, 3); + break; + case 4: + v_d = vec_splat(v_din, 4); + break; + case 5: + v_d = vec_splat(v_din, 5); + break; + case 6: + v_d = vec_splat(v_din, 6); + break; + case 7: + v_d = vec_splat(v_din, 7); + break; + } + + v_dst = vec_sr(v_d, 
v_opr); + v_dst = vec_and(v_dst, vec_splats((short)1)); + v_dst = v_dst * vec_splats((short)16383); + + vec_vsx_st(v_dst, 0, (short *)dst_addr); + dst_addr += 16; + } + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int d = ~src[i]; + for (j = 0; j < 8; j++) + dst[8*i+j]= ((d>>(7-j))&1) * 16383; + } + + i = width; + if(width&7){ + int d= ~src[i]; + for (j = 0; j < (width&7); j++) + dst[8*i+j]= ((d>>(7-j))&1) * 16383; + } +} + static void yuy2ToY_c_vsx(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused) { @@ -380,6 +552,806 @@ static void nv21ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, nvXXtoUV_c_vsx(dstV, dstU, src1, width); } +static void bgr24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, + int width, uint32_t *rgb2yuv) +{ + int16_t *dst = (int16_t *)_dst; + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; + int i, j, width_adj, frag_len; + + vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr; + + vector short v_tmp_s, v_dst; + vector int v_r, v_g, v_b, v_rslt; + + vector int v_ry = vec_splats((int)ry); + vector int v_gy = vec_splats((int)gy); + vector int v_by = vec_splats((int)by); + + uintptr_t src_addr = (uintptr_t)src; + uintptr_t dst_addr = (uintptr_t)dst; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + for (i = 0; i < width_adj; i += 8) { + v_rd0 = vec_vsx_ld(0, (unsigned char *)src_addr); + v_rd1 = vec_vsx_ld(0, (unsigned char *)(src_addr + 16)); + src_addr += 24; + + for (j = 0; j < 2; j++) { + v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {2, 5, 8, 
11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0})); + + v_tmp_s = vec_unpackh((vector signed char)v_tmpb); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_b = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpg); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_g = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpr); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_r = vec_unpackh(v_tmp_s); + + vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1)); + vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7)); + vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + + v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b; + v_rslt += vec_sl(vec_splats((int)32), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); + + v_tmp_s = vec_pack(v_rslt, v_rslt); + v_dst = vec_sld(v_dst, v_tmp_s, 8); + } + v_dst = vec_sld(v_dst, v_dst, 8); + vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); + dst_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int b = src[i * 3 + 0]; + int g = src[i * 3 + 1]; + int r = src[i * 3 + 2]; + + dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6)); + } +} + +static void bgr24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) +{ + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + + int i, j, width_adj, frag_len; + + vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr; + + vector short v_tmp_s, v_dstu, v_dstv; + vector int v_r, v_g, v_b, v_rslt; + + 
vector int v_ru = vec_splats((int)ru); + vector int v_gu = vec_splats((int)gu); + vector int v_bu = vec_splats((int)bu); + + vector int v_rv = vec_splats((int)rv); + vector int v_gv = vec_splats((int)gv); + vector int v_bv = vec_splats((int)bv); + + uintptr_t src1_addr = (uintptr_t)src1; + uintptr_t dstu_addr = (uintptr_t)dstU; + uintptr_t dstv_addr = (uintptr_t)dstV; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + for (i = 0; i < width_adj; i += 8) { + v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr); + v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16)); + src1_addr += 24; + + for (j = 0; j < 2; j++) { + v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {2, 5, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0})); + + v_tmp_s = vec_unpackh((vector signed char)v_tmpb); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_b = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpg); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_g = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpr); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_r = vec_unpackh(v_tmp_s); + + vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1)); + vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7)); + vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + + v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = 
vec_sr(v_rslt, v_opr3); + + v_tmp_s = vec_pack(v_rslt, v_rslt); + v_dstu = vec_sld(v_dstu, v_tmp_s, 8); + + v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); + + v_tmp_s = vec_pack(v_rslt, v_rslt); + v_dstv = vec_sld(v_dstv, v_tmp_s, 8); + } + v_dstu = vec_sld(v_dstu, v_dstu, 8); + v_dstv = vec_sld(v_dstv, v_dstv, 8); + vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr); + vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr); + dstu_addr += 16; + dstv_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int b = src1[3 * i + 0]; + int g = src1[3 * i + 1]; + int r = src1[3 * i + 2]; + + dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); + dstV[i] = (rv*r + gv*g + bv*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); + } + av_assert1(src1 == src2); +} + +static void bgr24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) +{ + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + + int i, j, width_adj, frag_len; + + vector unsigned char v_rd0, v_rd1, v_rd2, v_tmpb[2], v_tmpg[2], v_tmpr[2]; + + vector short v_tmp_s[2], v_dstu, v_dstv; + vector int v_r, v_g, v_b, v_rslt; + + vector int v_ru = vec_splats((int)ru); + vector int v_gu = vec_splats((int)gu); + vector int v_bu = vec_splats((int)bu); + + vector int v_rv = vec_splats((int)rv); + vector int v_gv = vec_splats((int)gv); + vector int v_bv = vec_splats((int)bv); + + uintptr_t src1_addr = (uintptr_t)src1; + uintptr_t dstu_addr = (uintptr_t)dstU; + uintptr_t dstv_addr = (uintptr_t)dstV; + + // compute 
integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + for (i = 0; i < width_adj; i += 8) { + v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr); + v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16)); + v_rd2 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 32)); + src1_addr += 48; + + for (j = 0; j < 2; j++) { + v_tmpb[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {0, 6, 12, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpb[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {3, 9, 15, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {1, 7, 13, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {4, 10, 16, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpr[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {2, 8, 14, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpr[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {5, 11, 17, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_rd0 = vec_perm(v_rd1, v_rd2, ((vector unsigned char) + {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23})); + v_rd1 = vec_perm(v_rd2, v_rd2, ((vector unsigned char) + {8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpb[0]); + v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff)); + v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpb[1]); + v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff)); + v_b = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]); + + v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpg[0]); + v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff)); + v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpg[1]); + v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff)); + v_g = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]); + + v_tmp_s[0] = 
vec_unpackh((vector signed char)v_tmpr[0]); + v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff)); + v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpr[1]); + v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff)); + v_r = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]); + + vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT)); + vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-5)); + + v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); + + v_tmp_s[0] = vec_pack(v_rslt, v_rslt); + v_dstu = vec_sld(v_dstu, v_tmp_s[0], 8); + + v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); + + v_tmp_s[0] = vec_pack(v_rslt, v_rslt); + v_dstv = vec_sld(v_dstv, v_tmp_s[0], 8); + } + v_dstu = vec_sld(v_dstu, v_dstu, 8); + v_dstv = vec_sld(v_dstv, v_dstv, 8); + vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr); + vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr); + dstu_addr += 16; + dstv_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int b = src1[6 * i + 0] + src1[6 * i + 3]; + int g = src1[6 * i + 1] + src1[6 * i + 4]; + int r = src1[6 * i + 2] + src1[6 * i + 5]; + + dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); + dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); + } + av_assert1(src1 == src2); +} + +static void rgb24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, + int width, uint32_t *rgb2yuv) +{ +//START_TIMER; + int16_t *dst = (int16_t *)_dst; + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; + int
i, j, width_adj, frag_len; + + vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr; + + vector short v_tmp_s, v_dst; + vector int v_r, v_g, v_b, v_rslt; + + vector int v_ry = vec_splats((int)ry); + vector int v_gy = vec_splats((int)gy); + vector int v_by = vec_splats((int)by); + + uintptr_t src_addr = (uintptr_t)src; + uintptr_t dst_addr = (uintptr_t)dst; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + for (i = 0; i < width_adj; i += 8) { + v_rd0 = vec_vsx_ld(0, (unsigned char *)src_addr); + v_rd1 = vec_vsx_ld(0, (unsigned char *)(src_addr + 16)); + src_addr += 24; + + for (j = 0; j < 2; j++) { + v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {2, 5, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0})); + + v_tmp_s = vec_unpackh((vector signed char)v_tmpr); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_r = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpg); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_g = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpb); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_b = vec_unpackh(v_tmp_s); + + vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1)); + vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7)); + vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + + v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b; + v_rslt += vec_sl(vec_splats((int)32), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); 
+ + v_tmp_s = vec_pack(v_rslt, v_rslt); + v_dst = vec_sld(v_dst, v_tmp_s, 8); + } + v_dst = vec_sld(v_dst, v_dst, 8); + vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr); + dst_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int r = src[i * 3 + 0]; + int g = src[i * 3 + 1]; + int b = src[i * 3 + 2]; + + dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6)); + } +//STOP_TIMER("rgb24ToY_c_vsx"); +} + +static void rgb24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) +{ + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + + int i, j, width_adj, frag_len; + + vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr; + + vector short v_tmp_s, v_dstu, v_dstv; + vector int v_r, v_g, v_b, v_rslt; + + vector int v_ru = vec_splats((int)ru); + vector int v_gu = vec_splats((int)gu); + vector int v_bu = vec_splats((int)bu); + + vector int v_rv = vec_splats((int)rv); + vector int v_gv = vec_splats((int)gv); + vector int v_bv = vec_splats((int)bv); + + uintptr_t src1_addr = (uintptr_t)src1; + uintptr_t dstu_addr = (uintptr_t)dstU; + uintptr_t dstv_addr = (uintptr_t)dstV; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + av_assert1(src1 == src2); + for (i = 0; i < width_adj; i += 8) { + v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr); + v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16)); + src1_addr += 24; + + for (j = 0; j < 2; j++) { + v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned 
char) + {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char) + {2, 5, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0})); + + v_tmp_s = vec_unpackh((vector signed char)v_tmpr); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_r = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpg); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_g = vec_unpackh(v_tmp_s); + v_tmp_s = vec_unpackh((vector signed char)v_tmpb); + v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff)); + v_b = vec_unpackh(v_tmp_s); + + vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1)); + vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7)); + vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + + v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); + + v_tmp_s = vec_pack(v_rslt, v_rslt); + v_dstu = vec_sld(v_dstu, v_tmp_s, 8); + + v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); + + v_tmp_s = vec_pack(v_rslt, v_rslt); + v_dstv = vec_sld(v_dstv, v_tmp_s, 8); + } + v_dstu = vec_sld(v_dstu, v_dstu, 8); + v_dstv = vec_sld(v_dstv, v_dstv, 8); + vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr); + vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr); + dstu_addr += 16; + dstv_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int r = src1[3 * i + 0]; + int g = src1[3 * i + 1]; + int b = src1[3 * i + 2]; + + dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); + dstV[i] = (rv*r + gv*g + bv*b + 
(256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6); + } +} + +static void rgb24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1, + const uint8_t *src2, int width, uint32_t *rgb2yuv) +{ + int16_t *dstU = (int16_t *)_dstU; + int16_t *dstV = (int16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + + int i, j, width_adj, frag_len; + + vector unsigned char v_rd0, v_rd1, v_rd2, v_tmpb[2], v_tmpg[2], v_tmpr[2]; + + vector short v_tmp_s[2], v_dstu, v_dstv; + vector int v_r, v_g, v_b, v_rslt; + + vector int v_ru = vec_splats((int)ru); + vector int v_gu = vec_splats((int)gu); + vector int v_bu = vec_splats((int)bu); + + vector int v_rv = vec_splats((int)rv); + vector int v_gv = vec_splats((int)gv); + vector int v_bv = vec_splats((int)bv); + + uintptr_t src1_addr = (uintptr_t)src1; + uintptr_t dstu_addr = (uintptr_t)dstU; + uintptr_t dstv_addr = (uintptr_t)dstV; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + for (i = 0; i < width_adj; i += 8) { + v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr); + v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16)); + v_rd2 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 32)); + src1_addr += 48; + + av_assert1(src1 == src2); + for (j = 0; j < 2; j++) { + v_tmpr[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {0, 6, 12, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpr[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {3, 9, 15, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {1, 7, 13, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpg[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {4, 10, 16, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpb[0] = 
vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {2, 8, 14, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + v_tmpb[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char) + {5, 11, 17, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_rd0 = vec_perm(v_rd1, v_rd2, ((vector unsigned char) + {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23})); + v_rd1 = vec_perm(v_rd2, v_rd2, ((vector unsigned char) + {8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0})); + + v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpr[0]); + v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff)); + v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpr[1]); + v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff)); + v_r = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]); + + v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpg[0]); + v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff)); + v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpg[1]); + v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff)); + v_g = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]); + + v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpb[0]); + v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff)); + v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpb[1]); + v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff)); + v_b = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]); + + vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT)); + vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-5)); + + v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = vec_sr(v_rslt, v_opr3); + + v_tmp_s[0] = vec_pack(v_rslt, v_rslt); + v_dstu = vec_sld(v_dstu, v_tmp_s[0], 8); + + v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b; + v_rslt += vec_sl(vec_splats((int)256), v_opr1); + v_rslt += vec_sl(vec_splats((int)1), v_opr2); + v_rslt = 
vec_sr(v_rslt, v_opr3); + + v_tmp_s[0] = vec_pack(v_rslt, v_rslt); + v_dstv = vec_sld(v_dstv, v_tmp_s[0], 8); + } + v_dstu = vec_sld(v_dstu, v_dstu, 8); + v_dstv = vec_sld(v_dstv, v_dstv, 8); + vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr); + vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr); + dstu_addr += 16; + dstv_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int r = src1[6 * i + 0] + src1[6 * i + 3]; + int g = src1[6 * i + 1] + src1[6 * i + 4]; + int b = src1[6 * i + 2] + src1[6 * i + 5]; + + dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); + dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5); + } +} + +static void planar_rgb_to_y_vsx(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv) +{ + uint16_t *dst = (uint16_t *)_dst; + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; + int i, width_adj, frag_len; + + vector unsigned char v_rd[3]; + vector short v_din[3], v_dst; + vector int v_r, v_g, v_b, v_rslt; + vector unsigned v_opr1, v_opr2; + + uintptr_t src0_addr, src1_addr, src2_addr, dst_addr; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + vector int v_ry = vec_splats((int)ry); + vector int v_gy = vec_splats((int)gy); + vector int v_by = vec_splats((int)by); + + src0_addr = (uintptr_t)src[0]; + src1_addr = (uintptr_t)src[1]; + src2_addr = (uintptr_t)src[2]; + dst_addr = (uintptr_t)dst; + + v_opr1 = vec_splats((unsigned)0x801); + v_opr1 = vec_sl(v_opr1, vec_splats((unsigned)(RGB2YUV_SHIFT-7))); + v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + + for (i = 0; i < width_adj; i += 8) { + if (i & 0x0f) { + v_din[0] = vec_unpackl((vector signed char)v_rd[0]); + v_din[1] = vec_unpackl((vector signed char)v_rd[1]); + v_din[2] =
vec_unpackl((vector signed char)v_rd[2]); + } else { + v_rd[0] = vec_vsx_ld(0, (unsigned char *)src0_addr); + v_rd[1] = vec_vsx_ld(0, (unsigned char *)src1_addr); + v_rd[2] = vec_vsx_ld(0, (unsigned char *)src2_addr); + src0_addr += 16; + src1_addr += 16; + src2_addr += 16; + v_din[0] = vec_unpackh((vector signed char)v_rd[0]); + v_din[1] = vec_unpackh((vector signed char)v_rd[1]); + v_din[2] = vec_unpackh((vector signed char)v_rd[2]); + } + + v_din[0] = v_din[0] & vec_splats((short)0x00ff); + v_din[1] = v_din[1] & vec_splats((short)0x00ff); + v_din[2] = v_din[2] & vec_splats((short)0x00ff); + + v_g = vec_unpackh(v_din[0]); + v_b = vec_unpackh(v_din[1]); + v_r = vec_unpackh(v_din[2]); + + v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b; + v_rslt += v_opr1; + v_rslt = vec_sr(v_rslt, v_opr2); + v_dst = vec_sld(vec_pack(v_rslt, v_rslt), v_dst, 8); + + v_g = vec_unpackl(v_din[0]); + v_b = vec_unpackl(v_din[1]); + v_r = vec_unpackl(v_din[2]); + + v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b; + v_rslt += v_opr1; + v_rslt = vec_sr(v_rslt, v_opr2); + v_dst = vec_sld(vec_pack(v_rslt, v_rslt), v_dst, 8); + + vec_vsx_st(v_dst, 0, (short *)dst_addr); + dst_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int g = src[0][i]; + int b = src[1][i]; + int r = src[2][i]; + + dst[i] = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6); + } +} + +static void planar_rgb_to_a_vsx(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused) +{ + uint16_t *dst = (uint16_t *)_dst; + int i, width_adj, frag_len; + + vector unsigned char v_rd; + vector short v_din, v_dst; + + uintptr_t src_addr, dst_addr; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + src_addr = (uintptr_t)src[3]; + dst_addr = (uintptr_t)dst; + + for (i = 0; i < width_adj; i += 8) { + if (i & 0x0f) { + v_din = vec_unpackl((vector signed char)v_rd); + } else { + 
v_rd = vec_vsx_ld(0, (unsigned char *)src_addr); + v_din = vec_unpackh((vector signed char)v_rd); + src_addr += 16; + } + + v_dst = v_din & vec_splats((short)0x00ff); + v_dst = v_dst << vec_splats((unsigned short)6); + + vec_vsx_st(v_dst, 0, (short *)dst_addr); + dst_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) + dst[i] = src[3][i] << 6; +} + +static void planar_rgb_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv) +{ + uint16_t *dstU = (uint16_t *)_dstU; + uint16_t *dstV = (uint16_t *)_dstV; + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; + int i, width_adj, frag_len; + + vector unsigned char v_rd[3]; + vector short v_din[3], v_dstu, v_dstv; + vector int v_r, v_g, v_b, v_rslt; + vector unsigned v_opr1, v_opr2; + + uintptr_t src0_addr, src1_addr, src2_addr, dstu_addr, dstv_addr; + + // compute integral number of vector-length items and length of final fragment + width_adj = width >> 3; + width_adj = width_adj << 3; + frag_len = width - width_adj; + + vector int v_ru = vec_splats((int)ru); + vector int v_gu = vec_splats((int)gu); + vector int v_bu = vec_splats((int)bu); + + vector int v_rv = vec_splats((int)rv); + vector int v_gv = vec_splats((int)gv); + vector int v_bv = vec_splats((int)bv); + + src0_addr = (uintptr_t)src[0]; + src1_addr = (uintptr_t)src[1]; + src2_addr = (uintptr_t)src[2]; + dstu_addr = (uintptr_t)dstU; + dstv_addr = (uintptr_t)dstV; + + v_opr1 = vec_splats((unsigned)0x4001); + v_opr1 = vec_sl(v_opr1, vec_splats((unsigned)(RGB2YUV_SHIFT-7))); + v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6)); + + for (i = 0; i < width_adj; i += 8) { + if (i & 0x0f) { + v_din[0] = vec_unpackl((vector signed char)v_rd[0]); + v_din[1] = vec_unpackl((vector signed char)v_rd[1]); + v_din[2] = vec_unpackl((vector signed char)v_rd[2]); + } else { + v_rd[0] = vec_vsx_ld(0, (unsigned char 
*)src0_addr); + v_rd[1] = vec_vsx_ld(0, (unsigned char *)src1_addr); + v_rd[2] = vec_vsx_ld(0, (unsigned char *)src2_addr); + src0_addr += 16; + src1_addr += 16; + src2_addr += 16; + v_din[0] = vec_unpackh((vector signed char)v_rd[0]); + v_din[1] = vec_unpackh((vector signed char)v_rd[1]); + v_din[2] = vec_unpackh((vector signed char)v_rd[2]); + } + + v_din[0] = v_din[0] & vec_splats((short)0x00ff); + v_din[1] = v_din[1] & vec_splats((short)0x00ff); + v_din[2] = v_din[2] & vec_splats((short)0x00ff); + + v_g = vec_unpackh(v_din[0]); + v_b = vec_unpackh(v_din[1]); + v_r = vec_unpackh(v_din[2]); + + v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b; + v_rslt += v_opr1; + v_rslt = vec_sr(v_rslt, v_opr2); + v_dstu = vec_sld(vec_pack(v_rslt, v_rslt), v_dstu, 8); + + v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b; + v_rslt += v_opr1; + v_rslt = vec_sr(v_rslt, v_opr2); + v_dstv = vec_sld(vec_pack(v_rslt, v_rslt), v_dstv, 8); + + v_g = vec_unpackl(v_din[0]); + v_b = vec_unpackl(v_din[1]); + v_r = vec_unpackl(v_din[2]); + + v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b; + v_rslt += v_opr1; + v_rslt = vec_sr(v_rslt, v_opr2); + v_dstu = vec_sld(vec_pack(v_rslt, v_rslt), v_dstu, 8); + + v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b; + v_rslt += v_opr1; + v_rslt = vec_sr(v_rslt, v_opr2); + v_dstv = vec_sld(vec_pack(v_rslt, v_rslt), v_dstv, 8); + + vec_vsx_st(v_dstu, 0, (short *)dstu_addr); + vec_vsx_st(v_dstv, 0, (short *)dstv_addr); + dstu_addr += 16; + dstv_addr += 16; + } + + for (i = width_adj; i < width_adj + frag_len; i++) { + int g = src[0][i]; + int b = src[1][i]; + int r = src[2][i]; + + dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6); + dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6); + } +} + #endif /* HAVE_VSX */ av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c) @@ -404,9 +1376,38 @@ av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c) case AV_PIX_FMT_NV21: c->chrToYV12 = nv21ToUV_c_vsx; break; + case AV_PIX_FMT_GBRAP: 
+ case AV_PIX_FMT_GBRP: + c->readChrPlanar = planar_rgb_to_uv_vsx; + break; + } + + if (c->chrSrcHSubSample) { + switch (srcFormat) { + case AV_PIX_FMT_BGR24: + c->chrToYV12 = bgr24ToUV_half_c_vsx; + break; + case AV_PIX_FMT_RGB24: + c->chrToYV12 = rgb24ToUV_half_c_vsx; + break; + } + } else { + switch (srcFormat) { + case AV_PIX_FMT_BGR24: + c->chrToYV12 = bgr24ToUV_c_vsx; + break; + case AV_PIX_FMT_RGB24: + c->chrToYV12 = rgb24ToUV_c_vsx; + break; + } } switch (srcFormat) { + case AV_PIX_FMT_GBRAP: + c->readAlpPlanar = planar_rgb_to_a_vsx; + case AV_PIX_FMT_GBRP: + c->readLumPlanar = planar_rgb_to_y_vsx; + break; case AV_PIX_FMT_YUYV422: case AV_PIX_FMT_YVYU422: case AV_PIX_FMT_YA8: @@ -415,6 +1416,18 @@ av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c) case AV_PIX_FMT_UYVY422: c->lumToYV12 = uyvyToY_c_vsx; break; + case AV_PIX_FMT_BGR24: + c->lumToYV12 = bgr24ToY_c_vsx; + break; + case AV_PIX_FMT_RGB24: + c->lumToYV12 = rgb24ToY_c_vsx; + break; + case AV_PIX_FMT_MONOBLACK: + c->lumToYV12 = monoblack2Y_c_vsx; + break; + case AV_PIX_FMT_MONOWHITE: + c->lumToYV12 = monowhite2Y_c_vsx; + break; } if (c->needAlpha) { -- 2.7.4 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel