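Use SSE/SSE2 intrinsics for the bar drawing code. Output is bit-exact on x86_64.

bar_time (two timings per pixel format; presumably without and with this patch):
    rgb24:   12.601s  6.492s
    yuv444p: 14.495s  5.661s
    yuv422p: 10.514s  3.953s
    yuv420p:  8.795s  3.256s
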
Signed-off-by: Muhammad Faiz <mfc...@gmail.com> --- libavfilter/avf_showcqt.c | 20 ++- libavfilter/avf_showcqt.h | 2 + libavfilter/x86/avf_showcqt.c | 300 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 318 insertions(+), 4 deletions(-) diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c index 2d2644c..2528c0f 100644 --- a/libavfilter/avf_showcqt.c +++ b/libavfilter/avf_showcqt.c @@ -137,6 +137,7 @@ static void common_uninit(ShowCQTContext *s) av_freep(&s->fft_result); av_freep(&s->cqt_result); av_freep(&s->c_buf); + av_freep(&s->c_bar_buf); av_freep(&s->h_buf); av_freep(&s->rcp_h_buf); av_freep(&s->freq); @@ -1024,7 +1025,12 @@ static int plot_cqt(AVFilterContext *ctx, AVFrame **frameout) UPDATE_TIME(s->alloc_time); if (s->bar_h) { - s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_buf, s->bar_h); + if (s->permute_color_bar) { + s->permute_color_bar(s->c_bar_buf, s->c_buf, s->width); + s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_bar_buf, s->bar_h); + } else { + s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_buf, s->bar_h); + } UPDATE_TIME(s->bar_time); } @@ -1228,12 +1234,18 @@ static int config_output(AVFilterLink *outlink) return AVERROR(ENOMEM); } - s->h_buf = av_malloc_array(s->cqt_len, sizeof (*s->h_buf)); - s->rcp_h_buf = av_malloc_array(s->width, sizeof(*s->rcp_h_buf)); - s->c_buf = av_malloc_array(s->width, sizeof(*s->c_buf)); + s->h_buf = av_calloc(FFALIGN(s->cqt_len, 32), sizeof (*s->h_buf)); + s->rcp_h_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->rcp_h_buf)); + s->c_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->c_buf)); if (!s->h_buf || !s->rcp_h_buf || !s->c_buf) return AVERROR(ENOMEM); + if (s->permute_color_bar) { + s->c_bar_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->c_bar_buf)); + if (!s->c_bar_buf) + return AVERROR(ENOMEM); + } + s->sono_count = 0; s->next_pts = 0; s->sono_idx = 0; diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h index d01d90a..9de60f3 100644 --- a/libavfilter/avf_showcqt.h 
+++ b/libavfilter/avf_showcqt.h @@ -67,6 +67,7 @@ typedef struct { int cqt_len; int cqt_align; ColorFloat *c_buf; + ColorFloat *c_bar_buf; float *h_buf; float *rcp_h_buf; float *sono_v_buf; @@ -81,6 +82,7 @@ typedef struct { void (*update_sono)(AVFrame *sono, const ColorFloat *c, int idx); /* permute callback, for easier SIMD code */ void (*permute_coeffs)(float *val, int len); + void (*permute_color_bar)(ColorFloat *out, const ColorFloat *in, int len); /* performance debugging */ int64_t fft_time; int64_t cqt_time; diff --git a/libavfilter/x86/avf_showcqt.c b/libavfilter/x86/avf_showcqt.c index b8e9d32..d0c90a5 100644 --- a/libavfilter/x86/avf_showcqt.c +++ b/libavfilter/x86/avf_showcqt.c @@ -27,6 +27,10 @@ #include <xmmintrin.h> #endif +#if HAVE_SSE2_INTRINSIC +#include <emmintrin.h> +#endif + #if HAVE_SSE3_INTRINSIC #include <pmmintrin.h> #endif @@ -259,6 +263,282 @@ static void permute_coeffs_avx(float *v, int len) } #endif +#if HAVE_SSE2_INTRINSIC +static av_intrinsic_sse2 +void draw_bar_rgb24_sse2(AVFrame *out, const float *h, const float *rcp_h, + const ColorFloat *color, int bar_h) +{ + const float *c; + int x, y, w = out->width; + float rcp_bar_h = 1.0f / bar_h; + uint8_t *v = out->data[0]; + uint8_t *lp; + int ls = out->linesize[0]; + __m128i is_le, rri, rgi, rbi; + __m128 hx, ht, mul, rr, rg, rb; + uint32_t red, green, blue; + + for (y = 0; y < bar_h; y++) { + lp = v + ls * y; + ht = _mm_set1_ps((bar_h - y) * rcp_bar_h); + x = 0; + c = (const float *) color; + do { + hx = _mm_load_ps(h+x); + is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); + is_le = _mm_packs_epi32(is_le, is_le); + is_le = _mm_packs_epi16(is_le, is_le); + if (-1 == _mm_cvtsi128_si32(is_le)) { + memset(lp, 0, 12); + } else { + mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); + mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); + rr = _mm_mul_ps(mul, _mm_load_ps(c)); + rg = _mm_mul_ps(mul, _mm_load_ps(c+4)); + rb = _mm_mul_ps(mul, _mm_load_ps(c+8)); + rri = _mm_cvtps_epi32(rr); + rgi = 
_mm_cvtps_epi32(rg); + rbi = _mm_cvtps_epi32(rb); + rri = _mm_packs_epi32(rri, rri); + rgi = _mm_packs_epi32(rgi, rgi); + rbi = _mm_packs_epi32(rbi, rbi); + rri = _mm_packus_epi16(rri, rri); + rgi = _mm_packus_epi16(rgi, rgi); + rbi = _mm_packus_epi16(rbi, rbi); + red = _mm_cvtsi128_si32(rri); + green = _mm_cvtsi128_si32(rgi); + blue = _mm_cvtsi128_si32(rbi); + lp[0] = red; lp[1] = green; lp[2] = blue; + red >>= 8; green >>= 8; blue >>= 8; + lp[3] = red; lp[4] = green; lp[5] = blue; + red >>= 8; green >>= 8; blue >>= 8; + lp[6] = red; lp[7] = green; lp[8] = blue; + red >>= 8; green >>= 8; blue >>= 8; + lp[9] = red; lp[10] = green; lp[11] = blue; + } + lp += 12; + x += 4; c += 12; + } while (x < w); + } +} + +static av_intrinsic_sse2 +void draw_bar_yuv_sse2(AVFrame *out, const float *h, const float *rcp_h, + const ColorFloat *color, int bar_h) +{ + const float *c; + int x, y, yh, w = out->width; + float rcp_bar_h = 1.0f / bar_h; + uint8_t *vy = out->data[0], *vu = out->data[1], *vv = out->data[2]; + uint8_t *lpy, *lpu, *lpv; + int lsy = out->linesize[0], lsu = out->linesize[1], lsv = out->linesize[2]; + int fmt = out->format; + __m128i is_le, ryi, ryi2, rui, rvi; + __m128 ht, hx, hx2, mul, mul2, mul3, ry, ry2, ru, rv; + +#define DRAW_BAR_LINE_FULL_CHROMA() \ +do { \ + x = 0; \ + c = (const float *) color; \ + do { \ + hx = _mm_load_ps(h+x); \ + is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \ + is_le = _mm_packs_epi32(is_le, is_le); \ + is_le = _mm_packs_epi16(is_le, is_le); \ + if (-1 == _mm_cvtsi128_si32(is_le)) { \ + *((uint32_t *) lpy) = 0x10101010; \ + *((uint32_t *) lpu) = 0x80808080; \ + *((uint32_t *) lpv) = 0x80808080; \ + } else { \ + mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); \ + mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \ + ry = _mm_mul_ps(mul, _mm_load_ps(c)); \ + ru = _mm_mul_ps(mul, _mm_load_ps(c+4)); \ + rv = _mm_mul_ps(mul, _mm_load_ps(c+8)); \ + ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \ + ru = _mm_add_ps(ru, 
_mm_set1_ps(128.0f)); \ + rv = _mm_add_ps(rv, _mm_set1_ps(128.0f)); \ + ryi = _mm_cvtps_epi32(ry); \ + rui = _mm_cvtps_epi32(ru); \ + rvi = _mm_cvtps_epi32(rv); \ + ryi = _mm_packs_epi32(ryi, ryi); \ + rui = _mm_packs_epi32(rui, rui); \ + rvi = _mm_packs_epi32(rvi, rvi); \ + ryi = _mm_packus_epi16(ryi, ryi); \ + rui = _mm_packus_epi16(rui, rui); \ + rvi = _mm_packus_epi16(rvi, rvi); \ + *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \ + *((int32_t *) lpu) = _mm_cvtsi128_si32(rui); \ + *((int32_t *) lpv) = _mm_cvtsi128_si32(rvi); \ + } \ + lpy += 4; lpu += 4; lpv += 4; \ + x += 4; c += 12; \ + } while (x < w); \ +} while (0) + +#define DRAW_BAR_LINE_HALF_CHROMA() \ +do { \ + x = 0; \ + c = (const float *) color; \ + do { \ + hx = _mm_load_ps(h+x); \ + hx2 = _mm_load_ps(h+x+4); \ + is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \ + is_le = _mm_packs_epi32(is_le, _mm_castps_si128(_mm_cmple_ps(hx2, ht))); \ + is_le = _mm_packs_epi16(is_le, is_le); \ + is_le = _mm_packs_epi16(is_le, is_le); \ + if (-1 == _mm_cvtsi128_si32(is_le)) { \ + *((uint32_t *) lpy) = 0x10101010; \ + *((uint32_t *) (lpy+4)) = 0x10101010; \ + *((uint32_t *) lpu) = 0x80808080; \ + *((uint32_t *) lpv) = 0x80808080; \ + } else { \ + mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); \ + mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \ + mul2 = _mm_max_ps(_mm_sub_ps(hx2, ht), _mm_setzero_ps()); \ + mul2 = _mm_mul_ps(mul2, _mm_load_ps(rcp_h + x + 4)); \ + mul3 = _mm_shuffle_ps(mul, mul2, _MM_SHUFFLE(2,0,2,0)); \ + ry = _mm_mul_ps(mul, _mm_load_ps(c)); \ + ry2 = _mm_mul_ps(mul2, _mm_load_ps(c+8)); \ + ru = _mm_mul_ps(mul3, _mm_load_ps(c+4)); \ + rv = _mm_mul_ps(mul3, _mm_load_ps(c+12)); \ + ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \ + ry2 = _mm_add_ps(ry2, _mm_set1_ps(16.0f)); \ + ru = _mm_add_ps(ru, _mm_set1_ps(128.0f)); \ + rv = _mm_add_ps(rv, _mm_set1_ps(128.0f)); \ + ryi = _mm_cvtps_epi32(ry); \ + ryi2 = _mm_cvtps_epi32(ry2); \ + rui = _mm_cvtps_epi32(ru); \ + rvi = _mm_cvtps_epi32(rv); \ + ryi = 
_mm_packs_epi32(ryi, ryi); \ + ryi2 = _mm_packs_epi32(ryi2, ryi2); \ + rui = _mm_packs_epi32(rui, rui); \ + rvi = _mm_packs_epi32(rvi, rvi); \ + ryi = _mm_packus_epi16(ryi, ryi); \ + ryi2 = _mm_packus_epi16(ryi2, ryi2); \ + rui = _mm_packus_epi16(rui, rui); \ + rvi = _mm_packus_epi16(rvi, rvi); \ + *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \ + *((int32_t *) (lpy+4)) = _mm_cvtsi128_si32(ryi2); \ + *((int32_t *) lpu) = _mm_cvtsi128_si32(rui); \ + *((int32_t *) lpv) = _mm_cvtsi128_si32(rvi); \ + } \ + lpy += 8; lpu += 4; lpv += 4; \ + x += 8; c += 16; \ + } while (x < w); \ +} while (0) + +#define DRAW_BAR_LINE_NO_CHROMA() \ +do { \ + x = 0; \ + c = (const float *) color; \ + do { \ + hx = _mm_load_ps(h+x); \ + hx2 = _mm_load_ps(h+x+4); \ + is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \ + is_le = _mm_packs_epi32(is_le, _mm_castps_si128(_mm_cmple_ps(hx2, ht))); \ + is_le = _mm_packs_epi16(is_le, is_le); \ + is_le = _mm_packs_epi16(is_le, is_le); \ + if (-1 == _mm_cvtsi128_si32(is_le)) { \ + *((uint32_t *) lpy) = 0x10101010; \ + *((uint32_t *) (lpy+4)) = 0x10101010; \ + } else { \ + mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); \ + mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \ + mul2 = _mm_max_ps(_mm_sub_ps(hx2, ht), _mm_setzero_ps()); \ + mul2 = _mm_mul_ps(mul2, _mm_load_ps(rcp_h + x + 4)); \ + ry = _mm_mul_ps(mul, _mm_load_ps(c)); \ + ry2 = _mm_mul_ps(mul2, _mm_load_ps(c+8)); \ + ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \ + ry2 = _mm_add_ps(ry2, _mm_set1_ps(16.0f)); \ + ryi = _mm_cvtps_epi32(ry); \ + ryi2 = _mm_cvtps_epi32(ry2); \ + ryi = _mm_packs_epi32(ryi, ryi); \ + ryi2 = _mm_packs_epi32(ryi2, ryi2); \ + ryi = _mm_packus_epi16(ryi, ryi); \ + ryi2 = _mm_packus_epi16(ryi2, ryi2); \ + *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \ + *((int32_t *) (lpy+4)) = _mm_cvtsi128_si32(ryi2); \ + } \ + lpy += 8; \ + x += 8; c += 16; \ + } while (x < w); \ +} while (0) + + for (y = 0; y < bar_h; y += 2) { + yh = (fmt == AV_PIX_FMT_YUV420P) ? 
y / 2 : y; + ht = _mm_set1_ps((bar_h - y) * rcp_bar_h); + lpy = vy + y * lsy; + lpu = vu + yh * lsu; + lpv = vv + yh * lsv; + if (fmt == AV_PIX_FMT_YUV444P) + DRAW_BAR_LINE_FULL_CHROMA(); + else + DRAW_BAR_LINE_HALF_CHROMA(); + + ht = _mm_set1_ps((bar_h - (y+1)) * rcp_bar_h); + lpy = vy + (y+1) * lsy; + lpu = vu + (y+1) * lsu; + lpv = vv + (y+1) * lsv; + if (fmt == AV_PIX_FMT_YUV444P) + DRAW_BAR_LINE_FULL_CHROMA(); + else if (fmt == AV_PIX_FMT_YUV422P) + DRAW_BAR_LINE_HALF_CHROMA(); + else + DRAW_BAR_LINE_NO_CHROMA(); + } +#undef DRAW_BAR_LINE_FULL_CHROMA +#undef DRAW_BAR_LINE_HALF_CHROMA +#undef DRAW_BAR_LINE_NO_CHROMA +} + +static void permute_color_bar_full_chroma_sse2(ColorFloat *out, const ColorFloat *in, int len) +{ + float *c = (float *) out; + int k; + + for (k = 0; k < len; k += 4, c += 12) { + c[0] = in[k].yuv.y; + c[1] = in[k+1].yuv.y; + c[2] = in[k+2].yuv.y; + c[3] = in[k+3].yuv.y; + c[4] = in[k].yuv.u; + c[5] = in[k+1].yuv.u; + c[6] = in[k+2].yuv.u; + c[7] = in[k+3].yuv.u; + c[8] = in[k].yuv.v; + c[9] = in[k+1].yuv.v; + c[10] = in[k+2].yuv.v; + c[11] = in[k+3].yuv.v; + } +} + +static void permute_color_bar_half_chroma_sse2(ColorFloat *out, const ColorFloat *in, int len) +{ + float *c = (float *) out; + int k; + + for (k = 0; k < len; k += 8, c += 16) { + c[0] = in[k].yuv.y; + c[1] = in[k+1].yuv.y; + c[2] = in[k+2].yuv.y; + c[3] = in[k+3].yuv.y; + c[8] = in[k+4].yuv.y; + c[9] = in[k+5].yuv.y; + c[10] = in[k+6].yuv.y; + c[11] = in[k+7].yuv.y; + c[4] = in[k].yuv.u; + c[5] = in[k+2].yuv.u; + c[6] = in[k+4].yuv.u; + c[7] = in[k+6].yuv.u; + c[12] = in[k].yuv.v; + c[13] = in[k+2].yuv.v; + c[14] = in[k+4].yuv.v; + c[15] = in[k+6].yuv.v; + } +} +#endif + av_cold void ff_showcqt_init_x86(ShowCQTContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -286,4 +566,24 @@ av_cold void ff_showcqt_init_x86(ShowCQTContext *s) s->cqt_align = 8; } #endif + +#if HAVE_SSE2_INTRINSIC + if (cpu_flags & AV_CPU_FLAG_SSE2) { + switch (s->format) { + case AV_PIX_FMT_RGB24: + 
s->permute_color_bar = permute_color_bar_full_chroma_sse2; + s->draw_bar = draw_bar_rgb24_sse2; + break; + case AV_PIX_FMT_YUV444P: + s->permute_color_bar = permute_color_bar_full_chroma_sse2; + s->draw_bar = draw_bar_yuv_sse2; + break; + case AV_PIX_FMT_YUV422P: + case AV_PIX_FMT_YUV420P: + s->permute_color_bar = permute_color_bar_half_chroma_sse2; + s->draw_bar = draw_bar_yuv_sse2; + break; + } + } +#endif } -- 2.5.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel