[FFmpeg-devel] [PATCH 2/5] wmalossless: allow calling madd_int16

Christophe Gisquet Sat, 30 Apr 2016 11:30:07 -0700

This is done by handling the cascaded LMS data as if it were int16_t
when the stream has at most 16 bits per sample, which requires
switching the computations at various locations.
---
 libavcodec/wmalosslessdec.c | 146 +++++++++++++++++++++++++-------------------
 1 file changed, 84 insertions(+), 62 deletions(-)


diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c
index 9d56d97..f3a2217 100644
--- a/libavcodec/wmalosslessdec.c
+++ b/libavcodec/wmalosslessdec.c
@@ -147,9 +147,9 @@ typedef struct WmallDecodeCtx {
         int scaling;
         int coefsend;
         int bitsend;
-        DECLARE_ALIGNED(16, int32_t, coefs)[MAX_ORDER + 
WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
-        DECLARE_ALIGNED(16, int32_t, lms_prevvalues)[MAX_ORDER * 2 + 
WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
-        DECLARE_ALIGNED(16, int32_t, lms_updates)[MAX_ORDER * 2 + 
WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
+        DECLARE_ALIGNED(16, int32_t, coefs)[MAX_ORDER + 
WMALL_COEFF_PAD_SIZE/sizeof(int32_t)];
+        DECLARE_ALIGNED(16, int32_t, lms_prevvalues)[MAX_ORDER * 2 + 
WMALL_COEFF_PAD_SIZE/sizeof(int32_t)];
+        DECLARE_ALIGNED(16, int32_t, lms_updates)[MAX_ORDER * 2 + 
WMALL_COEFF_PAD_SIZE/sizeof(int32_t)];
         int recent;
     } cdlms[WMALL_MAX_CHANNELS][9];
 
@@ -458,6 +458,7 @@ static int decode_cdlms(WmallDecodeCtx *s)
     int cdlms_send_coef = get_bits1(&s->gb);
 
     for (c = 0; c < s->num_channels; c++) {
+        int shift = s->bits_per_sample > 16 ? 0 : 1;
         s->cdlms_ttl[c] = get_bits(&s->gb, 3) + 1;
         for (i = 0; i < s->cdlms_ttl[c]; i++) {
             s->cdlms[c][i].order = (get_bits(&s->gb, 7) + 1) * 8;
@@ -495,14 +496,20 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 s->cdlms[c][i].bitsend = get_bitsz(&s->gb, cbits) + 2;
                 shift_l = 32 - s->cdlms[c][i].bitsend;
                 shift_r = 32 - s->cdlms[c][i].scaling - 2;
+                if (s->bits_per_sample > 16) {
                 for (j = 0; j < s->cdlms[c][i].coefsend; j++)
                     s->cdlms[c][i].coefs[j] =
                         (get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) 
>> shift_r;
+                } else {
+                    int16_t *ptr = (int16_t*)s->cdlms[c][i].coefs;
+                    for (j = 0; j < s->cdlms[c][i].coefsend; j++)
+                        ptr[j] = (get_bits(&s->gb, s->cdlms[c][i].bitsend) << 
shift_l) >> shift_r;
+                }
             }
         }
 
         for (i = 0; i < s->cdlms_ttl[c]; i++)
-            memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order,
+            memset(s->cdlms[c][i].coefs + (s->cdlms[c][i].order >> shift),
                    0, WMALL_COEFF_PAD_SIZE);
     }
 
@@ -694,32 +701,6 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size)
     }
 }
 
-static void lms_update(WmallDecodeCtx *s, int ich, int ilms, int input)
-{
-    int recent = s->cdlms[ich][ilms].recent;
-    int range  = 1 << s->bits_per_sample - 1;
-    int order  = s->cdlms[ich][ilms].order;
-
-    if (recent)
-        recent--;
-    else {
-        memcpy(s->cdlms[ich][ilms].lms_prevvalues + order,
-               s->cdlms[ich][ilms].lms_prevvalues, 
sizeof(*s->cdlms[ich][ilms].lms_prevvalues) * order);
-        memcpy(s->cdlms[ich][ilms].lms_updates + order,
-               s->cdlms[ich][ilms].lms_updates, 
sizeof(*s->cdlms[ich][ilms].lms_updates) * order);
-        recent = order - 1;
-    }
-
-    s->cdlms[ich][ilms].lms_prevvalues[recent] = av_clip(input, -range, range 
- 1);
-    s->cdlms[ich][ilms].lms_updates[recent] = WMASIGN(input) * 
s->update_speed[ich];
-
-    s->cdlms[ich][ilms].lms_updates[recent + (order >> 4)] >>= 2;
-    s->cdlms[ich][ilms].lms_updates[recent + (order >> 3)] >>= 1;
-    s->cdlms[ich][ilms].recent = recent;
-    memset(s->cdlms[ich][ilms].lms_updates + recent + order, 0,
-           sizeof(s->cdlms[ich][ilms].lms_updates) - 4*(recent+order));
-}
-
 static void use_high_update_speed(WmallDecodeCtx *s, int ich)
 {
     int ilms, recent, icoef;
@@ -727,12 +708,16 @@ static void use_high_update_speed(WmallDecodeCtx *s, int 
ich)
         recent = s->cdlms[ich][ilms].recent;
         if (s->update_speed[ich] == 16)
             continue;
-        if (s->bV3RTM) {
+        if (s->bits_per_sample > 16) {
+            int32_t *updates = s->cdlms[ich][ilms].lms_updates;
+            if (s->bV3RTM) updates += recent;
             for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-                s->cdlms[ich][ilms].lms_updates[icoef + recent] *= 2;
+                updates[icoef] *= 2;
         } else {
+            int16_t *updates = (int16_t *)s->cdlms[ich][ilms].lms_updates;
+            if (s->bV3RTM) updates += recent;
             for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-                s->cdlms[ich][ilms].lms_updates[icoef] *= 2;
+                updates[icoef] *= 2;
         }
     }
     s->update_speed[ich] = 16;
@@ -745,42 +730,76 @@ static void use_normal_update_speed(WmallDecodeCtx *s, 
int ich)
         recent = s->cdlms[ich][ilms].recent;
         if (s->update_speed[ich] == 8)
             continue;
-        if (s->bV3RTM)
+        if (s->bits_per_sample > 16) {
+            int32_t *updates = s->cdlms[ich][ilms].lms_updates;
+            if (s->bV3RTM) updates += recent;
             for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-                s->cdlms[ich][ilms].lms_updates[icoef + recent] /= 2;
-        else
+                updates[icoef] /= 2;
+        } else {
+            int16_t *updates = (int16_t *)s->cdlms[ich][ilms].lms_updates;
+            if (s->bV3RTM) updates += recent;
             for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-                s->cdlms[ich][ilms].lms_updates[icoef] /= 2;
+                updates[icoef] /= 2;
+        }
     }
     s->update_speed[ich] = 8;
 }
 
-static void revert_cdlms(WmallDecodeCtx *s, int ch,
-                         int coef_begin, int coef_end)
-{
-    int icoef, pred, ilms, num_lms, residue, input;
-
-    num_lms = s->cdlms_ttl[ch];
-    for (ilms = num_lms - 1; ilms >= 0; ilms--) {
-        for (icoef = coef_begin; icoef < coef_end; icoef++) {
-            pred = 1 << (s->cdlms[ch][ilms].scaling - 1);
-            residue = s->channel_residues[ch][icoef];
-            pred += 
s->dsp.scalarproduct_and_madd_int32(s->cdlms[ch][ilms].coefs,
-                                                        
s->cdlms[ch][ilms].lms_prevvalues
-                                                            + 
s->cdlms[ch][ilms].recent,
-                                                        
s->cdlms[ch][ilms].lms_updates
-                                                            + 
s->cdlms[ch][ilms].recent,
-                                                        
FFALIGN(s->cdlms[ch][ilms].order,
-                                                                
WMALL_COEFF_PAD_SIZE),
-                                                        WMASIGN(residue));
-            input = residue + (pred >> s->cdlms[ch][ilms].scaling);
-            lms_update(s, ch, ilms, input);
-            s->channel_residues[ch][icoef] = input;
-        }
-    }
-    emms_c();
+#define CD_LMS(bits, ROUND) \
+static void lms_update ## bits (WmallDecodeCtx *s, int ich, int ilms, int 
input) \
+{ \
+    int recent = s->cdlms[ich][ilms].recent; \
+    int range  = 1 << s->bits_per_sample - 1; \
+    int order  = s->cdlms[ich][ilms].order; \
+    int ##bits##_t *prev = (int##bits##_t 
*)s->cdlms[ich][ilms].lms_prevvalues; \
+    int ##bits##_t *upd = (int##bits##_t *)s->cdlms[ich][ilms].lms_updates; \
+ \
+    if (recent) \
+        recent--; \
+    else { \
+        memcpy(prev + order, prev, (bits/8) * order); \
+        memcpy(upd + order, upd, (bits/8) * order); \
+        recent = order - 1; \
+    } \
+ \
+    prev[recent] = av_clip(input, -range, range - 1); \
+    upd[recent] = WMASIGN(input) * s->update_speed[ich]; \
+ \
+    upd[recent + (order >> 4)] >>= 2; \
+    upd[recent + (order >> 3)] >>= 1; \
+    s->cdlms[ich][ilms].recent = recent; \
+    memset(upd + recent + order, 0, (bits/8)*(MAX_ORDER * 2 - recent - 
order)); \
+} \
+ \
+static void revert_cdlms ## bits (WmallDecodeCtx *s, int ch, \
+                                  int coef_begin, int coef_end) \
+{ \
+    int icoef, pred, ilms, num_lms, residue, input; \
+ \
+    num_lms = s->cdlms_ttl[ch]; \
+    for (ilms = num_lms - 1; ilms >= 0; ilms--) { \
+        for (icoef = coef_begin; icoef < coef_end; icoef++) { \
+            int##bits##_t *coeffs = (int##bits##_t *)s->cdlms[ch][ilms].coefs; 
\
+            int##bits##_t *prevvalues = (int##bits##_t 
*)s->cdlms[ch][ilms].lms_prevvalues; \
+            int##bits##_t *updates = (int##bits##_t 
*)s->cdlms[ch][ilms].lms_updates; \
+            pred = 1 << (s->cdlms[ch][ilms].scaling - 1); \
+            residue = s->channel_residues[ch][icoef]; \
+            pred += s->dsp.scalarproduct_and_madd_int## bits (coeffs, \
+                                                        prevvalues + 
s->cdlms[ch][ilms].recent, \
+                                                        updates + 
s->cdlms[ch][ilms].recent, \
+                                                        
FFALIGN(s->cdlms[ch][ilms].order, ROUND), \
+                                                        WMASIGN(residue)); \
+            input = residue + (pred >> s->cdlms[ch][ilms].scaling); \
+            lms_update ## bits(s, ch, ilms, input); \
+            s->channel_residues[ch][icoef] = input; \
+        } \
+    } \
+    if (bits <= 16) emms_c(); \
 }
 
+CD_LMS(16, WMALL_COEFF_PAD_SIZE)
+CD_LMS(32, 8)
+
 static void revert_inter_ch_decorr(WmallDecodeCtx *s, int tile_size)
 {
     if (s->num_channels != 2)
@@ -953,7 +972,10 @@ static int decode_subframe(WmallDecodeCtx *s)
                     use_high_update_speed(s, i);
                 else
                     use_normal_update_speed(s, i);
-                revert_cdlms(s, i, 0, subframe_len);
+                if (s->bits_per_sample > 16)
+                    revert_cdlms32(s, i, 0, subframe_len);
+                else
+                    revert_cdlms16(s, i, 0, subframe_len);
             } else {
                 memset(s->channel_residues[i], 0, 
sizeof(**s->channel_residues) * subframe_len);
             }
-- 
2.8.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

[FFmpeg-devel] [PATCH 2/5] wmalossless: allow calling madd_int16

Reply via email to