From 306d57d9c26da57d59405045a0878cb51e96f069 Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Fri, 24 Aug 2018 14:56:44 +0200
Subject: [PATCH 1/4] swscale/input : add bit_exact for float to uint16
 conversion

Reuse the float-to-uint16 function currently used inside the EXR decoder.

A test with a PSD file (grayf32be) converted to gray16be using bitexact
shows a speed improvement of around 20% (x86_64, clang, macOS 10.12).
---
 libswscale/input.c            | 48 +++++++++++++++++++++++++++++++++++++++----
 libswscale/swscale_internal.h | 20 ++++++++++++++++++
 2 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/libswscale/input.c b/libswscale/input.c
index 4099c19c2b..8ddb552b9c 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -954,6 +954,18 @@ static av_always_inline void grayf32ToY16_c(uint8_t *_dst, const uint8_t *_src,
     }
 }
 
+static av_always_inline void grayf32ToY16_bitexact_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                                                     const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    const uint32_t *src = (const uint32_t *)_src; /* float samples accessed as raw 32-bit patterns */
+    uint16_t *dst    = (uint16_t *)_dst;          /* one 16-bit output value per input sample */
+    /* Bit-exact variant: each of the width samples goes through integer-only flt_2_uint16(); unused1/unused2/unused are ignored. */
+    for (i = 0; i < width; ++i) {
+        dst[i] = flt_2_uint16(src[i]);
+    }
+}
+
 static av_always_inline void grayf32ToY16_bswap_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
                                                   const uint8_t *unused2, int width, uint32_t *unused)
 {
@@ -966,6 +978,18 @@ static av_always_inline void grayf32ToY16_bswap_c(uint8_t *_dst, const uint8_t *
     }
 }
 
+static av_always_inline void grayf32ToY16_bswap_bitexact_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                                                           const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    const uint32_t *src = (const uint32_t *)_src; /* float samples accessed as raw 32-bit patterns */
+    uint16_t *dst    = (uint16_t *)_dst;          /* one 16-bit output value per input sample */
+    /* Bit-exact variant for byte-swapped input: av_bswap32() each word, then integer-only flt_2_uint16(); unused args are ignored. */
+    for (i = 0; i < width; ++i){
+        dst[i] = flt_2_uint16(av_bswap32(src[i]));
+    }
+}
+
 #define rgb9plus_planar_funcs_endian(nbits, endian_name, endian)                                    \
 static void planar_rgb##nbits##endian_name##_to_y(uint8_t *dst, const uint8_t *src[4],              \
                                                   int w, int32_t *rgb2yuv)                          \
@@ -1564,16 +1588,32 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         break;
     case AV_PIX_FMT_GRAYF32LE:
 #if HAVE_BIGENDIAN
-        c->lumToYV12 = grayf32ToY16_bswap_c;
+        if (c->flags & SWS_BITEXACT) {
+            c->lumToYV12 = grayf32ToY16_bswap_bitexact_c;
+        } else {
+            c->lumToYV12 = grayf32ToY16_bswap_c;
+        }
 #else
-        c->lumToYV12 = grayf32ToY16_c;
+        if (c->flags & SWS_BITEXACT) {
+            c->lumToYV12 = grayf32ToY16_bitexact_c;
+        } else {
+            c->lumToYV12 = grayf32ToY16_c;
+        }
 #endif
         break;
     case AV_PIX_FMT_GRAYF32BE:
 #if HAVE_BIGENDIAN
-        c->lumToYV12 = grayf32ToY16_c;
+        if (c->flags & SWS_BITEXACT) {
+            c->lumToYV12 = grayf32ToY16_bitexact_c;
+        } else {
+            c->lumToYV12 = grayf32ToY16_c;
+        }
 #else
-        c->lumToYV12 = grayf32ToY16_bswap_c;
+        if (c->flags & SWS_BITEXACT) {
+            c->lumToYV12 = grayf32ToY16_bswap_bitexact_c;
+        } else {
+            c->lumToYV12 = grayf32ToY16_bswap_c;
+        }
 #endif
         break;
     }
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 4fa59386a6..3bbb486b4d 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -922,6 +922,26 @@ static inline void fillPlane16(uint8_t *plane, int stride, int width, int height
     }
 }
 
+/**
+ * Convert the bit pattern of a 32-bit float to uint16_t using integer operations only.
+ *
+ * @param v raw bits of a 32-bit float, passed as int32_t (sign bit ends up in the int sign)
+ *
+ * @return 0 for negative or very small inputs, 0xffff when the biased exponent is >= 127, otherwise the truncated scaled value
+ */
+static inline uint16_t flt_2_uint16(int32_t v)
+{
+    int32_t exp = v >> 23; // sign + biased exponent; relies on >> of a negative value being arithmetic (implementation-defined in C)
+    // "HACK": negative values result in exp < 0, so clipping them to 0
+    // is also handled by this condition, avoids explicit check for sign bit.
+    if (exp <= 127 + 7 - 24) // we would shift out all bits anyway
+        return 0;
+    if (exp >= 127) // saturate: with a bias of 127 this covers values >= 1.0 (assumes IEEE 754 single layout)
+        return 0xffff;
+    v &= 0x007fffff; // keep only the low 23 (mantissa) bits
+    return (v + (1 << 23)) >> (127 + 7 - exp); // add the implicit leading 1, then shift down (truncates, no rounding)
+}
+
 #define MAX_SLICE_PLANES 4
 
 /// Slice plane
-- 
2.14.3 (Apple Git-98)