For ease of implementation, existing line-conversion functions for 24-bit formats write each byte individually. Optimize the performance by writing 4 pixels in 3 32-bit stores.
Signed-off-by: Thomas Zimmermann <tzimmerm...@suse.de> --- drivers/gpu/drm/drm_format_helper.c | 36 ++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_format_helper.c b/drivers/gpu/drm/drm_format_helper.c index a926aa6671fc..b9c9c712aa9c 100644 --- a/drivers/gpu/drm/drm_format_helper.c +++ b/drivers/gpu/drm/drm_format_helper.c @@ -274,10 +274,44 @@ static __always_inline void drm_fb_xfrm_line_32to24(void *dbuf, const void *sbuf unsigned int pixels, u32 (*xfrm_pixel)(u32)) { - u8 *dbuf8 = dbuf; + __le32 *dbuf32 = dbuf; + u8 *dbuf8; const __le32 *sbuf32 = sbuf; const __le32 *send32 = sbuf32 + pixels; + /* write pixels in chunks of 4 */ + send32 -= pixels & GENMASK(1, 0); + while (sbuf32 < send32) { + u32 val24[4] = { + xfrm_pixel(le32_to_cpup(sbuf32++)), + xfrm_pixel(le32_to_cpup(sbuf32++)), + xfrm_pixel(le32_to_cpup(sbuf32++)), + xfrm_pixel(le32_to_cpup(sbuf32++)), + }; + u32 out32[3] = { + /* write output bytes in reverse order for little endianness */ + ((val24[0] & 0x000000ff)) | + ((val24[0] & 0x0000ff00)) | + ((val24[0] & 0x00ff0000)) | + ((val24[1] & 0x000000ff) << 24), + ((val24[1] & 0x0000ff00) >> 8) | + ((val24[1] & 0x00ff0000) >> 8) | + ((val24[2] & 0x000000ff) << 16) | + ((val24[2] & 0x0000ff00) << 16), + ((val24[2] & 0x00ff0000) >> 16) | + ((val24[3] & 0x000000ff) << 8) | + ((val24[3] & 0x0000ff00) << 8) | + ((val24[3] & 0x00ff0000) << 8), + }; + + *dbuf32++ = cpu_to_le32(out32[0]); + *dbuf32++ = cpu_to_le32(out32[1]); + *dbuf32++ = cpu_to_le32(out32[2]); + } + send32 += pixels & GENMASK(1, 0); + + /* write trailing pixel */ + dbuf8 = (u8 __force *)dbuf32; while (sbuf32 < send32) { u32 val24 = xfrm_pixel(le32_to_cpup(sbuf32++)); /* write output in reverse order for little endianness */ -- 2.48.1