Looks good. Thanks for the update. Jose
----- Original Message ----- > From: Roland Scheidegger <srol...@vmware.com> > > srgb-to-linear is using 3rd degree polynomial for now which should be _just_ > good enough. Reverse is using some rational polynomials and is quite > accurate, > though not hooked into llvmpipe's blend code yet and hence unused (untested). > Using a table might also be an option (for srgb-to-linear especially). > This does not enable any new features yet because EXT_texture_srgb was > already > supported via util_format fallbacks, but performance was lacking probably due > to the external function call (the table used by the util_format_srgb code > may > not be all that much slower on its own). > Some performance figures (taken from modified gloss, replaced both base and > sphere texture to use GL_SRGB instead of GL_RGB, measured on 1Ghz Sandy > Bridge, > the numbers aren't terribly accurate): > > normal gloss, aos, 8-wide: 47 fps > normal gloss, aos, 4-wide: 48 fps > > normal gloss, forced to soa, 8-wide: 48 fps > normal gloss, forced to soa, 4-wide: 47 fps > > patched gloss, old code, soa, 8-wide: 21 fps > patched gloss, old code, soa, 4-wide: 24 fps > > patched gloss, new code, soa, 8-wide: 41 fps > patched gloss, new code, soa, 4-wide: 38 fps > > So there's a performance hit but it seems acceptable, certainly better > than using the fallback. > Note the new code only works for 4x8bit srgb formats, others (L8/L8A8) will > continue to use the old util_format fallback, because I can't be bothered > to write code for formats noone uses anyway (as decoding is done as part of > lp_build_unpack_rgba_soa which can only handle block type width of 32). > Compressed srgb formats should get their own path though eventually (it is > going to be expensive in any case, first decompress, then convert). > No piglit regressions. > > v2: use lp_build_polynomial instead of ad-hoc polynomial construction, also > since keeping both linear to srgb functions for now make sure both are > compiled (since they share quite some code just integrate into the same > function). > --- > src/gallium/auxiliary/Makefile.sources | 1 + > src/gallium/auxiliary/gallivm/lp_bld_arit.c | 2 +- > src/gallium/auxiliary/gallivm/lp_bld_arit.h | 6 + > src/gallium/auxiliary/gallivm/lp_bld_format.h | 11 + > src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 25 +- > src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c | 291 > ++++++++++++++++++++ > 6 files changed, 329 insertions(+), 7 deletions(-) > create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > > diff --git a/src/gallium/auxiliary/Makefile.sources > b/src/gallium/auxiliary/Makefile.sources > index 4751762..8cffeb0 100644 > --- a/src/gallium/auxiliary/Makefile.sources > +++ b/src/gallium/auxiliary/Makefile.sources > @@ -172,6 +172,7 @@ GALLIVM_SOURCES := \ > gallivm/lp_bld_format_aos.c \ > gallivm/lp_bld_format_aos_array.c \ > gallivm/lp_bld_format_float.c \ > + gallivm/lp_bld_format_srgb.c \ > gallivm/lp_bld_format_soa.c \ > gallivm/lp_bld_format_yuv.c \ > gallivm/lp_bld_gather.c \ > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c > b/src/gallium/auxiliary/gallivm/lp_bld_arit.c > index 7d6fe04..e7955aa 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c > +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c > @@ -2896,7 +2896,7 @@ lp_build_log(struct lp_build_context *bld, > * Generate polynomial. > * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. > */ > -static LLVMValueRef > +LLVMValueRef > lp_build_polynomial(struct lp_build_context *bld, > LLVMValueRef x, > const double *coeffs, > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h > b/src/gallium/auxiliary/gallivm/lp_bld_arit.h > index 920e339..04e180c 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h > +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h > @@ -239,6 +239,12 @@ lp_build_fast_rsqrt(struct lp_build_context *bld, > LLVMValueRef a); > > LLVMValueRef > +lp_build_polynomial(struct lp_build_context *bld, > + LLVMValueRef x, > + const double *coeffs, > + unsigned num_coeffs); > + > +LLVMValueRef > lp_build_cos(struct lp_build_context *bld, > LLVMValueRef a); > > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h > b/src/gallium/auxiliary/gallivm/lp_bld_format.h > index 12a0318..744d002 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h > +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h > @@ -158,4 +158,15 @@ lp_build_rgb9e5_to_float(struct gallivm_state *gallivm, > LLVMValueRef src, > LLVMValueRef *dst); > > +LLVMValueRef > +lp_build_linear_to_srgb(struct gallivm_state *gallivm, > + struct lp_type src_type, > + LLVMValueRef src); > + > +LLVMValueRef > +lp_build_srgb_to_linear(struct gallivm_state *gallivm, > + struct lp_type src_type, > + LLVMValueRef src); > + > + > #endif /* !LP_BLD_FORMAT_H */ > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > index 4c6bd81..114ce03 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > @@ -163,11 +163,23 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, > */ > > if (type.floating) { > - if(format_desc->channel[chan].normalized) > - input = lp_build_unsigned_norm_to_float(gallivm, width, type, > input); > - else > - input = LLVMBuildSIToFP(builder, input, > - lp_build_vec_type(gallivm, type), > ""); > + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { > + assert(width == 8); > + if (format_desc->swizzle[3] == chan) { > + input = lp_build_unsigned_norm_to_float(gallivm, width, > type, input); > + } > + else { > + struct lp_type conv_type = lp_uint_type(type); > + input = lp_build_srgb_to_linear(gallivm, conv_type, > input); > + } > + } > + else { > + if(format_desc->channel[chan].normalized) > + input = lp_build_unsigned_norm_to_float(gallivm, width, > type, input); > + else > + input = LLVMBuildSIToFP(builder, input, > + lp_build_vec_type(gallivm, type), > ""); > + } > } > else if (format_desc->channel[chan].pure_integer) { > /* Nothing to do */ > @@ -344,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, > > if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && > (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || > + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB || > format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && > format_desc->block.width == 1 && > format_desc->block.height == 1 && > @@ -394,7 +407,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, > packed = lp_build_gather(gallivm, type.length, > format_desc->block.bits, > type.width, base_ptr, offset, > - FALSE); > + FALSE); > if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { > lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); > } > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > new file mode 100644 > index 0000000..72cd05f > --- /dev/null > +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > @@ -0,0 +1,291 @@ > +/************************************************************************** > + * > + * Copyright 2013 VMware, Inc. > + * All Rights Reserved. > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the > + * "Software"), to deal in the Software without restriction, including > + * without limitation the rights to use, copy, modify, merge, publish, > + * distribute, sub license, and/or sell copies of the Software, and to > + * permit persons to whom the Software is furnished to do so, subject to > + * the following conditions: > + * > + * The above copyright notice and this permission notice (including the > + * next paragraph) shall be included in all copies or substantial portions > + * of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS > + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. > + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR > + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, > + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE > + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. > + * > + **************************************************************************/ > + > + > +/** > + * @file > + * Format conversion code for srgb formats. > + * > + * Functions for converting from srgb to linear and vice versa. > + * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt: > + * > + * srgb->linear: > + * cl = cs / 12.92, cs <= 0.04045 > + * cl = ((cs + 0.055)/1.055)^2.4, cs > 0.04045 > + * > + * linear->srgb: > + * if (isnan(cl)) { > + * Map IEEE-754 Not-a-number to zero. > + * cs = 0.0; > + * } else if (cl > 1.0) { > + * cs = 1.0; > + * } else if (cl < 0.0) { > + * cs = 0.0; > + * } else if (cl < 0.0031308) { > + * cs = 12.92 * cl; > + * } else { > + * cs = 1.055 * pow(cl, 0.41666) - 0.055; > + * } > + * > + * This does not need to be accurate, however at least for d3d10 > + * > (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx): > + * 1) For srgb->linear, it is required that the error on the srgb side is > + * not larger than 0.5f, which I interpret that if you map the value back > + * to srgb from linear using the ideal conversion, it would not be off by > + * more than 0.5f (that is, it would map to the same 8-bit integer value > + * as it was before conversion to linear). > + * 2) linear->srgb is permitted 0.6f which luckily looks like quite a large > + * error is allowed. > + * 3) Additionally, all srgb values converted to linear and back must result > + * in the same value as they were originally. > + * > + * @author Roland Scheidegger <srol...@vmware.com> > + */ > + > + > +#include "util/u_debug.h" > + > +#include "lp_bld_type.h" > +#include "lp_bld_const.h" > +#include "lp_bld_arit.h" > +#include "lp_bld_bitarit.h" > +#include "lp_bld_logic.h" > +#include "lp_bld_format.h" > + > + > + > +/** > + * Convert srgb int values to linear float values. > + * Several possibilities how to do this, e.g. > + * - table > + * - doing the pow() with int-to-float and float-to-int tricks > + * > (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) > + * - just using standard polynomial approximation > + * (3rd order polynomial is required for crappy but just sufficient > accuracy) > + * > + * @param src integer (vector) value(s) to convert > + * (8 bit values unpacked to 32 bit already). > + */ > +LLVMValueRef > +lp_build_srgb_to_linear(struct gallivm_state *gallivm, > + struct lp_type src_type, > + LLVMValueRef src) > +{ > + struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32); > + struct lp_build_context f32_bld; > + LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh; > + double coeffs[4] = {0.0023f, > + 0.0030f / 255.0f, > + 0.6935f / (255.0f * 255.0f), > + 0.3012f / (255.0f * 255.0f * 255.0f) > + }; > + > + assert(src_type.width == 32); > + > + lp_build_context_init(&f32_bld, gallivm, f32_type); > + > + /* > + * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + > 0.0023) > + * ( poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023) > + * (found with octave polyfit and some magic as I couldn't get the error > + * function right). Using the above mentioned error function, the values > stay > + * within +-0.35, except for the lowest values - hence tweaking linear > segment > + * to cover the first 16 instead of the first 11 values (the error stays > + * just about acceptable there too). > + * Hence: lin = src > 15 ? poly : src / 12.6 > + * This function really only makes sense for vectors, should use LUT > otherwise. > + * All in all (including float conversion) 11 instructions (with sse4.1), > + * 6 constants (polynomial could be done with 1 instruction less at the > cost > + * of slightly worse dependency chain, fma should also help). > + */ > + /* doing the 1/255 mul as part of the approximation */ > + srcf = lp_build_int_to_float(&f32_bld, src); > + lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * > 255.0f)); > + part_lin = lp_build_mul(&f32_bld, srcf, lin_const); > + > + part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4); > + > + lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f); > + is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, > lin_thresh); > + return lp_build_select(&f32_bld, is_linear, part_lin, part_pow); > +} > + > + > +/** > + * Convert linear float values to srgb int values. > + * Several possibilities how to do this, e.g. > + * - use table (based on exponent/highest order mantissa bits) and do > + * linear interpolation (https://gist.github.com/rygorous/2203834) > + * - Chebyshev polynomial > + * - Approximation using reciprocals > + * - using int-to-float and float-to-int tricks for pow() > + * > (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) > + * > + * @param src float (vector) value(s) to convert. > + */ > +LLVMValueRef > +lp_build_linear_to_srgb(struct gallivm_state *gallivm, > + struct lp_type src_type, > + LLVMValueRef src) > +{ > + LLVMBuilderRef builder = gallivm->builder; > + struct lp_build_context f32_bld; > + LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final; > + > + lp_build_context_init(&f32_bld, gallivm, src_type); > + > + src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one); > + > + if (0) { > + /* > + * using int-to-float and float-to-int trick for pow(). > + * This is much more accurate than necessary thanks to the correction, > + * but it most certainly makes no sense without rsqrt available. > + * Bonus points if you understand how this works... > + * All in all (including min/max clamp, conversion) 19 instructions. > + */ > + > + float exp_f = 2.0f/3.0f; > + float coeff_f = 0.62996f; > + LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2; > + struct lp_type int_type = lp_int_type(src_type); > + > + /* > + * First calculate approx x^8/12 > + */ > + exponent = lp_build_const_vec(gallivm, src_type, exp_f); > + coeff = lp_build_const_vec(gallivm, src_type, > + exp2f(127 / exp_f - 127) * powf(coeff_f, > 1.0f /exp_f)); > + > + /* premultiply src */ > + tmp = lp_build_mul(&f32_bld, coeff, src); > + /* "log2" */ > + tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, > int_type), ""); > + tmp = lp_build_int_to_float(&f32_bld, tmp); > + /* multiply for pow */ > + tmp = lp_build_mul(&f32_bld, tmp, exponent); > + /* "exp2" */ > + pow_approx = lp_build_itrunc(&f32_bld, tmp); > + pow_approx = LLVMBuildBitCast(builder, tmp, > + lp_build_vec_type(gallivm, src_type), > ""); > + > + /* > + * Since that pow was inaccurate (like 3 bits, though each sqrt step > would > + * give another bit), compensate the error (which is why we chose > another > + * exponent in the first place). > + */ > + /* x * x^(8/12) = x^(20/12) */ > + pow_1 = lp_build_mul(&f32_bld, pow_approx, src); > + > + /* x * x * x^(-4/12) = x^(20/12) */ > + /* Should avoid using rsqrt if it's not available, but > + * using x * x^(4/12) * x^(4/12) instead will change error weight */ > + tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx); > + x2 = lp_build_mul(&f32_bld, src, src); > + pow_2 = lp_build_mul(&f32_bld, x2, tmp); > + > + /* average the values so the errors cancel out, compensate bias, > + * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 > mul > + * for conversion to int in here */ > + tmp = lp_build_add(&f32_bld, pow_1, pow_2); > + coeff = lp_build_const_vec(gallivm, src_type, > + 1.0f/(3.0f*coeff_f) * 0.999852f * > powf(1.055f * 255.0f, 4.0f)); > + pow_final = lp_build_mul(&f32_bld, tmp, coeff); > + > + /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */ > + if (lp_build_fast_rsqrt_available(src_type)) { > + pow_final = lp_build_fast_rsqrt(&f32_bld, > lp_build_fast_rsqrt(&f32_bld, pow_final)); > + } > + else { > + pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, > pow_final)); > + } > + pow_final = lp_build_add(&f32_bld, pow_final, > + lp_build_const_vec(gallivm, src_type, > -0.055f)); > + } > + > + else { > + /* > + * using "rational polynomial" approximation here. > + * Essentially y = a*x^0.375 + b*x^0.5 + c, with also > + * factoring in the 255.0 mul and the scaling mul. > + * (a is closer to actual value so has higher weight than b.) > + * Note: the constants are magic values. They were found empirically, > + * possibly could be improved but good enough (be VERY careful with > + * error metric if you'd want to tweak them, they also MUST fit with > + * the crappy polynomial above for srgb->linear since it is required > + * that each srgb value maps back to the same value). > + * This function has an error of max +-0.17 (and we'd only require > +-0.6), > + * for the approximated srgb->linear values the error is naturally > larger > + * (+-0.42) but still accurate enough (required +-0.5 essentially). > + * All in all (including min/max clamp, conversion) 15 instructions. > + * FMA would help (minus 2 instructions). > + */ > + > + LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2; > + > + if (lp_build_fast_rsqrt_available(src_type)) { > + tmp = lp_build_fast_rsqrt(&f32_bld, src); > + x05 = lp_build_mul(&f32_bld, src, tmp); > + } > + else { > + /* > + * I don't really expect this to be practical without rsqrt > + * but there's no reason for triple punishment so at least > + * save the otherwise resulting division and unnecessary mul... > + */ > + x05 = lp_build_sqrt(&f32_bld, src); > + } > + > + tmp = lp_build_mul(&f32_bld, x05, src); > + if (lp_build_fast_rsqrt_available(src_type)) { > + x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, > tmp)); > + } > + else { > + x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp)); > + } > + > + a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * > 255.0f); > + b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * > 255.0f); > + c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f); > + > + tmp = lp_build_mul(&f32_bld, a_const, x0375); > + tmp2 = lp_build_mul(&f32_bld, b_const, x05); > + tmp2 = lp_build_add(&f32_bld, tmp2, c_const); > + pow_final = lp_build_add(&f32_bld, tmp, tmp2); > + } > + > + /* linear part is easy */ > + lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f); > + lin = lp_build_mul(&f32_bld, src, lin_const); > + > + lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f); > + is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, > lin_thresh); > + tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final); > + > + f32_bld.type.sign = 0; > + return lp_build_iround(&f32_bld, tmp); > +} > -- > 1.7.9.5 > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev