This patch is Reviewed-by: Ian Romanick <ian.d.roman...@intel.com>
On 01/26/2017 02:50 PM, Francisco Jerez wrote: > This addresses several issues of the current atan2 implementation: > > - Negative zero (and negative denorms which end up getting flushed to > zero) isn't handled correctly by the current implementation. The > reason is that it does 'y >= 0' and 'x < 0' comparisons to decide > on which side of the branch cut the argument is, which causes us to > return incorrect results (off by up to 2π) for very small negative > values. > > - There is a serious precision problem for x values of large enough > magnitude introduced by the floating point division operation being > implemented as a mul+rcp sequence. This can lead to the quotient > getting flushed to zero in some cases introducing an error of over > 8e6 ULP in the result -- Or in the most catastrophic case will > cause us to return NaN instead of the correct value ±π/2 for y=±∞ > and x very large. We can fix this easily by scaling down both > arguments when the absolute value of the denominator goes above > certain threshold. The error of this atan2 implementation remains > below 25 ULP in most of its domain except for a neighborhood of y=0 > where it reaches a maximum error of about 180 ULP. > > - It emits a bunch of instructions including no less than three > if-else branches per scalar component that don't seem to get > optimized out later on. This implementation uses about 13% less > instructions on Intel SKL hardware and doesn't emit any control > flow instructions. > > v2: Fix up argument scaling to take into account the range and > precision of exotic FP24 hardware. Flip coordinate system for > arguments along the vertical line as if they were on the left > half-plane in order to avoid division by zero which may give > unspecified results on non-GLSL 4.1-capable hardware. Sprinkle in > some more comments. > --- > src/compiler/glsl/builtin_functions.cpp | 96 > ++++++++++++++++++++------------- > 1 file changed, 60 insertions(+), 36 deletions(-) > > diff --git a/src/compiler/glsl/builtin_functions.cpp > b/src/compiler/glsl/builtin_functions.cpp > index 4a6c5af..432df65 100644 > --- a/src/compiler/glsl/builtin_functions.cpp > +++ b/src/compiler/glsl/builtin_functions.cpp > @@ -3560,44 +3560,68 @@ builtin_builder::_acos(const glsl_type *type) > ir_function_signature * > builtin_builder::_atan2(const glsl_type *type) > { > - ir_variable *vec_y = in_var(type, "vec_y"); > - ir_variable *vec_x = in_var(type, "vec_x"); > - MAKE_SIG(type, always_available, 2, vec_y, vec_x); > - > - ir_variable *vec_result = body.make_temp(type, "vec_result"); > - ir_variable *r = body.make_temp(glsl_type::float_type, "r"); > - for (int i = 0; i < type->vector_elements; i++) { > - ir_variable *y = body.make_temp(glsl_type::float_type, "y"); > - ir_variable *x = body.make_temp(glsl_type::float_type, "x"); > - body.emit(assign(y, swizzle(vec_y, i, 1))); > - body.emit(assign(x, swizzle(vec_x, i, 1))); > - > - /* If |x| >= 1.0e-8 * |y|: */ > - ir_if *outer_if = > - new(mem_ctx) ir_if(greater(abs(x), mul(imm(1.0e-8f), abs(y)))); > - > - ir_factory outer_then(&outer_if->then_instructions, mem_ctx); > - > - /* Then...call atan(y/x) */ > - do_atan(outer_then, glsl_type::float_type, r, div(y, x)); > - > - /* ...and fix it up: */ > - ir_if *inner_if = new(mem_ctx) ir_if(less(x, imm(0.0f))); > - inner_if->then_instructions.push_tail( > - if_tree(gequal(y, imm(0.0f)), > - assign(r, add(r, imm(M_PIf))), > - assign(r, sub(r, imm(M_PIf))))); > - outer_then.emit(inner_if); > - > - /* Else... */ > - outer_if->else_instructions.push_tail( > - assign(r, mul(sign(y), imm(M_PI_2f)))); > + const unsigned n = type->vector_elements; > + ir_variable *y = in_var(type, "y"); > + ir_variable *x = in_var(type, "x"); > + MAKE_SIG(type, always_available, 2, y, x); > > - body.emit(outer_if); > + /* If we're on the left half-plane rotate the coordinates π/2 clock-wise > + * for the y=0 discontinuity to end up aligned with the vertical > + * discontinuity of atan(s/t) along t=0. This also makes sure that we > + * don't attempt to divide by zero along the vertical line, which may give > + * unspecified results on non-GLSL 4.1-capable hardware. > + */ > + ir_variable *flip = body.make_temp(glsl_type::bvec(n), "flip"); > + body.emit(assign(flip, gequal(imm(0.0f, n), x))); > + ir_variable *s = body.make_temp(type, "s"); > + body.emit(assign(s, csel(flip, abs(x), y))); > + ir_variable *t = body.make_temp(type, "t"); > + body.emit(assign(t, csel(flip, y, abs(x)))); > > - body.emit(assign(vec_result, r, 1 << i)); > - } > - body.emit(ret(vec_result)); > + /* If the magnitude of the denominator exceeds some huge value, scale down > + * the arguments in order to prevent the reciprocal operation from > flushing > + * its result to zero, which would cause precision problems, and for s > + * infinite would cause us to return a NaN instead of the correct finite > + * value. > + * > + * If fmin and fmax are respectively the smallest and largest positive > + * normalized floating point values representable by the implementation, > + * the constants below should be in agreement with: > + * > + * huge <= 1 / fmin > + * scale <= 1 / fmin / fmax (for |t| >= huge) > + * > + * In addition scale should be a negative power of two in order to avoid > + * loss of precision. The values chosen below should work for most usual > + * floating point representations with at least the dynamic range of ATI's > + * 24-bit representation. > + */ > + ir_constant *huge = imm(1e18f, n); > + ir_variable *scale = body.make_temp(type, "scale"); > + body.emit(assign(scale, csel(gequal(abs(t), huge), > + imm(0.25f, n), imm(1.0f, n)))); > + ir_variable *rcp_scaled_t = body.make_temp(type, "rcp_scaled_t"); > + body.emit(assign(rcp_scaled_t, rcp(mul(t, scale)))); > + ir_expression *s_over_t = mul(mul(s, scale), rcp_scaled_t); > + > + /* Calculate the arctangent and fix up the result if we had flipped the > + * coordinate system. > + */ > + ir_variable *arc = body.make_temp(type, "arc"); > + do_atan(body, type, arc, abs(s_over_t)); > + body.emit(assign(arc, add(arc, mul(b2f(flip), imm(M_PI_2f))))); > + > + /* Rather convoluted calculation of the sign of the result. When x < 0 we > + * cannot use fsign because we need to be able to distinguish between > + * negative and positive zero. Unfortunately we cannot use bitwise > + * arithmetic tricks either because of back-ends without integer support. > + * When x >= 0 rcp_scaled_t will always be non-negative so this won't be > + * able to distinguish between negative and positive zero, but we don't > + * care because atan2 is continuous along the whole positive y = 0 > + * half-line, so it won't affect the result significantly. > + */ > + body.emit(ret(csel(less(min2(y, rcp_scaled_t), imm(0.0f, n)), > + neg(arc), arc))); > > return sig; > } > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev