Looks good. Thanks for the updates. Jose
----- Original Message ----- > From: Roland Scheidegger <srol...@vmware.com> > > We had to disable fast rsqrt before because it wasn't precise enough etc. > However in situations when we know we're not going to need more precision > we can still use a fast rsqrt (which can be several times faster than > the quite expensive sqrt). Hence introduce a new helper which does exactly > that - it is probably not useful calling it in some situations if there's > no fast rsqrt available so make it queryable if it's available too. > > v2: use fast_rsqrt consistently instead of rsqrt_fast, fix indentation, > let rsqrt use fast_rsqrt. > --- > src/gallium/auxiliary/gallivm/lp_bld_arit.c | 75 > +++++++++++++++++++++------ > src/gallium/auxiliary/gallivm/lp_bld_arit.h | 7 +++ > 2 files changed, 66 insertions(+), 16 deletions(-) > > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c > b/src/gallium/auxiliary/gallivm/lp_bld_arit.c > index c006ac5..fd7c22e 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c > +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c > @@ -2306,19 +2306,14 @@ lp_build_rsqrt(struct lp_build_context *bld, > /* > * This should be faster but all denormals will end up as infinity. > */ > - if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) > || > - (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) { > + if (0 && lp_build_fast_rsqrt_available(type)) { > const unsigned num_iterations = 1; > LLVMValueRef res; > unsigned i; > - const char *intrinsic = NULL; > > - if (type.length == 4) { > - intrinsic = "llvm.x86.sse.rsqrt.ps"; > - } > - else { > - intrinsic = "llvm.x86.avx.rsqrt.ps.256"; > - } > + /* rsqrt(1.0) != 1.0 here */ > + res = lp_build_fast_rsqrt(bld, a); > + > if (num_iterations) { > /* > * Newton-Raphson will result in NaN instead of infinity for zero, > @@ -2338,8 +2333,6 @@ lp_build_rsqrt(struct lp_build_context *bld, > > inf = LLVMBuildBitCast(builder, inf, > lp_build_vec_type(bld->gallivm, type), ""); > > - res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, > a); > - > for (i = 0; i < num_iterations; ++i) { > res = lp_build_rsqrt_refine(bld, a, res); > } > @@ -2350,11 +2343,6 @@ lp_build_rsqrt(struct lp_build_context *bld, > cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, > bld->one); > res = lp_build_select(bld, cmp, bld->one, res); > } > - else { > - /* rsqrt(1.0) != 1.0 here */ > - res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, > a); > - > - } > > return res; > } > @@ -2362,6 +2350,61 @@ lp_build_rsqrt(struct lp_build_context *bld, > return lp_build_rcp(bld, lp_build_sqrt(bld, a)); > } > > +/** > + * If there's a fast (inaccurate) rsqrt instruction available > + * (caller may want to avoid to call rsqrt_fast if it's not available, > + * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if > + * unavailable it would result in sqrt/div/mul so obviously > + * much better to just call sqrt, skipping both div and mul). > + */ > +boolean > +lp_build_fast_rsqrt_available(struct lp_type type) > +{ > + > + assert(type.floating); > + > + if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || > + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { > + return true; > + } > + return false; > +} > + > + > +/** > + * Generate 1/sqrt(a). > + * Result is undefined for values < 0, infinity for +0. > + * Precision is limited, only ~10 bits guaranteed > + * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0). > + */ > +LLVMValueRef > +lp_build_fast_rsqrt(struct lp_build_context *bld, > + LLVMValueRef a) > +{ > + LLVMBuilderRef builder = bld->gallivm->builder; > + const struct lp_type type = bld->type; > + > + assert(lp_check_value(type, a)); > + > + assert(type.floating); > + > + if (lp_build_fast_rsqrt_available(type)) { > + const char *intrinsic = NULL; > + > + if (type.length == 4) { > + intrinsic = "llvm.x86.sse.rsqrt.ps"; > + } > + else { > + intrinsic = "llvm.x86.avx.rsqrt.ps.256"; > + } > + return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); > + } > + else { > + debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", > __FUNCTION__); > + } > + return lp_build_rcp(bld, lp_build_sqrt(bld, a)); > +} > + > > /** > * Generate sin(a) using SSE2 > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h > b/src/gallium/auxiliary/gallivm/lp_bld_arit.h > index 966796c..920e339 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h > +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h > @@ -231,6 +231,13 @@ LLVMValueRef > lp_build_rsqrt(struct lp_build_context *bld, > LLVMValueRef a); > > +boolean > +lp_build_fast_rsqrt_available(struct lp_type type); > + > +LLVMValueRef > +lp_build_fast_rsqrt(struct lp_build_context *bld, > + LLVMValueRef a); > + > LLVMValueRef > lp_build_cos(struct lp_build_context *bld, > LLVMValueRef a); > -- > 1.7.9.5 > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev