From: Roland Scheidegger <srol...@vmware.com> We had to disable fast rsqrt before because it wasn't precise enough, etc. However, in situations where we know we are not going to need more precision, we can still use a fast rsqrt (which can be several times faster than the quite expensive sqrt). Hence introduce a new helper which does exactly that. Calling it is probably not useful in some situations if there is no fast rsqrt available, so also add a way to query whether one is available. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 55 +++++++++++++++++++++++++++ src/gallium/auxiliary/gallivm/lp_bld_arit.h | 7 ++++ 2 files changed, 62 insertions(+)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 08aec79..2e84543 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -2361,6 +2361,61 @@ lp_build_rsqrt(struct lp_build_context *bld, return lp_build_rcp(bld, lp_build_sqrt(bld, a)); } +/** + * Report whether a fast (inaccurate) rsqrt instruction is available + * for the given (floating-point) type: true only for 4 x 32-bit floats + * with SSE or 8 x 32-bit floats with AVX, false otherwise. + * (A caller may want to avoid calling rsqrt_fast if it's not available, + * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if + * unavailable that would result in sqrt/div/mul, so it is obviously + * much better to just call sqrt, skipping both div and mul.) + */ +boolean +lp_build_fast_rsqrt_available(struct lp_type type) { + + assert(type.floating); + + if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { + return true; + } + return false; +} + + +/** + * Generate 1/sqrt(a). + * Result is undefined for values < 0, infinity for +0. + * Precision is limited, only ~10 bits guaranteed + * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0). + * Uses the x86 SSE/AVX rsqrt intrinsic when bld->type is 4 or 8 x 32-bit + * floats and the CPU reports the matching capability; otherwise prints + * a debug message and falls back to lp_build_rcp(lp_build_sqrt(a)).
+ */ +LLVMValueRef +lp_build_rsqrt_fast(struct lp_build_context *bld, + LLVMValueRef a) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + const struct lp_type type = bld->type; + + assert(lp_check_value(type, a)); + + assert(type.floating); + + if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { + const char *intrinsic = NULL; + + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rsqrt.ps"; + } + else { + intrinsic = "llvm.x86.avx.rsqrt.ps.256"; + } + return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); + } + else { + debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__); + } + return lp_build_rcp(bld, lp_build_sqrt(bld, a)); +} + /** * Generate sin(a) using SSE2 diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 966796c..d53e471 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -231,6 +231,13 @@ LLVMValueRef lp_build_rsqrt(struct lp_build_context *bld, LLVMValueRef a); +boolean +lp_build_fast_rsqrt_available(struct lp_type type); + +LLVMValueRef +lp_build_rsqrt_fast(struct lp_build_context *bld, + LLVMValueRef a); + LLVMValueRef lp_build_cos(struct lp_build_context *bld, LLVMValueRef a); -- 1.7.9.5 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev