From: Roland Scheidegger <srol...@vmware.com> We should do transpose, not extract/insert, at least with "sufficient" amount of channels (for 4 channels, extract/insert shuffles generated otherwise look truly terrifying). Albeit we shouldn't fallback to that so often in any case. --- src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 83 +++++++++++++++++++---- 1 file changed, 70 insertions(+), 13 deletions(-)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 389bfa0..902c763 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -40,6 +40,39 @@ #include "lp_bld_debug.h" #include "lp_bld_format.h" #include "lp_bld_arit.h" +#include "lp_bld_pack.h" + + +static void +convert_to_soa(struct gallivm_state *gallivm, + LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32], + LLVMValueRef dst_soa[4], + const struct lp_type soa_type) +{ + unsigned j, k; + struct lp_type aos_channel_type = soa_type; + + LLVMValueRef aos_channels[4]; + unsigned pixels_per_channel = soa_type.length / 4; + + debug_assert((soa_type.length % 4) == 0); + + aos_channel_type.length >>= 1; + + for (j = 0; j < 4; ++j) { + LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 }; + + assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); + + for (k = 0; k < pixels_per_channel; ++k) { + channel[k] = src_aos[j + 4 * k]; + } + + aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); + } + + lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa); +} void @@ -48,9 +81,6 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, const LLVMValueRef *unswizzled, LLVMValueRef swizzled_out[4]) { - assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0); - assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1); - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { enum pipe_swizzle swizzle; LLVMValueRef depth_or_stencil; @@ -547,9 +577,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, { unsigned k, chan; struct lp_type tmp_type; + LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; + boolean vec_transpose = FALSE; if (gallivm_debug & GALLIVM_DEBUG_PERF) { - debug_printf("%s: scalar unpacking of %s\n", + debug_printf("%s: AoS fetch fallback for %s\n", __FUNCTION__, format_desc->short_name); } @@ -560,12 +592,31 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, rgba_out[chan] = lp_build_undef(gallivm, type); } + if (format_desc->nr_channels > 2 || + format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { + /* + * Note that vector transpose can be worse. This is because + * llvm will ensure the missing channels have the correct + * values, in particular typically 1.0 for the last channel + * (if they are used or not doesn't matter, usually llvm can't + * figure this out here probably due to the transpose). + * But with the extract/insert path, since those missing elements + * were just directly inserted/extracted llvm can optimize this + * somewhat (though it still doesn't look great - and not for + * the compressed formats due to their external fetch funcs). + * So restrict to cases where we are sure it helps (albeit + * with 2 channels it MIGHT be worth it at least with AVX). + * In any case, this is just a bandaid, it does NOT replace proper + * SoA format unpack. + */ + vec_transpose = TRUE; + } + /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef offset_elem; LLVMValueRef i_elem, j_elem; - LLVMValueRef tmp; offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); @@ -574,20 +625,26 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, j_elem = LLVMBuildExtractElement(builder, j, index, ""); /* Get a single float[4]={R,G,B,A} pixel */ - tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, - aligned, base_ptr, offset_elem, - i_elem, j_elem, cache); + aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, + aligned, base_ptr, offset_elem, + i_elem, j_elem, cache); /* * Insert the AoS tmp value channels into the SoA result vectors at * position = 'index'. */ - for (chan = 0; chan < 4; ++chan) { - LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan), - tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, ""); - rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan], - tmp_chan, index, ""); + if (!vec_transpose) { + for (chan = 0; chan < 4; ++chan) { + LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan), + tmp_chan = LLVMBuildExtractElement(builder, aos_fetch[k], chan_val, ""); + rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan], + tmp_chan, index, ""); + } } } + if (vec_transpose) { + convert_to_soa(gallivm, aos_fetch, rgba_out, type); + } + } } -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev