OK, so before someone else notices this: please ignore the rgb9e5 part of the patch below. The format isn't quite what I thought it was...
Roland Am 21.03.2013 23:28, schrieb srol...@vmware.com: > From: Roland Scheidegger <srol...@vmware.com> > > New conversion code to handle conversion from/to r11g11b10 AoS to/from > SoA floats, and also add code for conversion from rgb9e5 AoS to float SoA > (which works pretty much the same as r11g11b10 except for the packing). > (This code should also be used for texture sampling instead of > relying on u_format conversion but it's not yet, so rgb9e5 is unused.) > Unfortunately a crazy amount of hacks is necessary to get the conversion > code running in llvmpipe's generate_unswizzled_blend, which isn't well > suited for formats where the storage representation has nothing to do > with what's needed for blending (moreover, the conversion will convert > from packed AoS values, which is the storage format, to float SoA values, > because this is much more natural for the conversion, and likewise from > SoA values to packed AoS values - but the "blend" (which includes > trivial things like partial mask) works on AoS values, so incoming fs > values will go SoA->AoS, values from destination will go packed > AoS->SoA->AoS, then do blend, then AoS->SoA->packed AoS which probably > isn't the most efficient way though the shuffles are probably bearable). > > Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter), > still need to verify Inf/NaNs (where most of the complexity in the > conversion comes from actually). 
> --- > src/gallium/auxiliary/gallivm/lp_bld_conv.c | 314 > +++++++++++++++++++++++++++ > src/gallium/auxiliary/gallivm/lp_bld_conv.h | 14 ++ > src/gallium/drivers/llvmpipe/lp_screen.c | 6 +- > src/gallium/drivers/llvmpipe/lp_state_fs.c | 126 +++++++++++ > 4 files changed, 458 insertions(+), 2 deletions(-) > > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c > b/src/gallium/auxiliary/gallivm/lp_bld_conv.c > index dc3649d..4fce1bc 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c > +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c > @@ -155,6 +155,320 @@ lp_build_bswap_vec(struct gallivm_state *gallivm, > > > /** > + * Convert float32 to a float-like value with less exponent and mantissa > + * bits. The mantissa is still biased, and the mantissa still has an implied > 1, > + * but there's no sign bit. > + * > + * @param src (vector) float value to convert > + * @param mantissa_bits the number of mantissa bits > + * @param exponent_bits the number of exponent bits > + * > + * Unlike float_to_half using accurate method here. > + * This implements round-towards-zero (trunc) hence too large numbers get > + * converted to largest representable number, not infinity. > + * Small numbers may get converted to denorms, depending on normal > + * float denorm handling of the cpu. > + * Note that compared to the references, below, we skip any rounding bias > + * and do strict rounding towards zero (if I got the constants right...) > + * - OpenGL allows rounding towards zero (though not preferred) and > + * DX10 even seems to require it. > + * Note that this will not try to pack the values somehow - they will > + * look like "rescaled floats" (except for Inf/NaN) (but returned as > + * (vector) int32). 
> + * > + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ > + * ref https://gist.github.com/rygorous/2156668 > + */ > +static LLVMValueRef > +lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm, > + LLVMValueRef src, > + unsigned mantissa_bits, > + unsigned exponent_bits) > +{ > + LLVMBuilderRef builder = gallivm->builder; > + LLVMTypeRef src_type = LLVMTypeOf(src); > + LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal; > + LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs; > + LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum; > + unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ? > + LLVMGetVectorSize(src_type) : 1; > + struct lp_type f32_type = lp_type_float_vec(32, 32 * length); > + struct lp_type i32_type = lp_type_int_vec(32, 32 * length); > + struct lp_build_context f32_bld, i32_bld; > + LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f); > + > + lp_build_context_init(&f32_bld, gallivm, f32_type); > + lp_build_context_init(&i32_bld, gallivm, i32_type); > + > + i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type, > + ((1 << exponent_bits) - 1) << > 23); > + i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23); > + > + /* "ordinary" number */ > + /* clamp to pos range (can still have sign bit if NaN but doesn't matter) > */ > + clamped = lp_build_max(&f32_bld, src, zero); > + clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, ""); > + /* get rid of excess mantissa bits */ > + /* really not sure about that constant */ > + i32_roundmask = lp_build_const_int_vec(gallivm, i32_type, > + ~((1 << (23 - mantissa_bits)) - > 1)); > + > + tmp = lp_build_and(&i32_bld, clamped, i32_roundmask); > + tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, ""); > + /* bias exponent (and denormalize if necessary) */ > + magic = lp_build_const_int_vec(gallivm, i32_type, > + ((1 << (exponent_bits - 1)) - 1) << 23); > + magic = 
LLVMBuildBitCast(builder, magic, f32_bld.vec_type, ""); > + normal = lp_build_mul(&f32_bld, tmp, magic); > + > + /* clamp to max value */ > + small_max = lp_build_const_int_vec(gallivm, i32_type, > + (((1 << exponent_bits) - 2) << 23) | > + (((1 << mantissa_bits) - 1) << (23 - > mantissa_bits))); > + small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, ""); > + normal = lp_build_min(&f32_bld, normal, small_max); > + normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, ""); > + > + /* > + * handle nan/inf cases > + * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-Nan -> +Nan > + * Note that on a lucky day, we could simplify this a bit, > + * by just using the max(src, zero) result - this will have -Inf > + * clamped to 0, and MIGHT preserve the NaNs. > + */ > + src_abs = lp_build_abs(&f32_bld, src); > + src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, ""); > + src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, ""); > + isnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, > + src_abs, i32_floatexpmask); > + isposinf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL, > + src, i32_floatexpmask); > + isnanorposinf = lp_build_and(&i32_bld, isnan, isposinf); > + /* could also set more mantissa bits but need at least the highest > mantissa bit */ > + i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22); > + /* combine maxexp with qnanbit */ > + nanorposinfnum = lp_build_or(&i32_bld, i32_smallexpmask, > + lp_build_and(&i32_bld, isnan, i32_qnanbit)); > + > + return lp_build_select(&i32_bld, isnanorposinf, nanorposinfnum, normal); > +} > + > + > +/** > + * Convert a float-like value with less exponent and mantissa > + * bits than a normal float32 to a float32. The mantissa of > + * the source value is assumed to have an implied 1, and the exponent > + * is biased. There are no negative values. 
> + * The source value already is in "rescaled float" format, with the > + * exponent starting at bit 23 (and the relevant mantissa bits immediately > + * below that). > + * > + * @param src (vector) value to convert > + * @param mantissa_bits the number of mantissa bits > + * @param exponent_bits the number of exponent bits > + * > + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ > + * ref https://gist.github.com/rygorous/2156668 > + */ > +static LLVMValueRef > +lp_build_smallfloat_nosign_to_float(struct gallivm_state *gallivm, > + LLVMValueRef src, > + unsigned mantissa_bits, > + unsigned exponent_bits) > +{ > + LLVMBuilderRef builder = gallivm->builder; > + LLVMTypeRef src_type = LLVMTypeOf(src); > + LLVMValueRef smallexpmask, i32_floatexpmask, magic; > + LLVMValueRef wasinfnan, tmp, res; > + unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ? > + LLVMGetVectorSize(src_type) : 1; > + struct lp_type f32_type = lp_type_float_vec(32, 32 * length); > + struct lp_type i32_type = lp_type_int_vec(32, 32 * length); > + struct lp_build_context f32_bld, i32_bld; > + > + lp_build_context_init(&f32_bld, gallivm, f32_type); > + lp_build_context_init(&i32_bld, gallivm, i32_type); > + > + smallexpmask = lp_build_const_int_vec(gallivm, i32_type, > + ((1 << exponent_bits) - 1) << 23); > + i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23); > + /* > + * magic number has exponent new exp bias + (new exp bias - old exp bias), > + * mantissa is 0. 
> + */ > + magic = lp_build_const_int_vec(gallivm, i32_type, > + (255 - (1 << (exponent_bits - 1))) << 23); > + magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, ""); > + > + /* adjust exponent and fix denorms */ > + res = lp_build_mul(&f32_bld, src, magic); > + > + /* > + * if exp was max (== NaN or Inf) set new exp to max (keep mantissa), > + * so a simple "or" will do (because exp adjust will leave mantissa > intact) > + */ > + /* use float compare (better for AVX 8-wide / no AVX2 though otherwise > should use int) */ > + smallexpmask = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, ""); > + wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, src, > smallexpmask); > + res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, ""); > + tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan); > + res = lp_build_or(&i32_bld, tmp, res); > + > + return LLVMBuildBitCast(builder, res, f32_bld.vec_type, ""); > +} > + > + > +/** > + * Convert rgba float SoA values to packed r11g11b10 values. > + * > + * @param src SoA float (vector) values to convert. > + */ > +LLVMValueRef > +lp_build_float_to_r11g11b10(struct gallivm_state *gallivm, > + LLVMValueRef *src) > +{ > + LLVMValueRef dst, rcomp, bcomp, gcomp, shift, mask; > + struct lp_build_context i32_bld; > + LLVMTypeRef src_type = LLVMTypeOf(*src); > + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ? 
> + LLVMGetVectorSize(src_type) : 1; > + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length); > + > + lp_build_context_init(&i32_bld, gallivm, i32_type); > + > + /* "rescale" - this does the actual conversion except the packing */ > + rcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[0], 6, 5); > + gcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[1], 6, 5); > + bcomp = lp_build_float_to_smallfloat_nosign(gallivm, src[2], 5, 5); > + > + /* pack rescaled SoA floats to r11g11b10 AoS values */ > + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6); > + rcomp = lp_build_shr(&i32_bld, rcomp, shift); > + > + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17); > + mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << 11); > + gcomp = lp_build_shr(&i32_bld, gcomp, shift); > + gcomp = lp_build_and(&i32_bld, gcomp, mask); > + > + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23); > + mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << 22); > + bcomp = lp_build_shl(&i32_bld, bcomp, shift); > + bcomp = lp_build_and(&i32_bld, bcomp, mask); > + > + dst = lp_build_or(&i32_bld, rcomp, gcomp); > + return lp_build_or(&i32_bld, dst, bcomp); > +} > + > + > +/** > + * Convert packed float format (r11g11b10) value(s) to rgba float SoA values. > + * > + * @param src packed AoS r11g11b10 values (as (vector) int32) > + * @param dst pointer to the SoA result values > + */ > +void > +lp_build_r11g11b10_to_float(struct gallivm_state *gallivm, > + LLVMValueRef src, > + LLVMValueRef *dst) > +{ > + LLVMBuilderRef builder = gallivm->builder; > + LLVMTypeRef src_type = LLVMTypeOf(src); > + LLVMValueRef rcomp, bcomp, gcomp, shift, mask; > + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ? 
> + LLVMGetVectorSize(src_type) : 1; > + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length); > + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length); > + struct lp_build_context i32_bld, f32_bld; > + > + lp_build_context_init(&i32_bld, gallivm, i32_type); > + lp_build_context_init(&f32_bld, gallivm, f32_type); > + > + /* put mantissa/exp into "rescaled float" format */ > + mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << (23 - 6)); > + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6); > + rcomp = lp_build_shl(&i32_bld, src, shift); > + rcomp = lp_build_and(&i32_bld, rcomp, mask); > + rcomp = LLVMBuildBitCast(builder, rcomp, f32_bld.vec_type, ""); > + dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, rcomp, 6, 5); > + > + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17); > + gcomp = lp_build_shl(&i32_bld, src, shift); > + gcomp = lp_build_and(&i32_bld, gcomp, mask); > + gcomp = LLVMBuildBitCast(builder, gcomp, f32_bld.vec_type, ""); > + dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, gcomp, 6, 5); > + > + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23); > + mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << (23 - 5)); > + /* really logical shift but gets masked out anyway */ > + bcomp = lp_build_shr(&i32_bld, src, shift); > + bcomp = lp_build_and(&i32_bld, bcomp, mask); > + bcomp = LLVMBuildBitCast(builder, bcomp, f32_bld.vec_type, ""); > + dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, bcomp, 5, 5); > + > + /* Just set alpha to one */ > + dst[3] = f32_bld.one; > +} > + > + > +/** > + * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values. 
> + * > + * @param src packed AoS rgb9e5 values (as (vector) int32) > + * @param dst pointer to the SoA result values > + */ > +void > +lp_build_rgb9e5_to_float(struct gallivm_state *gallivm, > + LLVMValueRef src, > + LLVMValueRef *dst) > +{ > + LLVMBuilderRef builder = gallivm->builder; > + LLVMTypeRef src_type = LLVMTypeOf(src); > + LLVMValueRef rcomp, bcomp, gcomp, exp, shift, mask; > + unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ? > + LLVMGetVectorSize(src_type) : 1; > + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length); > + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length); > + struct lp_build_context i32_bld, f32_bld; > + > + lp_build_context_init(&i32_bld, gallivm, i32_type); > + lp_build_context_init(&f32_bld, gallivm, f32_type); > + > + /* extract exponent */ > + mask = lp_build_const_int_vec(gallivm, i32_type, 0x1f << 23); > + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23); > + exp = lp_build_shl(&i32_bld, src, shift); > + exp = lp_build_and(&i32_bld, exp, mask); > + > + /* put mantissa/exp into "rescaled float" format */ > + mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff << (23 - 9)); > + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 9); > + rcomp = lp_build_shl(&i32_bld, src, shift); > + rcomp = lp_build_and(&i32_bld, rcomp, mask); > + rcomp = lp_build_or(&i32_bld, rcomp, exp); > + rcomp = LLVMBuildBitCast(builder, rcomp, f32_bld.vec_type, ""); > + dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, rcomp, 9, 5); > + > + shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 18); > + gcomp = lp_build_shl(&i32_bld, src, shift); > + gcomp = lp_build_and(&i32_bld, gcomp, mask); > + gcomp = lp_build_or(&i32_bld, gcomp, exp); > + gcomp = LLVMBuildBitCast(builder, gcomp, f32_bld.vec_type, ""); > + dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, gcomp, 9, 5); > + > + shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23); > + /* really logical shift but gets masked 
out anyway */ > + bcomp = lp_build_shr(&i32_bld, src, shift); > + bcomp = lp_build_and(&i32_bld, bcomp, mask); > + bcomp = lp_build_or(&i32_bld, bcomp, exp); > + bcomp = LLVMBuildBitCast(builder, bcomp, f32_bld.vec_type, ""); > + dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, bcomp, 9, 5); > + > + /* Just set alpha to one */ > + dst[3] = f32_bld.one; > +} > + > + > +/** > * Converts int16 half-float to float32 > * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i > think?) > * [llvm.x86.vcvtph2ps / _mm_cvtph_ps] > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h > b/src/gallium/auxiliary/gallivm/lp_bld_conv.h > index d7dfed8..d8bc294 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h > +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h > @@ -62,6 +62,20 @@ lp_build_float_to_half(struct gallivm_state *gallivm, > LLVMValueRef src); > > LLVMValueRef > +lp_build_float_to_r11g11b10(struct gallivm_state *gallivm, > + LLVMValueRef *src); > + > +void > +lp_build_r11g11b10_to_float(struct gallivm_state *gallivm, > + LLVMValueRef src, > + LLVMValueRef *dst); > + > +void > +lp_build_rgb9e5_to_float(struct gallivm_state *gallivm, > + LLVMValueRef src, > + LLVMValueRef *dst); > + > +LLVMValueRef > lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm, > struct lp_type src_type, > unsigned dst_width, > diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c > b/src/gallium/drivers/llvmpipe/lp_screen.c > index 93e125d..ece7679 100644 > --- a/src/gallium/drivers/llvmpipe/lp_screen.c > +++ b/src/gallium/drivers/llvmpipe/lp_screen.c > @@ -321,7 +321,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen, > if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB) > return FALSE; > > - if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) > + if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN && > + format != PIPE_FORMAT_R11G11B10_FLOAT) > return FALSE; > assert(format_desc->block.width == 1); > 
assert(format_desc->block.height == 1); > @@ -329,7 +330,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen, > if (format_desc->is_mixed) > return FALSE; > > - if (!format_desc->is_array && !format_desc->is_bitmask) > + if (!format_desc->is_array && !format_desc->is_bitmask && > + format != PIPE_FORMAT_R11G11B10_FLOAT) > return FALSE; > > /* > diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c > b/src/gallium/drivers/llvmpipe/lp_state_fs.c > index d8369b4..953a5c1 100644 > --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c > +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c > @@ -972,6 +972,17 @@ lp_mem_type_from_format_desc(const struct > util_format_description *format_desc, > unsigned i; > unsigned chan; > > + if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { > + /* just make this a 32bit uint */ > + type->floating = false; > + type->fixed = false; > + type->sign = false; > + type->norm = false; > + type->width = 32; > + type->length = 1; > + return; > + } > + > for (i = 0; i < 4; i++) > if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) > break; > @@ -1009,6 +1020,17 @@ lp_blend_type_from_format_desc(const struct > util_format_description *format_desc > unsigned i; > unsigned chan; > > + if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { > + /* always use ordinary floats for blending */ > + type->floating = true; > + type->fixed = false; > + type->sign = true; > + type->norm = false; > + type->width = 32; > + type->length = 4; > + return; > + } > + > for (i = 0; i < 4; i++) > if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) > break; > @@ -1122,6 +1144,48 @@ convert_to_blend_type(struct gallivm_state *gallivm, > unsigned pixels = 16 / num_srcs; > bool is_arith; > > + /* > + * full custom path for packed floats - none of the later functions would > do > + * anything useful, and given the lp_type representation they can't be > fixed. 
> + */ > + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) { > + LLVMValueRef tmpsrc[4]; > + /* > + * This is pretty suboptimal for this case blending in SoA would be > much > + * better, since conversion gets us SoA values so need to convert back. > + */ > + assert(src_type.width == 32); > + assert(dst_type.floating); > + assert(dst_type.width = 32); > + assert(dst_type.length % 4 == 0); > + for (i = 0; i < 4; i++) { > + tmpsrc[i] = src[i]; > + } > + for (i = 0; i < num_srcs / 4; i++) { > + LLVMValueRef tmpsoa[4]; > + LLVMValueRef tmps = tmpsrc[i]; > + if (num_srcs == 8) { > + LLVMValueRef shuffles[8]; > + unsigned j; > + /* fetch was 4 values but need 8-wide output values */ > + tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2); > + /* > + * for 8-wide aos transpose would give us wrong order not > matching > + * incoming converted fs values and mask. ARGH. > + */ > + for (j = 0; j < 4; j++) { > + shuffles[j] = lp_build_const_int32(gallivm, j * 2); > + shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1); > + } > + tmps = LLVMBuildShuffleVector(builder, tmps, tmps, > + LLVMConstVector(shuffles, 8), ""); > + } > + lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa); > + lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]); > + } > + return; > + } > + > lp_mem_type_from_format_desc(src_fmt, &mem_type); > lp_blend_type_from_format_desc(src_fmt, &blend_type); > > @@ -1225,6 +1289,47 @@ convert_from_blend_type(struct gallivm_state *gallivm, > unsigned pixels = 16 / num_srcs; > bool is_arith; > > + /* > + * full custom path for packed floats - none of the later functions would > do > + * anything useful, and given the lp_type representation they can't be > fixed. > + */ > + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) { > + /* > + * This is pretty suboptimal for this case blending in SoA would be > much > + * better - we need to transpose the AoS values back to SoA values for > + * conversion/packing. 
> + */ > + assert(src_type.floating); > + assert(src_type.width = 32); > + assert(src_type.length % 4 == 0); > + assert(dst_type.width == 32); > + for (i = 0; i < num_srcs / 4; i++) { > + LLVMValueRef tmpsoa[4], tmpdst; > + lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa); > + tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa); > + if (num_srcs == 8) { > + LLVMValueRef tmpaos, shuffles[8]; > + unsigned j; > + /* > + * for 8-wide aos transpose has given us wrong order not matching > + * output order. HMPF. Also need to split the output values > manually. > + */ > + for (j = 0; j < 4; j++) { > + shuffles[j * 2] = lp_build_const_int32(gallivm, j); > + shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4); > + } > + tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst, > + LLVMConstVector(shuffles, 8), > ""); > + src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4); > + src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4); > + } > + else { > + src[i] = tmpdst; > + } > + } > + return; > + } > + > lp_mem_type_from_format_desc(src_fmt, &mem_type); > lp_blend_type_from_format_desc(src_fmt, &blend_type); > > @@ -1532,6 +1637,17 @@ generate_unswizzled_blend(struct gallivm_state > *gallivm, > } > } > > + if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) { > + /* the code above can't work for layout_other */ > + dst_channels = 4; /* HACK: this is fake 4 really but need it due to > transpose stuff later */ > + has_alpha = true; > + swizzle[0] = 0; > + swizzle[1] = 1; > + swizzle[2] = 2; > + swizzle[3] = 3; > + pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion > later */ > + } > + > /* If 3 channels then pad to include alpha for 4 element transpose */ > if (dst_channels == 3 && !has_alpha) { > for (i = 0; i < TGSI_NUM_CHANNELS; i++) { > @@ -1756,6 +1872,16 @@ generate_unswizzled_blend(struct gallivm_state > *gallivm, > > dst_type.length *= 16 / dst_count; > > + if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) { > + /* > + * we 
need multiple values at once for the conversion, so can as well > + * load them vectorized here too instead of concatenating later. > + * (Still need concatenation later for 8-wide vectors). > + */ > + dst_count = block_height; > + dst_type.length = block_width; > + } > + > load_unswizzled_block(gallivm, color_ptr, stride, block_width, > block_height, > dst, dst_type, dst_count, dst_alignment); > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev