In GPU drivers, translate can simultaneously perform a gather operation and convert away from unsupported formats.
In this use case, input and output formats will often be identical, so it would clearly make sense to use a plain memcpy. Instead, translate insists on converting to and from 32-bit floating-point numbers. This is not only extremely expensive, but it also loses precision for 32/64-bit integers and 64-bit floating-point numbers. This patch changes translate_generic to just use memcpy if the formats are identical, non-blocked, and have an integral number of bytes per pixel (note that all sensible vertex formats are like this). --- .../auxiliary/translate/translate_generic.c | 93 +++++++++++++------ 1 files changed, 63 insertions(+), 30 deletions(-) diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index 42cfd76..57a42b7 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -63,6 +63,7 @@ struct translate_generic { const uint8_t *input_ptr; unsigned input_stride; unsigned max_index; + int copy_size; } attrib[PIPE_MAX_ATTRIBS]; @@ -380,9 +381,10 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate, float data[4]; char *dst = vert + tg->attrib[attr].output_offset; - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { + if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { const uint8_t *src; unsigned index; + int copy_size; if (tg->attrib[attr].instance_divisor) { index = instance_id / tg->attrib[attr].instance_divisor; @@ -396,27 +398,34 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate, src = tg->attrib[attr].input_ptr + tg->attrib[attr].input_stride * index; - tg->attrib[attr].fetch( data, src, 0, 0 ); - - if (0) - debug_printf("Fetch elt attr %d from %p stride %d div %u max %u index %d: " - " %f, %f, %f, %f \n", - attr, - tg->attrib[attr].input_ptr, - tg->attrib[attr].input_stride, - tg->attrib[attr].instance_divisor, - tg->attrib[attr].max_index, - index, - data[0], 
data[1],data[2], data[3]); + copy_size = tg->attrib[attr].copy_size; + if(likely(copy_size >= 0)) + memcpy(dst, src, tg->attrib[attr].copy_size); + else + { + tg->attrib[attr].fetch( data, src, 0, 0 ); + + if (0) + debug_printf("Fetch elt attr %d from %p stride %d div %u max %u index %d: " + " %f, %f, %f, %f \n", + attr, + tg->attrib[attr].input_ptr, + tg->attrib[attr].input_stride, + tg->attrib[attr].instance_divisor, + tg->attrib[attr].max_index, + index, + data[0], data[1],data[2], data[3]); + tg->attrib[attr].emit( data, dst ); + } } else { - data[0] = (float)instance_id; + if(likely(tg->attrib[attr].copy_size >= 0)) + memcpy(data, &instance_id, 4); + else + { + data[0] = (float)instance_id; + tg->attrib[attr].emit( data, dst ); + } } - - if (0) - debug_printf("vert %d/%d attr %d: %f %f %f %f\n", - i, elt, attr, data[0], data[1], data[2], data[3]); - - tg->attrib[attr].emit( data, dst ); } vert += tg->translate.key.output_stride; } @@ -448,6 +457,7 @@ static void PIPE_CDECL generic_run( struct translate *translate, if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { const uint8_t *src; unsigned index; + int copy_size; if (tg->attrib[attr].instance_divisor) { index = instance_id / tg->attrib[attr].instance_divisor; @@ -462,25 +472,33 @@ static void PIPE_CDECL generic_run( struct translate *translate, src = tg->attrib[attr].input_ptr + tg->attrib[attr].input_stride * index; - tg->attrib[attr].fetch( data, src, 0, 0 ); + copy_size = tg->attrib[attr].copy_size; + if(likely(copy_size >= 0)) + memcpy(dst, src, tg->attrib[attr].copy_size); + else + { + tg->attrib[attr].fetch( data, src, 0, 0 ); - if (0) - debug_printf("Fetch linear attr %d from %p stride %d index %d: " + if (0) + debug_printf("Fetch linear attr %d from %p stride %d index %d: " " %f, %f, %f, %f \n", attr, tg->attrib[attr].input_ptr, tg->attrib[attr].input_stride, index, data[0], data[1],data[2], data[3]); + + tg->attrib[attr].emit( data, dst ); + } } else { - data[0] = (float)instance_id; + 
if(likely(tg->attrib[attr].copy_size >= 0)) + memcpy(data, &instance_id, 4); + else + { + data[0] = (float)instance_id; + tg->attrib[attr].emit( data, dst ); + } } - - if (0) - debug_printf("vert %d attr %d: %f %f %f %f\n", - i, attr, data[0], data[1], data[2], data[3]); - - tg->attrib[attr].emit( data, dst ); } vert += tg->translate.key.output_stride; @@ -547,6 +565,21 @@ struct translate *translate_generic_create( const struct translate_key *key ) tg->attrib[i].emit = get_emit_func(key->element[i].output_format); tg->attrib[i].output_offset = key->element[i].output_offset; + tg->attrib[i].copy_size = -1; + if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID) + { + if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED + || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED) + tg->attrib[i].copy_size = 4; + } + else + { + if(key->element[i].input_format == key->element[i].output_format + && format_desc->block.width == 1 + && format_desc->block.height == 1 + && !(format_desc->block.bits & 7)) + tg->attrib[i].copy_size = format_desc->block.bits >> 3; + } } tg->nr_attrib = key->nr_elements; -- 1.7.0.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev