FWIW this change is built on top of "llvmpipe: Remove x/y from cmd_bin", which was discussed on the list but hasn't been committed yet.
Roland Am 29.05.2013 03:41, schrieb srol...@vmware.com: > From: Roland Scheidegger <srol...@vmware.com> > > The overallocation was very bad, especially for things like 1d array > textures which got blown up by a factor of 64. (Even ordinary smallish > 2d textures benefit a lot from this, a mipmapped 64x64 rgba8 texture > previously used 7*16kB = 112kB instead of now ~22kB.) > 4x4 is chosen because this is the size the jit functions run on, so > making it smaller is going to be a bit more complicated. > It is actually not strictly 4x4 pixels, since we'd want to avoid situations > where different threads are rendering to the same cacheline, so we keep > cacheline size alignment in x direction (often 64 bytes). > To make this work, introduce new task width/height parameters and make > sure clears don't clear the whole tile if it's a partial tile. Likewise, > the rasterizer may produce fragments outside the 4x4 blocks present in a > tile, so don't call the jit function for them. > This does not yet fix rendering to buffers (which cannot have any y > alignment at all), and 1d/1d array textures are still overallocated by a > factor of 4. 
> --- > src/gallium/drivers/llvmpipe/lp_rast.c | 56 > ++++++++++++++++----------- > src/gallium/drivers/llvmpipe/lp_rast_priv.h | 37 +++++++++++------- > src/gallium/drivers/llvmpipe/lp_scene.c | 2 + > src/gallium/drivers/llvmpipe/lp_scene.h | 4 ++ > src/gallium/drivers/llvmpipe/lp_setup.c | 3 +- > src/gallium/drivers/llvmpipe/lp_texture.c | 26 ++++++------- > 6 files changed, 75 insertions(+), 53 deletions(-) > > diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c > b/src/gallium/drivers/llvmpipe/lp_rast.c > index 5c837a0..be5a286 100644 > --- a/src/gallium/drivers/llvmpipe/lp_rast.c > +++ b/src/gallium/drivers/llvmpipe/lp_rast.c > @@ -95,6 +95,10 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, > task->bin = bin; > task->x = x * TILE_SIZE; > task->y = y * TILE_SIZE; > + task->width = TILE_SIZE + x * TILE_SIZE > task->scene->width_aligned ? > + task->scene->width_aligned - x * TILE_SIZE : TILE_SIZE; > + task->height = TILE_SIZE + y * TILE_SIZE > task->scene->height_aligned ? > + task->scene->height_aligned - y * TILE_SIZE : TILE_SIZE; > > /* reset pointers to color and depth tile(s) */ > memset(task->color_tiles, 0, sizeof(task->color_tiles)); > @@ -144,8 +148,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, > scene->cbufs[i].stride, > task->x, > task->y, > - TILE_SIZE, > - TILE_SIZE, > + task->width, > + task->height, > &uc); > } > } > @@ -172,8 +176,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, > scene->cbufs[i].stride, > task->x, > task->y, > - TILE_SIZE, > - TILE_SIZE, > + task->width, > + task->height, > &uc); > } > } > @@ -198,8 +202,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, > uint64_t clear_mask64 = arg.clear_zstencil.mask; > uint32_t clear_value = (uint32_t) clear_value64; > uint32_t clear_mask = (uint32_t) clear_mask64; > - const unsigned height = TILE_SIZE; > - const unsigned width = TILE_SIZE; > + const unsigned height = task->height; > + const unsigned width = task->width; > const unsigned block_size = 
scene->zsbuf.blocksize; > const unsigned dst_stride = scene->zsbuf.stride; > uint8_t *dst; > @@ -325,8 +329,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, > variant = state->variant; > > /* render the whole 64x64 tile in 4x4 chunks */ > - for (y = 0; y < TILE_SIZE; y += 4){ > - for (x = 0; x < TILE_SIZE; x += 4) { > + for (y = 0; y < task->height; y += 4){ > + for (x = 0; x < task->width; x += 4) { > uint8_t *color[PIPE_MAX_COLOR_BUFS]; > unsigned stride[PIPE_MAX_COLOR_BUFS]; > uint8_t *depth = NULL; > @@ -434,21 +438,27 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task > *task, > > assert(lp_check_alignment(state->jit_context.u8_blend_color, 16)); > > - /* run shader on 4x4 block */ > - BEGIN_JIT_CALL(state, task); > - variant->jit_function[RAST_EDGE_TEST](&state->jit_context, > - x, y, > - inputs->frontfacing, > - GET_A0(inputs), > - GET_DADX(inputs), > - GET_DADY(inputs), > - color, > - depth, > - mask, > - &task->thread_data, > - stride, > - depth_stride); > - END_JIT_CALL(); > + /* > + * The rasterizer may produce fragments outside our > + * allocated 4x4 blocks hence need to filter them out here. 
> + */ > + if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) { > + /* run shader on 4x4 block */ > + BEGIN_JIT_CALL(state, task); > + variant->jit_function[RAST_EDGE_TEST](&state->jit_context, > + x, y, > + inputs->frontfacing, > + GET_A0(inputs), > + GET_DADX(inputs), > + GET_DADY(inputs), > + color, > + depth, > + mask, > + &task->thread_data, > + stride, > + depth_stride); > + END_JIT_CALL(); > + } > } > > > diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h > b/src/gallium/drivers/llvmpipe/lp_rast_priv.h > index e4b6e5b..4876d74 100644 > --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h > +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h > @@ -86,6 +86,7 @@ struct lp_rasterizer_task > > struct lp_scene *scene; > unsigned x, y; /**< Pos of this tile in framebuffer, in pixels */ > + unsigned width, height; /**< width, height of current tile, in pixels */ > > uint8_t *color_tiles[PIPE_MAX_COLOR_BUFS]; > uint8_t *depth_tile; > @@ -293,21 +294,27 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task > *task, > depth_stride = scene->zsbuf.stride; > } > > - /* run shader on 4x4 block */ > - BEGIN_JIT_CALL(state, task); > - variant->jit_function[RAST_WHOLE]( &state->jit_context, > - x, y, > - inputs->frontfacing, > - GET_A0(inputs), > - GET_DADX(inputs), > - GET_DADY(inputs), > - color, > - depth, > - 0xffff, > - &task->thread_data, > - stride, > - depth_stride); > - END_JIT_CALL(); > + /* > + * The rasterizer may produce fragments outside our > + * allocated 4x4 blocks hence need to filter them out here. 
> + */ > + if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) { > + /* run shader on 4x4 block */ > + BEGIN_JIT_CALL(state, task); > + variant->jit_function[RAST_WHOLE]( &state->jit_context, > + x, y, > + inputs->frontfacing, > + GET_A0(inputs), > + GET_DADX(inputs), > + GET_DADY(inputs), > + color, > + depth, > + 0xffff, > + &task->thread_data, > + stride, > + depth_stride); > + END_JIT_CALL(); > + } > } > > void lp_rast_triangle_1( struct lp_rasterizer_task *, > diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c > b/src/gallium/drivers/llvmpipe/lp_scene.c > index 771ad08..1d0dbdf 100644 > --- a/src/gallium/drivers/llvmpipe/lp_scene.c > +++ b/src/gallium/drivers/llvmpipe/lp_scene.c > @@ -505,6 +505,8 @@ void lp_scene_begin_binning( struct lp_scene *scene, > > scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE; > scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE; > + scene->width_aligned = align(fb->width, 4); > + scene->height_aligned = align(fb->height, 4); > > assert(scene->tiles_x <= TILES_X); > assert(scene->tiles_y <= TILES_Y); > diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h > b/src/gallium/drivers/llvmpipe/lp_scene.h > index fa5bbca..bc6c448 100644 > --- a/src/gallium/drivers/llvmpipe/lp_scene.h > +++ b/src/gallium/drivers/llvmpipe/lp_scene.h > @@ -144,6 +144,10 @@ struct lp_scene { > /** list of resources referenced by the scene commands */ > struct resource_ref *resources; > > + /** aligned scene width, height */ > + unsigned width_aligned; > + unsigned height_aligned; > + > /** Total memory used by the scene (in bytes). This sums all the > * data blocks and counts all bins, state, resource references and > * other random allocations within the scene. 
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c > b/src/gallium/drivers/llvmpipe/lp_setup.c > index eb39b1c..8f2baf0 100644 > --- a/src/gallium/drivers/llvmpipe/lp_setup.c > +++ b/src/gallium/drivers/llvmpipe/lp_setup.c > @@ -691,8 +691,7 @@ lp_setup_set_fragment_sampler_views(struct > lp_setup_context *setup, > assert(last_level <= res->last_level); > > /* > - * The complexity here is only necessary for depth textures > which > - * still are tiled. > + * The complexity here should no longer be necessary. > */ > mip_ptr = llvmpipe_get_texture_image_all(lp_tex, first_level, > LP_TEX_USAGE_READ); > diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c > b/src/gallium/drivers/llvmpipe/lp_texture.c > index 0ac3528..8e9ebdf 100644 > --- a/src/gallium/drivers/llvmpipe/lp_texture.c > +++ b/src/gallium/drivers/llvmpipe/lp_texture.c > @@ -84,15 +84,15 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen, > { > unsigned alignment, nblocksx, nblocksy, block_size; > > - /* For non-compressed formats we need to align the texture size > - * to the tile size to facilitate render-to-texture. > - * XXX this blows up 1d/1d array textures by unreasonable > - * amount (factor 64), probably should do something about it. > + /* For non-compressed formats we need 4x4 pixel alignment > + * (for now). We also want cache line size in x direction, > + * otherwise same cache line could end up in multiple threads. > + * XXX this blows up 1d/1d array textures by a factor of 4. 
> */ > if (util_format_is_compressed(pt->format)) > alignment = 1; > else > - alignment = TILE_SIZE; > + alignment = 4; > > nblocksx = util_format_get_nblocksx(pt->format, > align(width, alignment)); > @@ -100,7 +100,10 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen, > align(height, alignment)); > block_size = util_format_get_blocksize(pt->format); > > - lpr->row_stride[level] = align(nblocksx * block_size, 16); > + if (util_format_is_compressed(pt->format)) > + lpr->row_stride[level] = nblocksx * block_size; > + else > + lpr->row_stride[level] = align(nblocksx * block_size, > util_cpu_caps.cacheline); > > /* if row_stride * height > LP_MAX_TEXTURE_SIZE */ > if (lpr->row_stride[level] > LP_MAX_TEXTURE_SIZE / nblocksy) { > @@ -244,7 +247,8 @@ llvmpipe_resource_create(struct pipe_screen *_screen, > assert(templat->height0 == 1); > assert(templat->depth0 == 1); > assert(templat->last_level == 0); > - lpr->data = align_malloc(bytes, 16); > + /* align to 64 bytes (4xfloat4) in case we render to them */ > + lpr->data = align_malloc(bytes, 64); > /* > * buffers don't really have stride but it's probably safer > * (for code doing same calculations for buffers and textures) > @@ -327,7 +331,6 @@ llvmpipe_resource_map(struct pipe_resource *resource, > struct llvmpipe_screen *screen = llvmpipe_screen(resource->screen); > struct sw_winsys *winsys = screen->winsys; > unsigned dt_usage; > - uint8_t *map2; > > if (tex_usage == LP_TEX_USAGE_READ) { > dt_usage = PIPE_TRANSFER_READ; > @@ -345,14 +348,11 @@ llvmpipe_resource_map(struct pipe_resource *resource, > /* install this linear image in texture data structure */ > lpr->linear_img.data = map; > > - /* make sure tiled data gets converted to linear data */ > - map2 = llvmpipe_get_texture_image(lpr, 0, 0, tex_usage); > - return map2; > + return map; > } > else if (llvmpipe_resource_is_texture(resource)) { > > - map = llvmpipe_get_texture_image(lpr, layer, level, > - tex_usage); > + map = llvmpipe_get_texture_image(lpr, 
layer, level, tex_usage); > return map; > } > else { > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev