Move the compute param queries (get_compute_param) to the per-generation backend, since these limits are likely to be fairly generation-specific, and that is nicer than having the logic split between freedreno_screen (for the global case) and fd5_compute (for the kernel-specific limits case).
Signed-off-by: Rob Clark <robdcl...@gmail.com> --- Not totally working yet, so there might still be another constraint about max # of threads in a WG that I don't understand yet. The blob *mostly* seems to follow the formula that: num_threads <= 1024 num_threads * ((2 * num_regs) + num_half_regs) <= 8192 Except in a few cases where it uses a lower value for some reason. And in practice this formula mostly seems to work, except in a few cases where the GPU still locks up. But regardless, the first patch in the series is the right thing to do. src/gallium/drivers/freedreno/a5xx/fd5_compute.c | 126 ++++++++++++++++++++++- src/gallium/drivers/freedreno/a5xx/fd5_compute.h | 2 + src/gallium/drivers/freedreno/a5xx/fd5_program.c | 4 + src/gallium/drivers/freedreno/a5xx/fd5_screen.c | 2 + src/gallium/drivers/freedreno/freedreno_screen.c | 75 ++------------ src/gallium/drivers/freedreno/freedreno_screen.h | 9 ++ src/gallium/drivers/freedreno/ir3/ir3_shader.c | 2 + 7 files changed, 152 insertions(+), 68 deletions(-) diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c index 9d3039c3805..52b60e0c5e2 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c @@ -68,6 +68,124 @@ fd5_delete_compute_state(struct pipe_context *pctx, void *hwcso) free(so); } +// TODO move this somewhere to be shared with fd5_program.c.. +static unsigned +max_threads(struct ir3_info *info) +{ + /* blob seems to advertise 1024 as max threads for all a5xx. Either + * that is wrong, or when they scale up/down the number of shader core + * units it is always a multiples of a thing that can (in best case) + * run 1024 threads. (Ie. the bigger variants can run 4 or however + * many blocks at a time, while the smaller could only run 1 or 2). 
+ */ + unsigned threads = 1024; + + if (info) { + unsigned hregs; + + /* seems like we have 1024 threads and 4096 full registers (or + * 8192 half-regs), once a shader is using more than 4 full regs + * it starts to cut down on threads in flight: + * + * XXX maybe this is 3k full / 6k half registers.. + */ + hregs = (2 * (info->max_reg + 1)) + (info->max_half_reg + 1); + threads /= DIV_ROUND_UP(hregs, 8); + } + + return threads; +} + +#define RET(x) do { \ + if (ret) \ + memcpy(ret, x, sizeof(x)); \ + return sizeof(x); \ +} while (0); break; + +int +fd5_get_compute_param(struct fd_screen *screen, enum pipe_compute_cap param, + void *hwcso, void *ret) +{ + const char * const ir = "ir3"; + /* blob seems to advertise 1024 as max threads for all a5xx. Either + * that is wrong, or when they scale up/down the number of shader core + * units it is always a multiples of a thing that can (in best case) + * run 1024 threads. (Ie. the bigger variants can run 4 or however + * many blocks at a time, while the smaller could only run 1 or 2). 
+ */ + unsigned threads; + + // XXX blob appears to not care unless there is a barrier instruction + if (hwcso) { + struct fd5_compute_stateobj *so = hwcso; + struct ir3_shader_key key = {0}; + struct ir3_shader_variant *v; + + v = ir3_shader_variant(so->shader, key, NULL); + + threads = max_threads(&v->info); + } else { + threads = max_threads(NULL); + } + + switch (param) { + case PIPE_COMPUTE_CAP_ADDRESS_BITS: +// don't expose 64b pointer support yet, until ir3 supports 64b +// math, otherwise spir64 target is used and we get 64b pointer +// calculations that we can't do yet +// if (is_a5xx(screen)) +// RET((uint32_t []){ 64 }); + RET((uint32_t []){ 32 }); + + case PIPE_COMPUTE_CAP_IR_TARGET: + if (ret) + sprintf(ret, ir); + return strlen(ir) * sizeof(char); + + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + RET((uint64_t []) { 3 }); + + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + RET(((uint64_t []) { 65535, 65535, 65535 })); + + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + RET(((uint64_t []) { threads, threads, threads })); + + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + RET((uint64_t []) { threads }); + + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + RET((uint64_t []) { screen->ram_size }); + + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + RET((uint64_t []) { 32768 }); + + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + RET((uint64_t []) { 4096 }); + + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + RET((uint64_t []) { screen->ram_size }); + + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + RET((uint32_t []) { screen->max_freq / 1000000 }); + + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + RET((uint32_t []) { 9999 }); // TODO + + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + RET((uint32_t []) { 0 }); + + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + RET((uint32_t []) { 32 }); // TODO + + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + RET((uint64_t []) { 1024 }); // TODO + } + + return 0; +} + /* maybe move to fd5_program? 
*/ static void cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v) @@ -76,7 +194,7 @@ cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v) enum a3xx_threadsize thrsz; /* note: blob uses local_size_x/y/z threshold to choose threadsize: */ - thrsz = FOUR_QUADS; + thrsz = (max_threads(&v->info) < 1024) ? TWO_QUADS : FOUR_QUADS; OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1); OUT_RING(ring, 0x00000000); /* SP_SP_CNTL */ @@ -214,9 +332,9 @@ fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6 */ OUT_PKT4(ring, REG_A5XX_HLSQ_CS_KERNEL_GROUP_X, 3); - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */ - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */ - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */ + OUT_RING(ring, num_groups[0]); /* HLSQ_CS_KERNEL_GROUP_X */ + OUT_RING(ring, num_groups[1]); /* HLSQ_CS_KERNEL_GROUP_Y */ + OUT_RING(ring, num_groups[2]); /* HLSQ_CS_KERNEL_GROUP_Z */ if (info->indirect) { struct fd_resource *rsc = fd_resource(info->indirect); diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.h b/src/gallium/drivers/freedreno/a5xx/fd5_compute.h index d5cc8b8a0ca..ae03c2bc374 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.h @@ -29,6 +29,8 @@ #include "pipe/p_context.h" +int fd5_get_compute_param(struct fd_screen *screen, enum pipe_compute_cap param, + void *hwcso, void *ret); void fd5_compute_init(struct pipe_context *pctx); #endif /* FD5_COMPUTE_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c index 81fe7d4b582..886589fdb9d 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c @@ -337,6 +337,10 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, setup_stages(emit, s); + // should also consider half-regs.. 
but if # of registers used + // means that we only have 512 or fewer threads in flight, then + // use TWO_QUAD mode to reduce branch divergence penalty. See + // the calculation used for cs_program_emit() fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS; pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS); diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c index 7d7e76e869c..3e21030333e 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c @@ -29,6 +29,7 @@ #include "fd5_screen.h" #include "fd5_blitter.h" +#include "fd5_compute.h" #include "fd5_context.h" #include "fd5_format.h" #include "fd5_resource.h" @@ -109,6 +110,7 @@ fd5_screen_init(struct pipe_screen *pscreen) struct fd_screen *screen = fd_screen(pscreen); screen->max_rts = A5XX_MAX_RENDER_TARGETS; screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id); + screen->get_compute_param = fd5_get_compute_param; pscreen->context_create = fd5_context_create; pscreen->is_format_supported = fd5_screen_is_format_supported; diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 6f35d5dafbd..f4fdcef9ee3 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -622,81 +622,27 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, return 0; } -/* TODO depending on how much the limits differ for a3xx/a4xx, maybe move this - * into per-generation backend? 
- */ static int fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type, enum pipe_compute_cap param, void *ret) { struct fd_screen *screen = fd_screen(pscreen); - const char * const ir = "ir3"; - if (!has_compute(screen)) + if (!screen->get_compute_param) return 0; -#define RET(x) do { \ - if (ret) \ - memcpy(ret, x, sizeof(x)); \ - return sizeof(x); \ -} while (0); break; - - switch (param) { - case PIPE_COMPUTE_CAP_ADDRESS_BITS: -// don't expose 64b pointer support yet, until ir3 supports 64b -// math, otherwise spir64 target is used and we get 64b pointer -// calculations that we can't do yet -// if (is_a5xx(screen)) -// RET((uint32_t []){ 64 }); - RET((uint32_t []){ 32 }); - - case PIPE_COMPUTE_CAP_IR_TARGET: - if (ret) - sprintf(ret, ir); - return strlen(ir) * sizeof(char); - - case PIPE_COMPUTE_CAP_GRID_DIMENSION: - RET((uint64_t []) { 3 }); - - case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: - RET(((uint64_t []) { 65535, 65535, 65535 })); - - case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: - RET(((uint64_t []) { 256, 256, 256 })); - - case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: - RET((uint64_t []) { 256 }); - - case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: - RET((uint64_t []) { screen->ram_size }); - - case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: - RET((uint64_t []) { 32768 }); - - case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: - case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: - RET((uint64_t []) { 4096 }); - - case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: - RET((uint64_t []) { screen->ram_size }); - - case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: - RET((uint32_t []) { screen->max_freq / 1000000 }); - - case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: - RET((uint32_t []) { 9999 }); // TODO - - case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: - RET((uint32_t []) { 0 }); + return screen->get_compute_param(screen, param, NULL, ret); +} - case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: - RET((uint32_t []) { 32 }); // TODO +static int fd_get_kernel_param(struct pipe_screen *pscreen, void *hwcso, + enum pipe_compute_cap 
param, void *ret) +{ + struct fd_screen *screen = fd_screen(pscreen); - case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: - RET((uint64_t []) { 1024 }); // TODO - } + if (!screen->get_compute_param) + return 0; - return 0; + return screen->get_compute_param(screen, param, hwcso, ret); } static const void * @@ -906,6 +852,7 @@ fd_screen_create(struct fd_device *dev) pscreen->get_paramf = fd_screen_get_paramf; pscreen->get_shader_param = fd_screen_get_shader_param; pscreen->get_compute_param = fd_get_compute_param; + pscreen->get_kernel_param = fd_get_kernel_param; pscreen->get_compiler_options = fd_get_compiler_options; fd_resource_screen_init(pscreen); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index 6be739ae287..e2c481074ff 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -84,6 +84,15 @@ struct fd_screen { uint32_t (*setup_slices)(struct fd_resource *rsc); unsigned (*tile_mode)(const struct pipe_resource *prsc); + /* for backends that support compute, access compute param. If hwcso + * is not NULL, then it is the compute_state cso, in which case the + * returned param value should take into account limits imposed by + * resources used by compute shader, such as # of registers used. + * Otherwise the best-case value is returned. 
+ */ + int (*get_compute_param)(struct fd_screen *screen, enum pipe_compute_cap param, + void *hwcso, void *ret); + int64_t cpu_gpu_time_delta; struct fd_batch_cache batch_cache; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 3a2c06f5963..55f28e0eee4 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -353,6 +353,8 @@ ir3_shader_create_compute(struct ir3_compiler *compiler, shader->compiler = compiler; shader->id = ++shader->compiler->shader_count; shader->type = SHADER_COMPUTE; + // TODO if we figure this out by scanning input params we could + // avoid a shader recompile by dropping PIPE_SHADER_DEP_INPUT_MEM shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */ // TODO we need a way to differentiate clover vs glsl compute! -- 2.14.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev