On 27 April 2016 at 06:07, Roland Scheidegger <srol...@vmware.com> wrote: > Am 26.04.2016 um 06:42 schrieb Dave Airlie: >> From: Dave Airlie <airl...@redhat.com> >> >> This enables ARB_compute_shader on softpipe. I've only >> tested this with piglit so far, and I hopefully plan >> on integrating it with my vulkan work. I'll get to >> testing it with deqp more later. >> >> The basic premise is to create up to 1024 restartable >> TGSI machines, and execute workgroups of those machines. >> >> Signed-off-by: Dave Airlie <airl...@redhat.com> >> --- >> src/gallium/drivers/softpipe/Makefile.sources | 1 + >> src/gallium/drivers/softpipe/sp_compute.c | 211 >> +++++++++++++++++++++++++ >> src/gallium/drivers/softpipe/sp_context.c | 3 + >> src/gallium/drivers/softpipe/sp_context.h | 4 +- >> src/gallium/drivers/softpipe/sp_screen.c | 48 +++++- >> src/gallium/drivers/softpipe/sp_state.h | 9 ++ >> src/gallium/drivers/softpipe/sp_state_shader.c | 51 ++++++ >> 7 files changed, 324 insertions(+), 3 deletions(-) >> create mode 100644 src/gallium/drivers/softpipe/sp_compute.c >> >> diff --git a/src/gallium/drivers/softpipe/Makefile.sources >> b/src/gallium/drivers/softpipe/Makefile.sources >> index 1d42351..d72266f 100644 >> --- a/src/gallium/drivers/softpipe/Makefile.sources >> +++ b/src/gallium/drivers/softpipe/Makefile.sources >> @@ -4,6 +4,7 @@ C_SOURCES := \ >> sp_clear.h \ >> sp_context.c \ >> sp_context.h \ >> + sp_compute.c \ >> sp_draw_arrays.c \ >> sp_fence.c \ >> sp_fence.h \ >> diff --git a/src/gallium/drivers/softpipe/sp_compute.c >> b/src/gallium/drivers/softpipe/sp_compute.c >> new file mode 100644 >> index 0000000..7467686 >> --- /dev/null >> +++ b/src/gallium/drivers/softpipe/sp_compute.c >> @@ -0,0 +1,211 @@ >> +#include "util/u_inlines.h" >> +#include "util/u_math.h" >> +#include "util/u_memory.h" >> +#include "util/u_pstipple.h" >> +#include "pipe/p_shader_tokens.h" >> +#include "draw/draw_context.h" >> +#include "draw/draw_vertex.h" >> +#include "sp_context.h" >> 
+#include "sp_screen.h" >> +#include "sp_state.h" >> +#include "sp_texture.h" >> +#include "sp_tex_sample.h" >> +#include "sp_tex_tile_cache.h" >> +#include "tgsi/tgsi_parse.h" >> + >> +static void >> +cs_prepare(const struct sp_compute_shader *cs, >> + struct tgsi_exec_machine *machine, >> + int w, int h, int d, >> + int g_w, int g_h, int g_d, >> + int b_w, int b_h, int b_d, >> + struct tgsi_sampler *sampler, >> + struct tgsi_image *image, >> + struct tgsi_buffer *buffer ) >> +{ >> + int j; >> + /* >> + * Bind tokens/shader to the interpreter's machine state. >> + */ >> + tgsi_exec_machine_bind_shader(machine, >> + cs->tokens, >> + sampler, image, buffer); >> + >> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID] != -1) { >> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID]; >> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >> + machine->SystemValue[i].xyzw[0].i[j] = w; >> + machine->SystemValue[i].xyzw[1].i[j] = h; >> + machine->SystemValue[i].xyzw[2].i[j] = d; >> + } >> + } >> + >> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_GRID_SIZE] != -1) { >> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_GRID_SIZE]; >> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >> + machine->SystemValue[i].xyzw[0].i[j] = g_w; >> + machine->SystemValue[i].xyzw[1].i[j] = g_h; >> + machine->SystemValue[i].xyzw[2].i[j] = g_d; >> + } >> + } >> + >> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_SIZE] != -1) { >> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_SIZE]; >> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >> + machine->SystemValue[i].xyzw[0].i[j] = b_w; >> + machine->SystemValue[i].xyzw[1].i[j] = b_h; >> + machine->SystemValue[i].xyzw[2].i[j] = b_d; >> + } >> + } >> +} >> + >> +static bool >> +cs_run(const struct sp_compute_shader *cs, >> + int g_w, int g_h, int g_d, >> + struct tgsi_exec_machine *machine, bool restart) >> +{ >> + if (!restart) { >> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_ID] != -1) { >> + unsigned i = 
machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_ID]; >> + int j; >> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >> + machine->SystemValue[i].xyzw[0].i[j] = g_w; >> + machine->SystemValue[i].xyzw[1].i[j] = g_h; >> + machine->SystemValue[i].xyzw[2].i[j] = g_d; >> + } >> + } >> + machine->NonHelperMask = (1 << 1) - 1; >> + } >> + >> + tgsi_exec_machine_run(machine, restart ? machine->pc : 0); >> + >> + if (machine->pc != -1) >> + return true; >> + return false; >> +} >> + >> +static void >> +run_workgroup(const struct sp_compute_shader *cs, >> + int g_w, int g_h, int g_d, int num_threads, >> + struct tgsi_exec_machine **machines) >> +{ >> + int i; >> + bool grp_hit_barrier, restart_threads = false; >> + >> + do { >> + grp_hit_barrier = false; >> + for (i = 0; i < num_threads; i++) { >> + grp_hit_barrier |= cs_run(cs, g_w, g_h, g_d, machines[i], >> restart_threads); >> + } >> + restart_threads = false; >> + if (grp_hit_barrier) { >> + grp_hit_barrier = false; >> + restart_threads = true; >> + } >> + } while (restart_threads); >> +} >> + >> +static void >> +cs_delete(const struct sp_compute_shader *cs, >> + struct tgsi_exec_machine *machine) >> +{ >> + if (machine->Tokens == cs->tokens) { >> + tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL, NULL); >> + } >> +} >> + >> +static void >> +fill_grid_size(struct pipe_context *context, >> + const struct pipe_grid_info *info, >> + uint32_t grid_size[3]) >> +{ >> + struct pipe_transfer *transfer; >> + uint32_t *params; >> + if (!info->indirect) { >> + grid_size[0] = info->grid[0]; >> + grid_size[1] = info->grid[1]; >> + grid_size[2] = info->grid[2]; >> + return; >> + } >> + params = pipe_buffer_map_range(context, info->indirect, >> + info->indirect_offset, >> + 3 * sizeof(uint32_t), >> + PIPE_TRANSFER_READ, >> + &transfer); >> + >> + if (!transfer) >> + return; >> + >> + grid_size[0] = params[0]; >> + grid_size[1] = params[1]; >> + grid_size[2] = params[2]; >> + pipe_buffer_unmap(context, transfer); >> +} >> + >> +void >> 
+softpipe_launch_grid(struct pipe_context *context, >> + const struct pipe_grid_info *info) >> +{ >> + struct softpipe_context *softpipe = softpipe_context(context); >> + struct sp_compute_shader *cs = softpipe->cs; >> + int num_threads_in_group; >> + struct tgsi_exec_machine **machines; >> + int bwidth, bheight, bdepth; >> + int w, h, d, i; >> + int g_w, g_h, g_d; >> + uint32_t grid_size[3]; >> + void *local_mem = NULL; >> + >> + bwidth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]; >> + bheight = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT]; >> + bdepth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; >> + num_threads_in_group = bwidth * bheight * bdepth; >> + >> + fill_grid_size(context, info, grid_size); >> + >> + if (cs->shader.req_local_mem) { >> + local_mem = CALLOC(1, cs->shader.req_local_mem); >> + } >> + >> + machines = CALLOC(sizeof(struct tgsi_exec_machine *), >> num_threads_in_group); >> + if (!machines) >> + return; >> + >> + /* initialise machines + GRID_SIZE + THREAD_ID + BLOCK_SIZE */ >> + for (d = 0; d < bdepth; d++) { >> + for (h = 0; h < bheight; h++) { >> + for (w = 0; w < bwidth; w++) { >> + int idx = w + (h * bwidth) + (d * bheight * bwidth); >> + machines[idx] = tgsi_exec_machine_create(PIPE_SHADER_COMPUTE); >> + >> + machines[idx]->LocalMem = local_mem; >> + machines[idx]->LocalMemSize = cs->shader.req_local_mem; >> + cs_prepare(cs, machines[idx], >> + w, h, d, >> + grid_size[0], grid_size[1], grid_size[2], >> + bwidth, bheight, bdepth, >> + (struct tgsi_sampler >> *)softpipe->tgsi.sampler[PIPE_SHADER_COMPUTE], >> + (struct tgsi_image >> *)softpipe->tgsi.image[PIPE_SHADER_COMPUTE], >> + (struct tgsi_buffer >> *)softpipe->tgsi.buffer[PIPE_SHADER_COMPUTE]); >> + tgsi_exec_set_constant_buffers(machines[idx], >> PIPE_MAX_CONSTANT_BUFFERS, >> + >> softpipe->mapped_constants[PIPE_SHADER_COMPUTE], >> + >> softpipe->const_buffer_size[PIPE_SHADER_COMPUTE]); >> + } >> + } >> + } >> + >> + for (g_d = 0; g_d < 
grid_size[2]; g_d++) { >> + for (g_h = 0; g_h < grid_size[1]; g_h++) { >> + for (g_w = 0; g_w < grid_size[0]; g_w++) { >> + run_workgroup(cs, g_w, g_h, g_d, num_threads_in_group, >> machines); >> + } >> + } >> + } >> + >> + for (i = 0; i < num_threads_in_group; i++) { >> + cs_delete(cs, machines[i]); >> + } >> + >> + FREE(local_mem); >> + FREE(machines); >> +} >> diff --git a/src/gallium/drivers/softpipe/sp_context.c >> b/src/gallium/drivers/softpipe/sp_context.c >> index e3ec524..1690e38 100644 >> --- a/src/gallium/drivers/softpipe/sp_context.c >> +++ b/src/gallium/drivers/softpipe/sp_context.c >> @@ -212,6 +212,7 @@ softpipe_create_context(struct pipe_screen *screen, >> >> softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE ); >> softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE ); >> + softpipe->dump_cs = debug_get_bool_option( "SOFTPIPE_DUMP_CS", FALSE ); >> >> softpipe->pipe.screen = screen; >> softpipe->pipe.destroy = softpipe_destroy; >> @@ -233,6 +234,8 @@ softpipe_create_context(struct pipe_screen *screen, >> >> softpipe->pipe.draw_vbo = softpipe_draw_vbo; >> >> + softpipe->pipe.launch_grid = softpipe_launch_grid; >> + >> softpipe->pipe.clear = softpipe_clear; >> softpipe->pipe.flush = softpipe_flush_wrapped; >> softpipe->pipe.texture_barrier = softpipe_texture_barrier; >> diff --git a/src/gallium/drivers/softpipe/sp_context.h >> b/src/gallium/drivers/softpipe/sp_context.h >> index 70d00c8..a57f587 100644 >> --- a/src/gallium/drivers/softpipe/sp_context.h >> +++ b/src/gallium/drivers/softpipe/sp_context.h >> @@ -71,6 +71,7 @@ struct softpipe_context { >> struct sp_geometry_shader *gs; >> struct sp_velems_state *velems; >> struct sp_so_state *so; >> + struct sp_compute_shader *cs; >> >> /** Other rendering state */ >> struct pipe_blend_color blend_color; >> @@ -205,10 +206,11 @@ struct softpipe_context { >> * XXX wouldn't it make more sense for the tile cache to just be part >> * of sp_sampler_view? 
>> */ >> - struct softpipe_tex_tile_cache >> *tex_cache[PIPE_SHADER_GEOMETRY+1][PIPE_MAX_SHADER_SAMPLER_VIEWS]; >> + struct softpipe_tex_tile_cache >> *tex_cache[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; >> >> unsigned dump_fs : 1; >> unsigned dump_gs : 1; >> + unsigned dump_cs : 1; >> unsigned no_rast : 1; >> }; >> >> diff --git a/src/gallium/drivers/softpipe/sp_screen.c >> b/src/gallium/drivers/softpipe/sp_screen.c >> index d89d95c..4beeb80 100644 >> --- a/src/gallium/drivers/softpipe/sp_screen.c >> +++ b/src/gallium/drivers/softpipe/sp_screen.c >> @@ -157,7 +157,7 @@ softpipe_get_param(struct pipe_screen *screen, enum >> pipe_cap param) >> case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: >> return 0; >> case PIPE_CAP_COMPUTE: >> - return 0; >> + return 1; >> case PIPE_CAP_USER_VERTEX_BUFFERS: >> case PIPE_CAP_USER_INDEX_BUFFERS: >> case PIPE_CAP_USER_CONSTANT_BUFFERS: >> @@ -289,6 +289,8 @@ softpipe_get_shader_param(struct pipe_screen *screen, >> unsigned shader, enum pipe >> { >> case PIPE_SHADER_FRAGMENT: >> return tgsi_exec_get_shader_param(param); >> + case PIPE_SHADER_COMPUTE: >> + return tgsi_exec_get_shader_param(param); >> case PIPE_SHADER_VERTEX: >> case PIPE_SHADER_GEOMETRY: >> if (sp_screen->use_llvm) >> @@ -447,6 +449,48 @@ softpipe_get_timestamp(struct pipe_screen *_screen) >> return os_time_get_nano(); >> } >> >> +static int >> +softpipe_get_compute_param(struct pipe_screen *_screen, >> + enum pipe_shader_ir ir_type, >> + enum pipe_compute_cap param, >> + void *ret) >> +{ >> + switch (param) { >> + case PIPE_COMPUTE_CAP_IR_TARGET: >> + return 0; >> + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: >> + if (ret) { >> + uint64_t *grid_size = ret; >> + grid_size[0] = 65535; >> + grid_size[1] = 65535; >> + grid_size[2] = 65535; >> + } >> + return 3 * sizeof(uint64_t) ; >> + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: >> + if (ret) { >> + uint64_t *block_size = ret; >> + block_size[0] = 1024; >> + block_size[1] = 1024; >> + block_size[2] = 1024; >> + } 
>> + return 3 * sizeof(uint64_t); >> + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: >> + if (ret) { >> + uint64_t *max_threads_per_block = ret; >> + *max_threads_per_block = 2048; >> + } >> + return sizeof(uint64_t); >> + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: >> + if (ret) { >> + uint64_t *max_local_size = ret; >> + /* Value reported by the closed source driver. */ > > The comment here doesn't make much sense... > > 1024 interpreted tgsi machines, all running serially - I'm sure > performance is going to be amazing. > > But the approach looks reasonable to me. > > I'm not really familiar with compute shaders, but what I'm wondering is > since tgsi exec always operates on 4 values at a time, is that somehow > implicit in compute shaders?
So far I've set the execmask to 1 active channel; I'm contemplating changing that, though, and using fewer machines. Any ideas how to implement this in llvm? :-) 1024 CPU threads? Dave. _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev