[Mesa-dev] [PATCH 2/2] ac, radeonsi: reduce optimizations for complex compute shaders on older APUs

Marek Olšák Thu, 19 Jul 2018 20:13:12 -0700

From: Marek Olšák <marek.ol...@amd.com>

This makes dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.23
finish compiling sooner on the slow CPUs of older APUs. Otherwise the test
gets killed and we fail it.
---
 src/amd/common/ac_llvm_util.c                 | 18 ++++++++++--
 src/amd/common/ac_llvm_util.h                 | 11 ++++++-
 src/gallium/drivers/radeonsi/si_pipe.c        | 12 +++++++-
 src/gallium/drivers/radeonsi/si_shader.c      | 29 +++++++++++++++----
 .../drivers/radeonsi/si_shader_internal.h     |  3 +-
 .../drivers/radeonsi/si_shader_tgsi_setup.c   |  8 +++--
 6 files changed, 68 insertions(+), 13 deletions(-)


diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 0c8dbf1ec51..b6960f7382d 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -130,20 +130,21 @@ const char *ac_get_llvm_processor_name(enum radeon_family 
family)
                return HAVE_LLVM >= 0x0700 ? "gfx904" : "gfx902";
        case CHIP_VEGA20:
                return HAVE_LLVM >= 0x0700 ? "gfx906" : "gfx902";
        default:
                return "";
        }
 }
 
 static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
                                                     enum 
ac_target_machine_options tm_options,
+                                                    LLVMCodeGenOptLevel level,
                                                     const char **out_triple)
 {
        assert(family >= CHIP_TAHITI);
        char features[256];
        const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? 
"amdgcn-mesa-mesa3d" : "amdgcn--";
        LLVMTargetRef target = ac_get_llvm_target(triple);
        bool barrier_does_waitcnt = family != CHIP_VEGA20;
 
        snprintf(features, sizeof(features),
                 
"+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s%s%s",
@@ -151,21 +152,21 @@ static LLVMTargetMachineRef ac_create_target_machine(enum 
radeon_family family,
                 tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "",
                 tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "",
                 tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? 
",-promote-alloca" : "",
                 barrier_does_waitcnt ? ",+auto-waitcnt-before-barrier" : "");
        
        LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
                                     target,
                                     triple,
                                     ac_get_llvm_processor_name(family),
                                     features,
-                                    LLVMCodeGenLevelDefault,
+                                    level,
                                     LLVMRelocDefault,
                                     LLVMCodeModelDefault);
 
        if (out_triple)
                *out_triple = triple;
        return tm;
 }
 
 static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef 
target_library_info,
                                            bool check_ir)
@@ -294,25 +295,34 @@ ac_count_scratch_private_memory(LLVMValueRef function)
 
 bool
 ac_init_llvm_compiler(struct ac_llvm_compiler *compiler,
                      bool okay_to_leak_target_library_info,
                      enum radeon_family family,
                      enum ac_target_machine_options tm_options)
 {
        const char *triple;
        memset(compiler, 0, sizeof(*compiler));
 
-       compiler->tm = ac_create_target_machine(family,
-                                           tm_options, &triple);
+       compiler->tm = ac_create_target_machine(family, tm_options,
+                                               LLVMCodeGenLevelDefault,
+                                               &triple);
        if (!compiler->tm)
                return false;
 
+       if (tm_options & AC_TM_CREATE_LOW_OPT) {
+               compiler->low_opt_tm =
+                       ac_create_target_machine(family, tm_options,
+                                                LLVMCodeGenLevelLess, NULL);
+               if (!compiler->low_opt_tm)
+                       goto fail;
+       }
+
        if (okay_to_leak_target_library_info || (HAVE_LLVM >= 0x0700)) {
                compiler->target_library_info =
                        ac_create_target_library_info(triple);
                if (!compiler->target_library_info)
                        goto fail;
        }
 
        compiler->passmgr = ac_create_passmgr(compiler->target_library_info,
                                              tm_options & AC_TM_CHECK_IR);
        if (!compiler->passmgr)
@@ -327,13 +337,15 @@ fail:
 void
 ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler)
 {
        if (compiler->passmgr)
                LLVMDisposePassManager(compiler->passmgr);
 #if HAVE_LLVM >= 0x0700
        /* This crashes on LLVM 5.0 and 6.0 and Ubuntu 18.04, so leak it there. 
*/
        if (compiler->target_library_info)
                ac_dispose_target_library_info(compiler->target_library_info);
 #endif
+       if (compiler->low_opt_tm)
+               LLVMDisposeTargetMachine(compiler->low_opt_tm);
        if (compiler->tm)
                LLVMDisposeTargetMachine(compiler->tm);
 }
diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
index e5b93037d26..c0e759b8836 100644
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -56,34 +56,43 @@ enum ac_func_attr {
        AC_FUNC_ATTR_LEGACY       = (1u << 31),
 };
 
 enum ac_target_machine_options {
        AC_TM_SUPPORTS_SPILL = (1 << 0),
        AC_TM_SISCHED = (1 << 1),
        AC_TM_FORCE_ENABLE_XNACK = (1 << 2),
        AC_TM_FORCE_DISABLE_XNACK = (1 << 3),
        AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 4),
        AC_TM_CHECK_IR = (1 << 5),
+       AC_TM_CREATE_LOW_OPT = (1 << 6),
 };
 
 enum ac_float_mode {
        AC_FLOAT_MODE_DEFAULT,
        AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH,
        AC_FLOAT_MODE_UNSAFE_FP_MATH,
 };
 
 /* Per-thread persistent LLVM objects. */
 struct ac_llvm_compiler {
-       LLVMTargetMachineRef            tm;
        LLVMTargetLibraryInfoRef        target_library_info;
        LLVMPassManagerRef              passmgr;
+
+       /* Default compiler. */
+       LLVMTargetMachineRef            tm;
        struct ac_compiler_passes       *passes;
+
+       /* Optional compiler for faster compilation with fewer optimizations.
+        * LLVM modules can be created with "tm" too. There is no difference.
+        */
+       LLVMTargetMachineRef            low_opt_tm; /* uses -O1 instead of -O2 
*/
+       struct ac_compiler_passes       *low_opt_passes;
 };
 
 const char *ac_get_llvm_processor_name(enum radeon_family family);
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
 bool ac_is_sgpr_param(LLVMValueRef param);
 void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function,
                           int attr_idx, enum ac_func_attr attr);
 void ac_add_func_attributes(LLVMContextRef ctx, LLVMValueRef function,
                            unsigned attrib_mask);
 void ac_dump_module(LLVMModuleRef module);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 22e333aec77..4f00eb5c2e2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -100,35 +100,45 @@ static const struct debug_named_value debug_options[] = {
        { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and 
exit." },
        { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault 
test and exit." },
        { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM 
fault test and exit." },
 
        DEBUG_NAMED_VALUE_END /* must be last */
 };
 
 static void si_init_compiler(struct si_screen *sscreen,
                             struct ac_llvm_compiler *compiler)
 {
+       /* Only create the less-optimizing version of the compiler on APUs
+        * predating Ryzen (Raven). */
+       bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram &&
+                                      sscreen->info.chip_class <= VI;
+
        enum ac_target_machine_options tm_options =
                (sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) |
                (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 
0) |
                (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 
0) |
                (!sscreen->llvm_has_working_vgpr_indexing ? 
AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
-               (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0);
+               (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
+               (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
 
        ac_init_llvm_once();
        ac_init_llvm_compiler(compiler, true, sscreen->info.family, tm_options);
        compiler->passes = ac_create_llvm_passes(compiler->tm);
+
+       if (compiler->low_opt_tm)
+               compiler->low_opt_passes = 
ac_create_llvm_passes(compiler->low_opt_tm);
 }
 
 static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
 {
        ac_destroy_llvm_passes(compiler->passes);
+       ac_destroy_llvm_passes(compiler->low_opt_passes);
        ac_destroy_llvm_compiler(compiler);
 }
 
 /*
  * pipe_context
  */
 static void si_destroy_context(struct pipe_context *context)
 {
        struct si_context *sctx = (struct si_context *)context;
        int i;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 43ba23ff494..405833d3ba7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5638,21 +5638,22 @@ void si_shader_dump(struct si_screen *sscreen, const 
struct si_shader *shader,
                             check_debug_option);
 }
 
 static int si_compile_llvm(struct si_screen *sscreen,
                           struct ac_shader_binary *binary,
                           struct si_shader_config *conf,
                           struct ac_llvm_compiler *compiler,
                           LLVMModuleRef mod,
                           struct pipe_debug_callback *debug,
                           unsigned processor,
-                          const char *name)
+                          const char *name,
+                          bool less_optimized)
 {
        int r = 0;
        unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
 
        if (si_can_dump_shader(sscreen, processor)) {
                fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
 
                if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
                        fprintf(stderr, "%s LLVM IR:\n\n", name);
                        ac_dump_module(mod);
@@ -5660,21 +5661,22 @@ static int si_compile_llvm(struct si_screen *sscreen,
                }
        }
 
        if (sscreen->record_llvm_ir) {
                char *ir = LLVMPrintModuleToString(mod);
                binary->llvm_ir_string = strdup(ir);
                LLVMDisposeMessage(ir);
        }
 
        if (!si_replace_shader(count, binary)) {
-               r = si_llvm_compile(mod, binary, compiler, debug);
+               r = si_llvm_compile(mod, binary, compiler, debug,
+                                   less_optimized);
                if (r)
                        return r;
        }
 
        si_shader_binary_read_config(binary, conf, 0);
 
        /* Enable 64-bit and 16-bit denormals, because there is no performance
         * cost.
         *
         * If denormals are enabled, all floating-point output modifiers are
@@ -5877,21 +5879,21 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 
        LLVMBuildRetVoid(ctx.ac.builder);
 
        ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
        si_llvm_optimize_module(&ctx);
 
        r = si_compile_llvm(sscreen, &ctx.shader->binary,
                            &ctx.shader->config, ctx.compiler,
                            ctx.ac.module,
                            debug, PIPE_SHADER_GEOMETRY,
-                           "GS Copy Shader");
+                           "GS Copy Shader", false);
        if (!r) {
                if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
                        fprintf(stderr, "GS Copy Shader:\n");
                si_shader_dump(sscreen, ctx.shader, debug,
                               PIPE_SHADER_GEOMETRY, stderr, true);
                r = si_shader_binary_upload(sscreen, ctx.shader);
        }
 
        si_llvm_dispose(&ctx);
 
@@ -6783,20 +6785,36 @@ static void si_build_wrapper_function(struct 
si_shader_context *ctx,
                                        assert(num_out_sgpr + 1 == num_out);
                                        num_out_sgpr = num_out;
                                }
                        }
                }
        }
 
        LLVMBuildRetVoid(builder);
 }
 
+static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
+                                   struct si_shader_selector *sel)
+{
+       if (!compiler->low_opt_passes)
+               return false;
+
+       /* Assume a slow CPU. */
+       assert(!sel->screen->info.has_dedicated_vram &&
+              sel->screen->info.chip_class <= VI);
+
+       /* For a crazy dEQP test containing 2597 memory opcodes, mostly
+        * buffer stores. */
+       return sel->type == PIPE_SHADER_COMPUTE &&
+              sel->info.num_memory_instructions > 1000;
+}
+
 int si_compile_tgsi_shader(struct si_screen *sscreen,
                           struct ac_llvm_compiler *compiler,
                           struct si_shader *shader,
                           struct pipe_debug_callback *debug)
 {
        struct si_shader_selector *sel = shader->selector;
        struct si_shader_context ctx;
        int r = -1;
 
        /* Dump TGSI code before doing TGSI->LLVM conversion in case the
@@ -7015,21 +7033,22 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                ctx.shader->config.private_mem_vgprs =
                        ac_count_scratch_private_memory(ctx.main_fn);
        }
 
        /* Make sure the input is a pointer and not integer followed by 
inttoptr. */
        assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
               LLVMPointerTypeKind);
 
        /* Compile to bytecode. */
        r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
-                           ctx.ac.module, debug, ctx.type, "TGSI shader");
+                           ctx.ac.module, debug, ctx.type, "TGSI shader",
+                           si_should_optimize_less(compiler, 
shader->selector));
        si_llvm_dispose(&ctx);
        if (r) {
                fprintf(stderr, "LLVM failed to compile shader\n");
                return r;
        }
 
        /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
         * LLVM 3.9svn has this bug.
         */
        if (sel->type == PIPE_SHADER_COMPUTE) {
@@ -7182,21 +7201,21 @@ si_get_shader_part(struct si_screen *sscreen,
        default:
                unreachable("bad shader part");
        }
 
        build(&ctx, key);
 
        /* Compile. */
        si_llvm_optimize_module(&ctx);
 
        if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
-                           ctx.ac.module, debug, ctx.type, name)) {
+                           ctx.ac.module, debug, ctx.type, name, false)) {
                FREE(result);
                result = NULL;
                goto out;
        }
 
        result->next = *list;
        *list = result;
 
 out:
        si_llvm_dispose(&ctx);
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h 
b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 21e325c2d82..36351391d95 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -210,21 +210,22 @@ si_shader_context(struct lp_build_tgsi_context *bld_base)
 
 static inline struct si_shader_context *
 si_shader_context_from_abi(struct ac_shader_abi *abi)
 {
        struct si_shader_context *ctx = NULL;
        return container_of(abi, ctx, abi);
 }
 
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
                         struct ac_llvm_compiler *compiler,
-                        struct pipe_debug_callback *debug);
+                        struct pipe_debug_callback *debug,
+                        bool less_optimized);
 
 LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base,
                          enum tgsi_opcode_type type);
 
 LLVMValueRef bitcast(struct lp_build_tgsi_context *bld_base,
                     enum tgsi_opcode_type type, LLVMValueRef value);
 
 LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx,
                                 LLVMValueRef index,
                                 unsigned num);
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c 
b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
index b486be25749..b9ed0fc3ab0 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
@@ -75,35 +75,39 @@ static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, 
void *context)
        LLVMDisposeMessage(description);
 }
 
 /**
  * Compile an LLVM module to machine code.
  *
  * @returns 0 for success, 1 for failure
  */
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
                         struct ac_llvm_compiler *compiler,
-                        struct pipe_debug_callback *debug)
+                        struct pipe_debug_callback *debug,
+                        bool less_optimized)
 {
+       struct ac_compiler_passes *passes =
+               less_optimized && compiler->low_opt_passes ?
+                       compiler->low_opt_passes : compiler->passes;
        struct si_llvm_diagnostics diag;
        LLVMContextRef llvm_ctx;
 
        diag.debug = debug;
        diag.retval = 0;
 
        /* Setup Diagnostic Handler*/
        llvm_ctx = LLVMGetModuleContext(M);
 
        LLVMContextSetDiagnosticHandler(llvm_ctx, si_diagnostic_handler, &diag);
 
        /* Compile IR. */
-       if (!ac_compile_module_to_binary(compiler->passes, M, binary))
+       if (!ac_compile_module_to_binary(passes, M, binary))
                diag.retval = 1;
 
        if (diag.retval != 0)
                pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed");
        return diag.retval;
 }
 
 LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base,
                          enum tgsi_opcode_type type)
 {
-- 
2.17.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 2/2] ac, radeonsi: reduce optimizations for complex compute shaders on older APUs

Reply via email to