[Mesa-dev] [Bug 78298] Don't enforce gallium-pipe shared library when enable_xa is set to yes
https://bugs.freedesktop.org/show_bug.cgi?id=78298 NicolasChauvet changed: What|Removed |Added Status|NEW |RESOLVED Resolution|--- |NOTABUG --- Comment #3 from NicolasChauvet --- The issue was sorted out as the gallium-pipe was incorrectly distributed under the OpenCL sub-package, which was conditionalized only for x86 (not for ARM). So I've incorrectly assumed the gallium-pipe was not built on x86. Thx for the answer. -- You are receiving this mail because: You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] i965: Relax accumulator dependency scheduling on Gen < 6
Many instructions implicitly update the accumulator on Gen < 6. The instruction scheduling code just calls add_barrier_deps() for each accumulator access on these platforms, but a large class of operations don't actually update the accumulator -- mostly move and logical instructions. Teaching the scheduling code about this would allow more flexibility to schedule instructions. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77740 --- .../drivers/dri/i965/brw_schedule_instructions.cpp | 84 +++--- src/mesa/drivers/dri/i965/brw_shader.cpp | 7 ++ src/mesa/drivers/dri/i965/brw_shader.h | 1 + 3 files changed, 33 insertions(+), 59 deletions(-) I tested this on IvyBridge and IronLake with a piglit test run but it would be nice if someone could test on other gens too. diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 8cc6908..6f8f405 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -742,8 +742,6 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst) void fs_instruction_scheduler::calculate_deps() { - const bool gen6plus = v->brw->gen >= 6; - /* Pre-register-allocation, this tracks the last write per VGRF (so * different reg_offsets within it can interfere when they shouldn't). 
* After register allocation, reg_offsets are gone and we track individual @@ -803,7 +801,7 @@ fs_instruction_scheduler::calculate_deps() } else { add_dep(last_fixed_grf_write, n); } - } else if (inst->src[i].is_accumulator() && gen6plus) { + } else if (inst->src[i].is_accumulator()) { add_dep(last_accumulator_write, n); } else if (inst->src[i].file != BAD_FILE && inst->src[i].file != IMM && @@ -828,11 +826,7 @@ fs_instruction_scheduler::calculate_deps() } if (inst->reads_accumulator_implicitly()) { - if (gen6plus) { -add_dep(last_accumulator_write, n); - } else { -add_barrier_deps(n); - } + add_dep(last_accumulator_write, n); } /* write-after-write deps. */ @@ -867,7 +861,7 @@ fs_instruction_scheduler::calculate_deps() } else { last_fixed_grf_write = n; } - } else if (inst->dst.is_accumulator() && gen6plus) { + } else if (inst->dst.is_accumulator()) { add_dep(last_accumulator_write, n); last_accumulator_write = n; } else if (inst->dst.file != BAD_FILE && @@ -887,13 +881,10 @@ fs_instruction_scheduler::calculate_deps() last_conditional_mod[inst->flag_subreg] = n; } - if (inst->writes_accumulator) { - if (gen6plus) { -add_dep(last_accumulator_write, n); -last_accumulator_write = n; - } else { -add_barrier_deps(n); - } + if (inst->writes_accumulator_implicitly(v->brw->gen) && + !inst->dst.is_accumulator()) { + add_dep(last_accumulator_write, n); + last_accumulator_write = n; } } @@ -933,7 +924,7 @@ fs_instruction_scheduler::calculate_deps() } else { add_dep(n, last_fixed_grf_write); } - } else if (inst->src[i].is_accumulator() && gen6plus) { + } else if (inst->src[i].is_accumulator()) { add_dep(n, last_accumulator_write); } else if (inst->src[i].file != BAD_FILE && inst->src[i].file != IMM && @@ -958,11 +949,7 @@ fs_instruction_scheduler::calculate_deps() } if (inst->reads_accumulator_implicitly()) { - if (gen6plus) { -add_dep(n, last_accumulator_write); - } else { -add_barrier_deps(n); - } + add_dep(n, last_accumulator_write); } /* Update the things this instruction 
wrote, so earlier reads @@ -996,7 +983,7 @@ fs_instruction_scheduler::calculate_deps() } else { last_fixed_grf_write = n; } - } else if (inst->dst.is_accumulator() && gen6plus) { + } else if (inst->dst.is_accumulator()) { last_accumulator_write = n; } else if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { @@ -1013,12 +1000,8 @@ fs_instruction_scheduler::calculate_deps() last_conditional_mod[inst->flag_subreg] = n; } - if (inst->writes_accumulator) { - if (gen6plus) { -last_accumulator_write = n; - } else { -add_barrier_deps(n); - } + if (inst->writes_accumulator_implicitly(v->brw->gen)) { + last_accumulator_write = n; } } } @@ -1026,8 +1009,6 @@ fs_instruction_scheduler::calculate_deps() void vec4_instruction_scheduler::calculate_deps() { - const bool gen6plus = v->brw->gen >= 6; - schedule_no
[Mesa-dev] [PATCH] mesa: pass target through to driver when choosing texture format
This only matters for TextureView where the texObj's target has not been set yet, in all other instances, texObj->target should be the same as the passed-in target parameter. Signed-off-by: Ilia Mirkin --- I ran into an assert in mesa/st when choosing the texture format because the target was 0. (While trying to implement texture views.) Not sure why it cares about the target, but this seems correct. src/mesa/main/teximage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c index c7f301c..845ba80 100644 --- a/src/mesa/main/teximage.c +++ b/src/mesa/main/teximage.c @@ -3024,7 +3024,7 @@ _mesa_choose_texture_format(struct gl_context *ctx, } /* choose format from scratch */ - f = ctx->Driver.ChooseTextureFormat(ctx, texObj->Target, internalFormat, + f = ctx->Driver.ChooseTextureFormat(ctx, target, internalFormat, format, type); ASSERT(f != MESA_FORMAT_NONE); return f; -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported
ping for this and 1/2 (which just adds the new cap) On Mon, Apr 28, 2014 at 7:30 PM, Ilia Mirkin wrote: > Signed-off-by: Ilia Mirkin > --- > > The handling of the 4 offsets is less-than-pretty. I had an alternate version > that created a new ir_dereference_array object and ran ->accept on that. This > worked as well, but for each offset it would create a separate new array, and > then deref just one item out of it. This seems incredibly wasteful. The > slightly open-coded version of that seems reasonable and uses the same array. > > src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 > ++ > 1 file changed, 41 insertions(+), 14 deletions(-) > > diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > index d1c3856..20d5e99 100644 > --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > @@ -87,8 +87,7 @@ extern "C" { > */ > #define MAX_ARRAYS256 > > -/* if we support a native gallium TG4 with the ability to take 4 texoffsets > then bump this */ > -#define MAX_GLSL_TEXTURE_OFFSET 1 > +#define MAX_GLSL_TEXTURE_OFFSET 4 > > class st_src_reg; > class st_dst_reg; > @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir) > void > glsl_to_tgsi_visitor::visit(ir_texture *ir) > { > - st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, > offset, sample_index, component; > + st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, > offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component; > st_dst_reg result_dst, coord_dst, cube_sc_dst; > glsl_to_tgsi_instruction *inst = NULL; > unsigned opcode = TGSI_OPCODE_NOP; > const glsl_type *sampler_type = ir->sampler->type; > bool is_cube_array = false; > + unsigned i; > > /* if we are a cube array sampler */ > if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && > @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >opcode = (is_cube_array && ir->shadow_comparitor) ? 
TGSI_OPCODE_TEX2 : > TGSI_OPCODE_TEX; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txb: > @@ -2780,7 +2780,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >lod_info = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txl: > @@ -2789,7 +2789,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >lod_info = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txd: > @@ -2800,7 +2800,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >dy = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txs: > @@ -2814,7 +2814,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >lod_info = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txf_ms: > @@ -2828,9 +2828,17 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >component = this->result; >if (ir->offset) { > ir->offset->accept(this); > - /* this should have been lowered */ > - assert(ir->offset->type->base_type != GLSL_TYPE_ARRAY); > - offset = this->result; > + if (ir->offset->type->base_type == GLSL_TYPE_ARRAY) { > +const glsl_type *elt_type = ir->offset->type->fields.array; > +for (i = 0; i < ir->offset->type->length; i++) { > + offset[i] = this->result; > + offset[i].index += i * type_size(elt_type); > + offset[i].type = elt_type->base_type; > + offset[i].swizzle = > swizzle_for_size(elt_type->vector_elements); > +} > + } else { > +offset[0] = this->result; > + } >} >break; > case ir_lod: > @@ -2960,8 +2968,9 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >this->prog); > > if (ir->offset) { > - inst->tex_offset_num_offset = 1; > - inst->tex_offsets[0] = offset; > + for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET && 
offset[i].file != > PROGRAM_UNDEFINED; i++) > + inst->tex_offsets[i] = offset[i]; > + inst->tex_offset_num_offset = i; > } > > switch (sampler_type->sampler_dimensionality) { > @@ -4479,6 +4488,8 @@ translate_tex_offset(struct st_translate *t, > { > struct tgsi_texture_offset offset; > struct ureg_src imm_src; > +
Re: [Mesa-dev] [PATCH 00/21] deferred and threaded glCompileShader
On Tue, May 6, 2014 at 5:27 AM, Fredrik Höglund wrote: > On Tuesday 22 April 2014, Chia-I Wu wrote: >> Hi list, >> >> This series adds a thread pool to the GLSL compiler, and a drirc option to >> defer glCompileShader calls to the pool. The goal is to reduce the start-up >> time of applications that are aware of this feature. That is, applications >> that compile shaders first and check the compile status later. >> >> I do not have numbers from real applications yet. But trying to compiling a >> set of 2882 shaders extracted from some trace file, with everything else >> idled, the time it takes is >> >> 8 threads: 17.8s >> 4 threads: 20.3s >> 2 threads: 31.2s >> 1 threads: 58.0s >> no thread pool: 54.5 >> >> on a quad core system. >> >> Patches 1-4 fix potential races in the GLSL compiler. As the compiler is >> already shared by all contexts, these patches could be desirable even without >> the thread pool that I am going to add. >> >> Patches 5-18 adds true GL_DEBUG_OUTPUT_SYNCHRONOUS support to the KHR_debug >> code. All except patch 18 are clean-ups. Patch 18 adds a mutex to protect >> gl_debug_state. >> >> Patch 19 defines a simple API to create and work with thread pools, as well >> as >> a test for the API. >> >> Patch 20 adds the singleton GLSL thread pool and allows glCompileShader to be >> deferred to the pool. This feature needs to be explicitly enabled with >> _mesa_enable_glsl_threadpool. >> >> Patch 21 adds a drirc option to enable the thread pool. The idea is that >> only >> applications that can benefit from it will enable it. > > If applications are supposed to enable this behavior themselves I think > it would be better to add an extension that lets them do: > > glEnable(GL_DEFERRED_SHADER_COMPILATION_EXT); glHint might be better. But it is hard to define the behavior of deferred shader compilation without restricting what an implementation may do. 
And for a driver that is able to determine GL_COMPILE_STATUS quickly and defer only the optimization passes, threaded compile may be always on. > > The drirc option can still be useful for overriding the default though. > > Fredrik > -- o...@lunarg.com ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 02/21] glsl: protect locale_t with a mutex
On 05/06/2014 01:04 AM, Kenneth Graunke wrote: On 05/05/2014 10:01 AM, Ian Romanick wrote: The problem I see with that is we don't know what order constructors on global objects will occur. Are we sure the constructor will be invoked before we need the locale now and in the future? These functions are used during lexing to read floating point literals...without breaking on locales where the decimal point is a comma. All constructors have definitely been run by that point, and I can't imagine we'll ever need to call locale-independent strtof/strtod from other constructors. The order of constructors in a single TU is well defined. The usual pattern for this kind of initialization is to have the initializer object in the header as static: In strtod.h: struct loc_initializer { loc_initializer() { if (!loc) { loc = initialize_it(); } } }; static loc_initializer loc_init; Any global object constructor that might use loc will have to #include strtod.h, therefore loc_init will appear first in that TU, and will initialize first. A real-world example would be std::_Init that initializes std::cout and its pals. -- Petri Latvala ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 02/21] glsl: protect locale_t with a mutex
On Tue, May 6, 2014 at 6:04 AM, Kenneth Graunke wrote: > On 05/05/2014 10:01 AM, Ian Romanick wrote: >> On 05/04/2014 02:14 PM, Chia-I Wu wrote: >>> On Sat, May 3, 2014 at 1:52 AM, Ian Romanick wrote: On 04/22/2014 01:58 AM, Chia-I Wu wrote: > There may be two contexts compiling shaders at the same time. locale_t > needs > to be protected. Rather than calling glsl_initialize_strtod from other places in the compiler, it seems better to use call_once from the strtof and strtod functions. >>> How about having a static object to call newlocale()/freelocale() in >>> its constructor/destructor? It will impose no overhead on >>> glsl_strtod(), at the expense of little wasted memory when the >>> applications do not compile shaders. >> >> The problem I see with that is we don't know what order constructors on >> global objects will occur. Are we sure the constructor will be invoked >> before we need the locale now and in the future? > > These functions are used during lexing to read floating point > literals...without breaking on locales where the decimal point is a comma. > > All constructors have definitely been run by that point, and I can't > imagine we'll ever need to call locale-independent strtof/strtod from > other constructors. They are also used by the IR reader. But the IR reader is no longer used to parse built-in functions or so. We should be good as you said. Not that we need this, and I am not saying this with 100% confidence, from my research on stackoverflow (instead of the standard, sorry), it is the initialization order of static objects from different translation units that is undefined. If this locale_t_initializer was defined statically in strtod.cpp, other static constructors calling glsl_strtod() would actually work because, by the time glsl_strtod() is called, locale_t_initializer is guaranteed to be initialized. Anyway, I should add some assert() just in case.
> > --Ken > -- o...@lunarg.com ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] glsl_to_tgsi: remove unnecessary dead code elimination pass
Reviewed-by: Marek Olšák Marek On Tue, May 6, 2014 at 5:40 AM, Bryan Cain wrote: > With the more advanced dead code elimination pass already being run, > eliminate_dead_code was making no difference in instruction count, and had > an undesirable O(n^2) runtime. So remove it and rename > eliminate_dead_code_advanced to eliminate_dead_code. > --- > src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 50 > +++- > 1 file changed, 5 insertions(+), 45 deletions(-) > > diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > index 6eb6c8a..b0e0782 100644 > --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > @@ -460,8 +460,7 @@ public: > int get_last_temp_write(int index); > > void copy_propagate(void); > - void eliminate_dead_code(void); > - int eliminate_dead_code_advanced(void); > + int eliminate_dead_code(void); > void merge_registers(void); > void renumber_registers(void); > > @@ -3663,7 +3662,8 @@ glsl_to_tgsi_visitor::copy_propagate(void) > } > > /* > - * Tracks available PROGRAM_TEMPORARY registers for dead code elimination. > + * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for > dead > + * code elimination. > * > * The glsl_to_tgsi_visitor lazily produces code assuming that this pass > * will occur. 
As an example, a TXP production after copy propagation but > @@ -3676,48 +3676,9 @@ glsl_to_tgsi_visitor::copy_propagate(void) > * and after this pass: > * > * 0: TXP TEMP[2], INPUT[4].xyyw, texture[0], 2D; > - * > - * FIXME: assumes that all functions are inlined (no support for > BGNSUB/ENDSUB) > - * FIXME: doesn't eliminate all dead code inside of loops; it steps around > them > - */ > -void > -glsl_to_tgsi_visitor::eliminate_dead_code(void) > -{ > - int i; > - > - for (i=0; i < this->next_temp; i++) { > - int last_read = get_last_temp_read(i); > - int j = 0; > - > - foreach_list_safe(node, &this->instructions) { > - glsl_to_tgsi_instruction *inst = (glsl_to_tgsi_instruction *) node; > - > - if (inst->dst.file == PROGRAM_TEMPORARY && inst->dst.index == i && > - j > last_read) > - { > -inst->remove(); > -delete inst; > - } > - > - j++; > - } > - } > -} > - > -/* > - * On a basic block basis, tracks available PROGRAM_TEMPORARY registers for > dead > - * code elimination. This is less primitive than eliminate_dead_code(), as > it > - * is per-channel and can detect consecutive writes without a read between > them > - * as dead code. However, there is some dead code that can be eliminated by > - * eliminate_dead_code() but not this function - for example, this function > - * cannot eliminate an instruction writing to a register that is never read > and > - * is the only instruction writing to that register. > - * > - * The glsl_to_tgsi_visitor lazily produces code assuming that this pass > - * will occur. > */ > int > -glsl_to_tgsi_visitor::eliminate_dead_code_advanced(void) > +glsl_to_tgsi_visitor::eliminate_dead_code(void) > { > glsl_to_tgsi_instruction **writes = rzalloc_array(mem_ctx, > > glsl_to_tgsi_instruction *, > @@ -5245,9 +5206,8 @@ get_mesa_program(struct gl_context *ctx, > /* Perform optimizations on the instructions in the glsl_to_tgsi_visitor. 
> */ > v->simplify_cmp(); > v->copy_propagate(); > - while (v->eliminate_dead_code_advanced()); > + while (v->eliminate_dead_code()); > > - v->eliminate_dead_code(); > v->merge_registers(); > v->renumber_registers(); > > -- > 1.7.9.5 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] Compiling Mesa/softpipe for Windows
Cross-compiling Mesa for windows on linux with mingw is dead easy: - install mingw-w64 C/C++ cross-compilers (any recent linux distro already has the packages) - run scons platform=windows libgl-gdi The opengl32.dll drop-in replacement will be in mesa/build/windows-x86-debug/gallium/targets/libgl-gdi/opengl32.dll If you want a release build add "build=release" to the scons command line. llvmpipe is faster, but its build is more complex (you'll need to build LLVM for mingw, but that too can be done with cross-compilers.) Jose - Original Message - > Hello, > > I’m trying to get OpenGL 3.3 working on Windows via the softpipe driver. Can > somebody tell me the steps to successfully (cross) compile it with either > MinGW or Visual Studio. I seem to always run into problems. > > Thanks > André > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=NMr9uy2iTjWVixC0wOcYCWEIYhfo80qKwRgdodpoDzA%3D%0A&m=J1yzqfqqAXFRaco4DEX3lwmn2jsACsOex%2FrQfGJ6LL0%3D%0A&s=fc09b7ceb8ab879f7f58d38f43b2f6de9bbc6afc9fd04cbd0306afa1a24c75a0 > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 0/2] i965: Simulate MAD opcode with gen<6
These patches allow the MAD opcode to be used with pre-gen6 hardware. Instead of failing to emit MAD, a MUL followed by an ADD is emitted to simulate it. I tried this with piglit on ILK (gen5) and did not see any regressions. Juha-Pekka Heikkila (2): i965/fs: Simulate MAD opcode with gen<6 i965/vec4: Simulate MAD opcode for gen<6 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 15 ++ src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 41 ++ 2 files changed, 39 insertions(+), 17 deletions(-) -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] i965/fs: Simulate MAD opcode with gen<6
Signed-off-by: Juha-Pekka Heikkila --- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index d2dc5fa..22ca528 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -293,10 +293,6 @@ fs_visitor::try_emit_saturate(ir_expression *ir) bool fs_visitor::try_emit_mad(ir_expression *ir) { - /* 3-src instructions were introduced in gen6. */ - if (brw->gen < 6) - return false; - /* MAD can only handle floating-point data. */ if (ir->type != glsl_type::float_type) return false; @@ -327,7 +323,16 @@ fs_visitor::try_emit_mad(ir_expression *ir) fs_reg src2 = this->result; this->result = fs_reg(this, ir->type); - emit(BRW_OPCODE_MAD, this->result, src0, src1, src2); + + /* 3-src instructions were introduced in gen6. */ + if (brw->gen < 6) { + fs_reg temp = fs_reg(this, glsl_type::float_type); + + emit(MUL(temp, src1, src2)); + emit(ADD(this->result, src0, temp)); + } else { + emit(BRW_OPCODE_MAD, this->result, src0, src1, src2); + } return true; } -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] i965/vec4: Simulate MAD opcode for gen<6
Signed-off-by: Juha-Pekka Heikkila --- src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 41 ++ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 7bad81c..506a4b2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1092,10 +1092,6 @@ vec4_visitor::try_emit_sat(ir_expression *ir) bool vec4_visitor::try_emit_mad(ir_expression *ir) { - /* 3-src instructions were introduced in gen6. */ - if (brw->gen < 6) - return false; - /* MAD can only handle floating-point data. */ if (ir->type->base_type != GLSL_TYPE_FLOAT) return false; @@ -,17 +1107,38 @@ vec4_visitor::try_emit_mad(ir_expression *ir) return false; } - nonmul->accept(this); - src_reg src0 = fix_3src_operand(this->result); + /* 3-src instructions were introduced in gen6. */ + if (brw->gen < 6) { + nonmul->accept(this); + src_reg src0(this->result); - mul->operands[0]->accept(this); - src_reg src1 = fix_3src_operand(this->result); + mul->operands[0]->accept(this); + src_reg src1(this->result); - mul->operands[1]->accept(this); - src_reg src2 = fix_3src_operand(this->result); + mul->operands[1]->accept(this); + src_reg src2(this->result); - this->result = src_reg(this, ir->type); - emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2); + this->result = src_reg(this, ir->type); + + dst_reg mul_destination = dst_reg(this, glsl_type::float_type); + mul_destination.writemask = dst_reg(this->result).writemask; + + emit(MUL(mul_destination, src1, src2)); + emit(ADD(dst_reg(this->result), src0, src_reg(mul_destination))); + } else { + nonmul->accept(this); + src_reg src0 = fix_3src_operand(this->result); + + mul->operands[0]->accept(this); + src_reg src1 = fix_3src_operand(this->result); + + mul->operands[1]->accept(this); + src_reg src2 = fix_3src_operand(this->result); + + this->result = src_reg(this, ir->type); + + emit(BRW_OPCODE_MAD, 
dst_reg(this->result), src0, src1, src2); + } return true; } -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] GL_OES_texture_float and GL_OES_texture_half_float support
Add support for GLES2 extentions for floating point and half floating point textures (GL_OES_texture_float, GL_OES_texture_half_float, GL_OES_texture_float_linear and GL_OES_texture_half_float_linear). --- src/mesa/main/extensions.c | 12 + src/mesa/main/glformats.c | 27 src/mesa/main/pack.c | 17 + src/mesa/main/teximage.c | 61 ++ 4 files changed, 117 insertions(+) diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c index c2ff7e3..58a5a51 100644 --- a/src/mesa/main/extensions.c +++ b/src/mesa/main/extensions.c @@ -360,6 +360,18 @@ static const struct extension extension_table[] = { { "GL_SGIS_texture_lod",o(dummy_true), GLL,1997 }, { "GL_SUN_multi_draw_arrays", o(dummy_true), GLL,1999 }, + /* + TODO: + - rather than have an all or nothing approach for floating point textures, +allow for driver to specify what parts of floating point texture funtionality +is supported: float/half-float and filtering for each. +*/ + { "GL_OES_texture_float", o(ARB_texture_float), ES2|ES3,2005 }, + { "GL_OES_texture_half_float", o(ARB_texture_float), ES2|ES3,2005 }, + { "GL_OES_texture_float_linear",o(ARB_texture_float), ES2|ES3,2005 }, + { "GL_OES_texture_half_float_linear", o(ARB_texture_float), ES2|ES3,2005 }, + + { 0, 0, 0, 0 }, }; diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c index 9bb341c..be728f4 100644 --- a/src/mesa/main/glformats.c +++ b/src/mesa/main/glformats.c @@ -93,6 +93,7 @@ _mesa_sizeof_type(GLenum type) case GL_DOUBLE: return sizeof(GLdouble); case GL_HALF_FLOAT_ARB: + case GL_HALF_FLOAT_OES: return sizeof(GLhalfARB); case GL_FIXED: return sizeof(GLfixed); @@ -125,6 +126,7 @@ _mesa_sizeof_packed_type(GLenum type) case GL_INT: return sizeof(GLint); case GL_HALF_FLOAT_ARB: + case GL_HALF_FLOAT_OES: return sizeof(GLhalfARB); case GL_FLOAT: return sizeof(GLfloat); @@ -243,6 +245,7 @@ _mesa_bytes_per_pixel(GLenum format, GLenum type) case GL_FLOAT: return comps * sizeof(GLfloat); case GL_HALF_FLOAT_ARB: + case GL_HALF_FLOAT_OES: return 
comps * sizeof(GLhalfARB); case GL_UNSIGNED_BYTE_3_3_2: case GL_UNSIGNED_BYTE_2_3_3_REV: @@ -1365,6 +1368,11 @@ _mesa_error_check_format_and_type(const struct gl_context *ctx, case GL_FLOAT: case GL_HALF_FLOAT: return GL_NO_ERROR; +case GL_HALF_FLOAT_OES: + return (format==GL_LUMINANCE || + format==GL_LUMINANCE_ALPHA || + format==GL_ALPHA) + ? GL_NO_ERROR: GL_INVALID_ENUM; default: return GL_INVALID_ENUM; } @@ -1401,6 +1409,9 @@ _mesa_error_check_format_and_type(const struct gl_context *ctx, case GL_UNSIGNED_SHORT_5_6_5_REV: case GL_HALF_FLOAT: return GL_NO_ERROR; +case GL_HALF_FLOAT_OES: + return (format==GL_RGB) + ? GL_NO_ERROR: GL_INVALID_ENUM; case GL_UNSIGNED_INT_2_10_10_10_REV: /* OK by GL_EXT_texture_type_2_10_10_10_REV */ return (ctx->API == API_OPENGLES2) @@ -1454,6 +1465,9 @@ _mesa_error_check_format_and_type(const struct gl_context *ctx, case GL_UNSIGNED_INT_2_10_10_10_REV: case GL_HALF_FLOAT: return GL_NO_ERROR; +case GL_HALF_FLOAT_OES: + return (format==GL_RGBA) + ? GL_NO_ERROR: GL_INVALID_ENUM; default: return GL_INVALID_ENUM; } @@ -1666,6 +1680,7 @@ _mesa_es_error_check_format_and_type(GLenum format, GLenum type, return type_valid ? GL_NO_ERROR : GL_INVALID_OPERATION; } + /** * Do error checking of format/type combinations for OpenGL ES 3 @@ -1676,6 +1691,18 @@ GLenum _mesa_es3_error_check_format_and_type(GLenum format, GLenum type, GLenum internalFormat) { + /* + special case checking for support the GLES2 extension + GL_OES_texture_float and GL_OES_texture_half_float +*/ + if(format==internalFormat + && (type==GL_HALF_FLOAT_OES || type==GL_FLOAT) + && (format==GL_RGBA || format==GL_RGB || + format==GL_LUMINANCE || format==GL_ALPHA || + format==GL_LUMINANCE_ALPHA) ) { + return GL_NO_ERROR; + } + switch (format) { case GL_RGBA: switch (type) { diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c index 1df6568.
Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported
Looks good to me. Does that mean if also the GATHER_SM5 cap is supported you have to support 4 independent, non-constant offsets? Would it make sense to reorder the caps so the gather stuff is all together (now 5 cap bits just for this...)? Roland Am 29.04.2014 01:30, schrieb Ilia Mirkin: > Signed-off-by: Ilia Mirkin > --- > > The handling of the 4 offsets is less-than-pretty. I had an alternate version > that created a new ir_dereference_array object and ran ->accept on that. This > worked as well, but for each offset it would create a separate new array, and > then deref just one item out of it. This seems incredibly wasteful. The > slightly open-coded version of that seems reasonable and uses the same array. > > src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 > ++ > 1 file changed, 41 insertions(+), 14 deletions(-) > > diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > index d1c3856..20d5e99 100644 > --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp > @@ -87,8 +87,7 @@ extern "C" { > */ > #define MAX_ARRAYS256 > > -/* if we support a native gallium TG4 with the ability to take 4 texoffsets > then bump this */ > -#define MAX_GLSL_TEXTURE_OFFSET 1 > +#define MAX_GLSL_TEXTURE_OFFSET 4 > > class st_src_reg; > class st_dst_reg; > @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir) > void > glsl_to_tgsi_visitor::visit(ir_texture *ir) > { > - st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, > offset, sample_index, component; > + st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, > offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component; > st_dst_reg result_dst, coord_dst, cube_sc_dst; > glsl_to_tgsi_instruction *inst = NULL; > unsigned opcode = TGSI_OPCODE_NOP; > const glsl_type *sampler_type = ir->sampler->type; > bool is_cube_array = false; > + unsigned i; > > /* if we are a cube array sampler */ > if 
((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && > @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >opcode = (is_cube_array && ir->shadow_comparitor) ? TGSI_OPCODE_TEX2 : > TGSI_OPCODE_TEX; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txb: > @@ -2780,7 +2780,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >lod_info = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txl: > @@ -2789,7 +2789,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >lod_info = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txd: > @@ -2800,7 +2800,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >dy = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txs: > @@ -2814,7 +2814,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >lod_info = this->result; >if (ir->offset) { > ir->offset->accept(this); > - offset = this->result; > + offset[0] = this->result; >} >break; > case ir_txf_ms: > @@ -2828,9 +2828,17 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >component = this->result; >if (ir->offset) { > ir->offset->accept(this); > - /* this should have been lowered */ > - assert(ir->offset->type->base_type != GLSL_TYPE_ARRAY); > - offset = this->result; > + if (ir->offset->type->base_type == GLSL_TYPE_ARRAY) { > +const glsl_type *elt_type = ir->offset->type->fields.array; > +for (i = 0; i < ir->offset->type->length; i++) { > + offset[i] = this->result; > + offset[i].index += i * type_size(elt_type); > + offset[i].type = elt_type->base_type; > + offset[i].swizzle = > swizzle_for_size(elt_type->vector_elements); > +} > + } else { > +offset[0] = this->result; > + } >} >break; > case ir_lod: > @@ -2960,8 +2968,9 @@ 
glsl_to_tgsi_visitor::visit(ir_texture *ir) > this->prog); > > if (ir->offset) { > - inst->tex_offset_num_offset = 1; > - inst->tex_offsets[0] = offset; > + for (i = 0; i < MAX_GLSL_TEXTURE_OFFSET && offset[i].file != > PROGRAM_UNDEFINED; i++) > + inst->tex_offsets[i] = offset[i]; > + inst->tex_offset_num_offset = i; > } > >
Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported
On Tue, May 6, 2014 at 10:48 AM, Roland Scheidegger wrote: > Looks good to me. Thanks! > Does that mean if also the GATHER_SM5 cap is supported you have to > support 4 independent, non-constant offsets? Not 100% sure what you're asking... but yes, for ARB_gs5 to work, you have to support independent non-constant offsets. And if you have PIPE_CAP_TEXTURE_GATHER_OFFSETS enabled, you're making the claim that you can handle multiple independent offsets in a single texgather. Without the cap, the 4 offsets get lowered into 4 separate texgathers (with only one of the returned components used). With nvc0, the offsets are passed in via a register, so non-constant is never an issue. And with nv50, the offsets must be immediates (and there can be only 1 set of them), but it also has no hope of supporting all of ARB_gs5. > Would it make sense to reorder the caps so the gather stuff is all > together (now 5 cap bits just for this...)? The quantity of caps for texgather is a little ridiculous. I'm of the opinion that this should be the default behaviour, and it should be up to the driver to lower it into 4 texgathers if it can't handle them directly. Furthermore, this functionality is only available (via GL) with ARB_gs5, which in turn will require a whole bunch of stuff, so I don't know whether the GATHER_SM5 cap is really that useful. And for someone with a DX tracker, this functionality would again not be useful on its own, the rest of SM5 would have to be supported as well (I assume). But that's not what got implemented, and I don't care to modify radeon, which can only support 1 offset at a time. (Although I don't think the radeon impl got pushed...) I anticipate that llvmpipe doesn't care one way or another (perhaps with even a minor preference towards having it all in one instruction). If there's consensus, happy to switch this on by default and get rid of the cap :) [And also get rid of the GATHER_SM5 cap.]
> > Roland > > Am 29.04.2014 01:30, schrieb Ilia Mirkin: >> Signed-off-by: Ilia Mirkin >> --- >> >> The handling of the 4 offsets is less-than-pretty. I had an alternate version >> that created a new ir_dereference_array object and ran ->accept on that. This >> worked as well, but for each offset it would create a separate new array, and >> then deref just one item out of it. This seems incredibly wasteful. The >> slightly open-coded version of that seems reasonable and uses the same array. >> >> src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 >> ++ >> 1 file changed, 41 insertions(+), 14 deletions(-) >> >> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >> b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >> index d1c3856..20d5e99 100644 >> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >> @@ -87,8 +87,7 @@ extern "C" { >> */ >> #define MAX_ARRAYS256 >> >> -/* if we support a native gallium TG4 with the ability to take 4 texoffsets >> then bump this */ >> -#define MAX_GLSL_TEXTURE_OFFSET 1 >> +#define MAX_GLSL_TEXTURE_OFFSET 4 >> >> class st_src_reg; >> class st_dst_reg; >> @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir) >> void >> glsl_to_tgsi_visitor::visit(ir_texture *ir) >> { >> - st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, >> offset, sample_index, component; >> + st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, >> offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component; >> st_dst_reg result_dst, coord_dst, cube_sc_dst; >> glsl_to_tgsi_instruction *inst = NULL; >> unsigned opcode = TGSI_OPCODE_NOP; >> const glsl_type *sampler_type = ir->sampler->type; >> bool is_cube_array = false; >> + unsigned i; >> >> /* if we are a cube array sampler */ >> if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && >> @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >>opcode = (is_cube_array && ir->shadow_comparitor) ? 
TGSI_OPCODE_TEX2 >> : TGSI_OPCODE_TEX; >>if (ir->offset) { >> ir->offset->accept(this); >> - offset = this->result; >> + offset[0] = this->result; >>} >>break; >> case ir_txb: >> @@ -2780,7 +2780,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >>lod_info = this->result; >>if (ir->offset) { >> ir->offset->accept(this); >> - offset = this->result; >> + offset[0] = this->result; >>} >>break; >> case ir_txl: >> @@ -2789,7 +2789,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >>lod_info = this->result; >>if (ir->offset) { >> ir->offset->accept(this); >> - offset = this->result; >> + offset[0] = this->result; >>} >>break; >> case ir_txd: >> @@ -2800,7 +2800,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >>dy = this->result; >>if (ir->offset) { >> ir->offset-
Re: [Mesa-dev] [PATCH 3/3] svga: add switch case for PIPE_SHADER_CAP_PREFERRED_IR, remove default case
Series LGTM. Jose - Original Message - > Remove default switch case so we're warned of missing cases at compile > time. > --- > src/gallium/drivers/svga/svga_screen.c | 18 ++ > 1 file changed, 10 insertions(+), 8 deletions(-) > > diff --git a/src/gallium/drivers/svga/svga_screen.c > b/src/gallium/drivers/svga/svga_screen.c > index 8c70bb7..bc914b7 100644 > --- a/src/gallium/drivers/svga/svga_screen.c > +++ b/src/gallium/drivers/svga/svga_screen.c > @@ -343,11 +343,12 @@ static int svga_get_shader_param(struct pipe_screen > *screen, unsigned shader, en >case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: >case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: > return 16; > - default: > - debug_printf("Unexpected fragment shader query %u\n", param); > - return 0; > + case PIPE_SHADER_CAP_PREFERRED_IR: > + return PIPE_SHADER_IR_TGSI; >} > - break; > + /* If we get here, we failed to handle a cap above */ > + debug_printf("Unexpected fragment shader query %u\n", param); > + return 0; > case PIPE_SHADER_VERTEX: >switch (param) >{ > @@ -394,11 +395,12 @@ static int svga_get_shader_param(struct pipe_screen > *screen, unsigned shader, en >case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: >case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: > return 0; > - default: > - debug_printf("Unexpected vertex shader query %u\n", param); > - return 0; > + case PIPE_SHADER_CAP_PREFERRED_IR: > + return PIPE_SHADER_IR_TGSI; >} > - break; > + /* If we get here, we failed to handle a cap above */ > + debug_printf("Unexpected vertex shader query %u\n", param); > + return 0; > case PIPE_SHADER_GEOMETRY: > case PIPE_SHADER_COMPUTE: >/* no support for geometry or compute shaders at this time */ > -- > 1.7.10.4 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > 
https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=NMr9uy2iTjWVixC0wOcYCWEIYhfo80qKwRgdodpoDzA%3D%0A&m=MQcxpL%2FgTB1nG3hZaJq%2FUqYDOOEJQ8XufGj7cxgVFEA%3D%0A&s=b771d15063454cf04d10429852920d1ef152aac7a9e539b01aa528bb1bedfdbc > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] automake: Honor GL_LIB for gallium libgl-xlib
Use "@GL_LIB@" in src/gallium/targets/libgl-xlib/Makefile.am to produce the library name specified by the configure --with-gl-lib-name option. --- src/gallium/targets/libgl-xlib/Makefile.am | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am index 4ee364e..7651333 100644 --- a/src/gallium/targets/libgl-xlib/Makefile.am +++ b/src/gallium/targets/libgl-xlib/Makefile.am @@ -40,17 +40,17 @@ AM_CPPFLAGS = \ -DGALLIUM_GALAHAD AM_CFLAGS = $(X11_INCLUDES) -lib_LTLIBRARIES = libGL.la +lib_LTLIBRARIES = lib@GL_LIB@.la -nodist_EXTRA_libGL_la_SOURCES = dummy.cpp -libGL_la_SOURCES = xlib.c -libGL_la_LDFLAGS = \ +nodist_EXTRA_lib@GL_LIB@_la_SOURCES = dummy.cpp +lib@GL_LIB@_la_SOURCES = xlib.c +lib@GL_LIB@_la_LDFLAGS = \ -no-undefined \ -version-number $(GL_MAJOR):$(GL_MINOR):$(GL_TINY) \ $(GC_SECTIONS) \ $(LD_NO_UNDEFINED) -libGL_la_LIBADD = \ +lib@GL_LIB@_la_LIBADD = \ $(top_builddir)/src/gallium/state_trackers/glx/xlib/libxlib.la \ $(top_builddir)/src/gallium/winsys/sw/xlib/libws_xlib.la \ $(top_builddir)/src/gallium/drivers/softpipe/libsoftpipe.la \ @@ -64,9 +64,9 @@ libGL_la_LIBADD = \ $(CLOCK_LIB) if HAVE_MESA_LLVM -libGL_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS) +lib@GL_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS) AM_CPPFLAGS += -DGALLIUM_LLVMPIPE -libGL_la_LDFLAGS += $(LLVM_LDFLAGS) +lib@GL_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) endif include $(top_srcdir)/install-gallium-links.mk -- 1.9.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] GL_OES_texture_float and GL_OES_texture_half_float support
The title should be something like mesa: Expose GL_OES_texture_float and GL_OES_texture_half_float. Have you found an application that wants these extensions? That might be useful to describe in the commit message. On Tue, May 6, 2014 at 4:02 AM, Kevin Rogovin wrote: > Add support for GLES2 extentions for floating point and half > floating point textures (GL_OES_texture_float, GL_OES_texture_half_float, > GL_OES_texture_float_linear and GL_OES_texture_half_float_linear). > > --- > src/mesa/main/extensions.c | 12 + > src/mesa/main/glformats.c | 27 > src/mesa/main/pack.c | 17 + > src/mesa/main/teximage.c | 61 > ++ > 4 files changed, 117 insertions(+) > > diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c > index c2ff7e3..58a5a51 100644 > --- a/src/mesa/main/extensions.c > +++ b/src/mesa/main/extensions.c > @@ -360,6 +360,18 @@ static const struct extension extension_table[] = { > { "GL_SGIS_texture_lod",o(dummy_true), > GLL,1997 }, > { "GL_SUN_multi_draw_arrays", o(dummy_true), > GLL,1999 }, > > + /* > + TODO: > + - rather than have an all or nothing approach for floating point > textures, > +allow for driver to specify what parts of floating point texture > funtionality functionality > +is supported: float/half-float and filtering for each. For which driver would that be useful? > +*/ > + { "GL_OES_texture_float", o(ARB_texture_float), > ES2|ES3,2005 }, > + { "GL_OES_texture_half_float", o(ARB_texture_float), > ES2|ES3,2005 }, > + { "GL_OES_texture_float_linear",o(ARB_texture_float), > ES2|ES3,2005 }, > + { "GL_OES_texture_half_float_linear", o(ARB_texture_float), > ES2|ES3,2005 }, The ES3 bit is for extensions that can not be exposed in ES2. ES2 means that it will be exposed in both ES2 and ES3. The extension table is organized by extension prefix. Put these extensions in the proper place. 
> + > + > { 0, 0, 0, 0 }, > }; > > diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c > index 9bb341c..be728f4 100644 > --- a/src/mesa/main/glformats.c > +++ b/src/mesa/main/glformats.c > @@ -93,6 +93,7 @@ _mesa_sizeof_type(GLenum type) > case GL_DOUBLE: >return sizeof(GLdouble); > case GL_HALF_FLOAT_ARB: > + case GL_HALF_FLOAT_OES: >return sizeof(GLhalfARB); > case GL_FIXED: >return sizeof(GLfixed); > @@ -125,6 +126,7 @@ _mesa_sizeof_packed_type(GLenum type) > case GL_INT: >return sizeof(GLint); > case GL_HALF_FLOAT_ARB: > + case GL_HALF_FLOAT_OES: >return sizeof(GLhalfARB); > case GL_FLOAT: >return sizeof(GLfloat); > @@ -243,6 +245,7 @@ _mesa_bytes_per_pixel(GLenum format, GLenum type) > case GL_FLOAT: >return comps * sizeof(GLfloat); > case GL_HALF_FLOAT_ARB: > + case GL_HALF_FLOAT_OES: >return comps * sizeof(GLhalfARB); > case GL_UNSIGNED_BYTE_3_3_2: > case GL_UNSIGNED_BYTE_2_3_3_REV: > @@ -1365,6 +1368,11 @@ _mesa_error_check_format_and_type(const struct > gl_context *ctx, > case GL_FLOAT: > case GL_HALF_FLOAT: > return GL_NO_ERROR; > +case GL_HALF_FLOAT_OES: > + return (format==GL_LUMINANCE || > + format==GL_LUMINANCE_ALPHA || > + format==GL_ALPHA) Spaces around operators. Repeated below as well. > + ? GL_NO_ERROR: GL_INVALID_ENUM; > default: > return GL_INVALID_ENUM; > } > @@ -1401,6 +1409,9 @@ _mesa_error_check_format_and_type(const struct > gl_context *ctx, > case GL_UNSIGNED_SHORT_5_6_5_REV: > case GL_HALF_FLOAT: > return GL_NO_ERROR; > +case GL_HALF_FLOAT_OES: > + return (format==GL_RGB) > + ? GL_NO_ERROR: GL_INVALID_ENUM; > case GL_UNSIGNED_INT_2_10_10_10_REV: > /* OK by GL_EXT_texture_type_2_10_10_10_REV */ > return (ctx->API == API_OPENGLES2) > @@ -1454,6 +1465,9 @@ _mesa_error_check_format_and_type(const struct > gl_context *ctx, > case GL_UNSIGNED_INT_2_10_10_10_REV: > case GL_HALF_FLOAT: > return GL_NO_ERROR; > +case GL_HALF_FLOAT_OES: > + return (format==GL_RGBA) > + ? 
GL_NO_ERROR: GL_INVALID_ENUM; > default: > return GL_INVALID_ENUM; > } > @@ -1666,6 +1680,7 @@ _mesa_es_error_check_format_and_type(GLenum format, > GLenum type, > return type_valid ? GL_NO_ERROR : GL_I
Re: [Mesa-dev] [PATCH 0/2] i965: Simulate MAD opcode with gen<6
Juha-Pekka Heikkila writes: > These patches allow MAD opcode to be used with pre gen6 hardware. > Instead of failing on emitting MAD there will be emitted MUL and ADD > to simulate MAD. > > I tried this with piglit on ILK (gen5) and did not see regression. This hides the MUL and ADD from instruction scheduling, which I expect to make performance worse. What was the motivation for this? pgpIztp4XE9kH.pgp Description: PGP signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/2] i965/fs: Simulate MAD opcode with gen<6
On Tue, May 6, 2014 at 3:53 AM, Juha-Pekka Heikkila wrote: > Signed-off-by: Juha-Pekka Heikkila > --- > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 15 ++- > 1 file changed, 10 insertions(+), 5 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index d2dc5fa..22ca528 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -293,10 +293,6 @@ fs_visitor::try_emit_saturate(ir_expression *ir) > bool > fs_visitor::try_emit_mad(ir_expression *ir) > { > - /* 3-src instructions were introduced in gen6. */ > - if (brw->gen < 6) > - return false; > - > /* MAD can only handle floating-point data. */ > if (ir->type != glsl_type::float_type) >return false; > @@ -327,7 +323,16 @@ fs_visitor::try_emit_mad(ir_expression *ir) > fs_reg src2 = this->result; > > this->result = fs_reg(this, ir->type); > - emit(BRW_OPCODE_MAD, this->result, src0, src1, src2); > + > + /* 3-src instructions were introduced in gen6. */ > + if (brw->gen < 6) { > + fs_reg temp = fs_reg(this, glsl_type::float_type); > + > + emit(MUL(temp, src1, src2)); > + emit(ADD(this->result, src0, temp)); > + } else { > + emit(BRW_OPCODE_MAD, this->result, src0, src1, src2); > + } > > return true; > } > -- > 1.8.1.2 try_emit_mad is called every time we visit an add-expression, and on platforms that don't have MAD it fails and the compiler generates standard code for the expression tree. So, if your expression tree was a multiply-add, the compiler will generate a multiply and an add instruction. Adding code to make try_emit_mad do that doesn't actually change anything. I've made a branch that uses the LINE instruction to perform multiply-adds when the arguments are immediates. Minus the shader size explosion in unigine tropics, it seems to be a pretty nice improvement. But the problem with unigine will have to be sorted out before it can be committed.
Maybe you'd be interested in taking a look at that? See https://bugs.freedesktop.org/show_bug.cgi?id=77544 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported
Am 06.05.2014 17:03, schrieb Ilia Mirkin: > On Tue, May 6, 2014 at 10:48 AM, Roland Scheidegger > wrote: >> Looks good to me. > > Thanks! > >> Does that mean if also the GATHER_SM5 cap is supported you have to >> support 4 independent, non-constant offsets? > > Not 100% sure what you're asking... but yes, for ARB_gs5 to work, you > have to support independent non-constant offsets. And if you have > PIPE_CAP_TEXTURE_GATHER_OFFSETS enabled, you're making the claim that > you can handle multiple independent offsets in a single texgather. > Without the cap, the 4 offsets get lowered into 4 separate texgathers > (with only one of the returned components used). > > With nvc0, the offsets are passed in via a register, so non-constant > is never an issue. And with nv50, the offsets must be immediates (and > there can be only 1 set of them), but it also has no hope of > supporting all of ARB_gs5. > >> Would it make sense to reorder the caps so the gather stuff is all >> together (now 5 cap bits just for this...)? > > The quantity of caps for texgather is a little ridiculous. I'm of the > opinion that this should be the default behaviour, and it should be up > to the driver to lower it into 4 texgathers if it can't handle them > directly. Furthermore, this functionality is only available (via GL) > with ARB_gs5, which in turn will require a whole bunch of stuff, so I > don't know whether the GATHER_SM5 cap is really that useful. And for > someone with a DX tracker, this functionality would again not be > useful on its own, the rest of SM5 would have to be supported as well > (I assume). > > But that's not what got implemented, and I don't care to modify > radeon, which can only support 1 offset at a time. (Although I don't > think the radeon impl got pushed...) I anticipate that llvmpipe > doesn't care one way or another (perhaps with even a minor preference > towards having it all in one instruction). 
> > If there's concensus, happy to switch this on by default and get rid > of the cap :) [And also get rid of the GATHER_SM5 cap.] Well I think the point was that there's really hw which can only do simple gather (what d3d10.1 could do or arb_texture_gather would do). This hw will not be able to do other stuff from newer gl versions anyway so it should not be required to support those new features. I'm not entirely sure to what it's actually lowered but in any case llvmpipe if it implemented this definitely would want a non-lowered version. I think though some radeon hw could really do SM5 version but not independent offsets natively, though I'm not sure if it would really be all that complicated to handle it in the driver. I guess though this could be changed later rather easily. Roland > >> >> Roland >> >> Am 29.04.2014 01:30, schrieb Ilia Mirkin: >>> Signed-off-by: Ilia Mirkin >>> --- >>> >>> The handling of the 4 offsets is less-than-pretty. I had an alternate >>> version >>> that created a new ir_dereference_array object and ran ->accept on that. >>> This >>> worked as well, but for each offset it would create a separate new array, >>> and >>> then deref just one item out of it. This seems incredibly wasteful. The >>> slightly open-coded version of that seems reasonable and uses the same >>> array. 
>>> >>> src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 >>> ++ >>> 1 file changed, 41 insertions(+), 14 deletions(-) >>> >>> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >>> b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >>> index d1c3856..20d5e99 100644 >>> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >>> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp >>> @@ -87,8 +87,7 @@ extern "C" { >>> */ >>> #define MAX_ARRAYS256 >>> >>> -/* if we support a native gallium TG4 with the ability to take 4 >>> texoffsets then bump this */ >>> -#define MAX_GLSL_TEXTURE_OFFSET 1 >>> +#define MAX_GLSL_TEXTURE_OFFSET 4 >>> >>> class st_src_reg; >>> class st_dst_reg; >>> @@ -2728,12 +2727,13 @@ glsl_to_tgsi_visitor::visit(ir_call *ir) >>> void >>> glsl_to_tgsi_visitor::visit(ir_texture *ir) >>> { >>> - st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, >>> offset, sample_index, component; >>> + st_src_reg result_src, coord, cube_sc, lod_info, projector, dx, dy, >>> offset[MAX_GLSL_TEXTURE_OFFSET], sample_index, component; >>> st_dst_reg result_dst, coord_dst, cube_sc_dst; >>> glsl_to_tgsi_instruction *inst = NULL; >>> unsigned opcode = TGSI_OPCODE_NOP; >>> const glsl_type *sampler_type = ir->sampler->type; >>> bool is_cube_array = false; >>> + unsigned i; >>> >>> /* if we are a cube array sampler */ >>> if ((sampler_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && >>> @@ -2771,7 +2771,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir) >>>opcode = (is_cube_array && ir->shadow_comparitor) ? TGSI_OPCODE_TEX2 >>> : TGSI_OPCODE_TEX; >>>if (ir->offset) { >>> ir->offset->accept(t
Re: [Mesa-dev] [PATCH 2/2] mesa/st: pass 4-offset TG4 without lowering if supported
On Tue, May 6, 2014 at 1:29 PM, Roland Scheidegger wrote: > Am 06.05.2014 17:03, schrieb Ilia Mirkin: >> On Tue, May 6, 2014 at 10:48 AM, Roland Scheidegger >> wrote: >>> Looks good to me. >> >> Thanks! >> >>> Does that mean if also the GATHER_SM5 cap is supported you have to >>> support 4 independent, non-constant offsets? >> >> Not 100% sure what you're asking... but yes, for ARB_gs5 to work, you >> have to support independent non-constant offsets. And if you have >> PIPE_CAP_TEXTURE_GATHER_OFFSETS enabled, you're making the claim that >> you can handle multiple independent offsets in a single texgather. >> Without the cap, the 4 offsets get lowered into 4 separate texgathers >> (with only one of the returned components used). >> >> With nvc0, the offsets are passed in via a register, so non-constant >> is never an issue. And with nv50, the offsets must be immediates (and >> there can be only 1 set of them), but it also has no hope of >> supporting all of ARB_gs5. >> >>> Would it make sense to reorder the caps so the gather stuff is all >>> together (now 5 cap bits just for this...)? >> >> The quantity of caps for texgather is a little ridiculous. I'm of the >> opinion that this should be the default behaviour, and it should be up >> to the driver to lower it into 4 texgathers if it can't handle them >> directly. Furthermore, this functionality is only available (via GL) >> with ARB_gs5, which in turn will require a whole bunch of stuff, so I >> don't know whether the GATHER_SM5 cap is really that useful. And for >> someone with a DX tracker, this functionality would again not be >> useful on its own, the rest of SM5 would have to be supported as well >> (I assume). >> >> But that's not what got implemented, and I don't care to modify >> radeon, which can only support 1 offset at a time. (Although I don't >> think the radeon impl got pushed...) 
I anticipate that llvmpipe >> doesn't care one way or another (perhaps with even a minor preference >> towards having it all in one instruction). >> >> If there's concensus, happy to switch this on by default and get rid >> of the cap :) [And also get rid of the GATHER_SM5 cap.] > Well I think the point was that there's really hw which can only do > simple gather (what d3d10.1 could do or arb_texture_gather would do). > This hw will not be able to do other stuff from newer gl versions anyway > so it should not be required to support those new features. Right. But since that hw will only ever expose ARB_texture_gather and not ARB_gpu_shader5, it will never receive a TG4 instruction with non-const offsets or multiple offsets. So the cap to indicate that non-const or quad offsets are supported isn't really necessary, since those will only appear if ARB_gs5 support is claimed, which requires more than just the texgather stuff. (The PIPE_CAP_TEXTURE_GATHER_COMPONENTS cap _is_ necessary since it indicates ARB_texture_gather support, and the value that should be returned by some GL query about what tex gather supports.) > I'm not entirely sure to what it's actually lowered but in any case > llvmpipe if it implemented this definitely would want a non-lowered > version. Right now, it'll get lowered to 4 texgathers, with only one of the returned 4 components used from each one. (And it can't use texfetch since the min/max offsets are different, and there's probably some other clever reason as well.) > I think though some radeon hw could really do SM5 version but > not independent offsets natively, though I'm not sure if it would really > be all that complicated to handle it in the driver. Well, I think the claim was that SM5 doesn't actually support the 4 separate offsets, but GL4 does with textureGatherOffsets(). Also, I believe that radeon supports non-const natively, just not have 4 offsets in one instruction.
Same deal with i965 (which is why that lowering pass exists in the first place). > I guess though this could be changed later rather easily. > > Roland > > >> >>> >>> Roland >>> >>> Am 29.04.2014 01:30, schrieb Ilia Mirkin: Signed-off-by: Ilia Mirkin --- The handling of the 4 offsets is less-than-pretty. I had an alternate version that created a new ir_dereference_array object and ran ->accept on that. This worked as well, but for each offset it would create a separate new array, and then deref just one item out of it. This seems incredibly wasteful. The slightly open-coded version of that seems reasonable and uses the same array. src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 55 ++ 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index d1c3856..20d5e99 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -87,8 +87,7 @@ extern "C" { */ #defin
Re: [Mesa-dev] [Mesa-users] Problem with ARB_copy_buffer on Mesa 9.2.4
Hello Jonas, I tested your program on my system (Fedora 19) with an Evergreen (Radeon HD 5850), and I do indeed see the same issue. Here are my results: Mesa 9.2.4 (from F19 repo) => Data does NOT match up! Mesa 9.2.5 => Data does NOT match up! Mesa 10.0.5 => Data does NOT match up! Mesa 10.1.0 => Data matches. So this is fixed in newer versions. That said, Mesa 9.2 is not supported anymore and I really don't know if there will be a new Mesa 10.0.x release given the imminence of Mesa 10.2. If yes, I can bisect and you can open a bug. Mesa-dev, is any new 10.0.x release planned? Regards. Benjamin Le 06/05/2014 13:40, Jonas Kulla a écrit : > Hello list, > > after about 3 days of debugging, I was able to isolate a rather weird > behavior in Mesa GL. > The gist of it is the following: When I create a buffer object and > allocate uninitilaized > memory for it (glBufferData() with nullptr as 'data'), > then glCopyBufferSubData() data into > it from another buffer object, then subsequently fill a part of it > with glBufferSubData(), > this new data isn't visible to the buffer object. In fact, it seems > that the SubData'ed bytes > are completely lost. Any further data uploads however work as > expected. I will attach > a small C test case below that demonstrates this behavior. > > I realize that I am working with an old Mesa release (on Fedora 19), > but I'm afraid of > upgrading my system to the newest distro release as I might break my > working environment. > That's why I would like to kindly ask if someone could verify that > this problem still persists > on the newest Mesa code, in which case I would go ahead and file a bug > report. At the > same time, maybe someone could spot a critical mistake in my code that > would explain > this strange behavior I'm seeing. I think the code paths I'm hitting > here in the driver are > sufficiently obscure though. > > I should probably mention that my card is a Mobility Radeon HD 3650 > (ie. r600).
> > Here's the code sample (you can replace the GL setup code with your own): > > #include > #include > > #include > #include > > static SDL_Window *win; > static SDL_GLContext *ctx; > > void setupGL() > { > SDL_Init(SDL_INIT_VIDEO); > win = SDL_CreateWindow("CopyBufferBug", > SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 64, 64, > SDL_WINDOW_OPENGL); > ctx = SDL_GL_CreateContext(win); > glewInit(); > } > > static void teardownGL() > { > SDL_GL_DeleteContext(ctx); > SDL_DestroyWindow(win); > > SDL_Quit(); > } > > int main(int argc, char *argv[]) > { > setupGL(); > > /* These don't matter I think */ > #define BLOCK_SIZE 128 > #define BUFFER1_SIZE BLOCK_SIZE > #define BUFFER2_SIZE BLOCK_SIZE > #define BUFFER1_TARGET GL_COPY_READ_BUFFER > #define BUFFER2_TARGET GL_COPY_WRITE_BUFFER > #define BUFFER1_USAGE GL_DYNAMIC_DRAW > #define BUFFER2_USAGE GL_DYNAMIC_DRAW > > GLuint buffers[2]; > glGenBuffers(2, buffers); > > /* We allocate both buffers with undefined memory */ > glBindBuffer(BUFFER1_TARGET, buffers[0]); > glBufferData(BUFFER1_TARGET, BUFFER1_SIZE, 0, BUFFER1_USAGE); > > glBindBuffer(BUFFER2_TARGET, buffers[1]); > glBufferData(BUFFER2_TARGET, BUFFER2_SIZE, 0, BUFFER2_USAGE); > > /* Then copy (undefined) bytes from the first into the second > buffer */ > /* Note: If I comment this line out, everything works */ > glCopyBufferSubData(BUFFER1_TARGET, BUFFER2_TARGET, 0, 0, > BUFFER1_SIZE); > > /* Generate random string */ > FILE *rand = fopen("/dev/urandom", "r"); > char data[BLOCK_SIZE]; > fread(data, 1, sizeof(data), rand); > fclose(rand); > > /* We fill the second buffer with defined data */ > /* Note: If I execute this call twice (just copy paste the > line), everything works */ > glBufferSubData(BUFFER2_TARGET, 0, sizeof(data), data); > > /* Then download it again to compare its contents against our > test string */ > char data2[BLOCK_SIZE]; > glGetBufferSubData(BUFFER2_TARGET, 0, sizeof(data2), data2); > > if (memcmp(data, data2, sizeof(data))) > 
printf("Data does NOT match up!\n"); > else > printf("Data matches.\n"); > > glDeleteBuffers(2, buffers); > > teardownGL(); > > return 0; > } > > Thank you very much for your time. > Jonas > > > ___ > mesa-users mailing list > mesa-us...@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-users ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/5] i965: Always intel_prepare_render() after invalidating front buffers.
Kenneth Graunke writes: > Fixes glean/texture_srgb, which hit recursive-flush prevention > assertions in vbo_exec_FlushVertices. > > This probably hurts the performance of front buffer rendering, but > very few people in their right mind do front buffer rendering. This series is: Reviewed-by: Eric Anholt pgpw9NR42MFDO.pgp Description: PGP signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [rong.r.y...@intel.com: [Intel-gfx] How user space applications load registers on HSW?]
FWDing to mesa-dev, since they should have the same issue. - Forwarded message from "Yang, Rong R" - Date: Tue, 6 May 2014 08:26:15 + From: "Yang, Rong R" To: "intel-...@lists.freedesktop.org" Subject: [Intel-gfx] How user space applications load registers on HSW? Message-ID: <7597c9376c272a4ab2d29e91550b7b0901354...@shsmsx102.ccr.corp.intel.com> Hi, I am developing the HSW’s OCL driver in the linux. I encounter a LRI problem on HSW. Some gpgpu's applications, which use the shared local memory, must load the L3CTRLREG2 and L3CTRLREG3 registers to allocate the SLM in the L3 cache. So I add L3CTRLREG2 and L3CTRLREG3 to the gen7_render_regs to pass the cmds parse when exec buffer. But it still don’t work. I notice that, on HSW, the commands that load the register, such as MI_LOAD_REGISTER_IMM, will be converted to NOOP by the GPU if the batch buffer's MI_BATCH_NON_SECURE_HSW bit is set. And after parse cmd, the MI_BATCH_NON_SECURE_HSW still set in the kernel. So HSW don’t accept LRI commands. Can I load these registers in the user space? Or should I hack the kernel? Yang Rong ___ Intel-gfx mailing list intel-...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx - End forwarded message - -- Ben Widawsky, Intel Open Source Technology Center ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] i965: Relax accumulator dependency scheduling on Gen < 6
Nice work. On Tue, May 6, 2014 at 1:16 AM, Iago Toral Quiroga wrote: > diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp > b/src/mesa/drivers/dri/i965/brw_shader.cpp > index 6e74803..37d3eab 100644 > --- a/src/mesa/drivers/dri/i965/brw_shader.cpp > +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp > @@ -676,6 +676,13 @@ backend_instruction::reads_accumulator_implicitly() const > } > > bool > +backend_instruction::writes_accumulator_implicitly(int gen) const > +{ > + return writes_accumulator || > + (gen < 6 && opcode >= BRW_OPCODE_ADD && opcode != BRW_OPCODE_NOP); Since our virtual instruction opcodes are > BRW_OPCODE_NOP, they'll also be classified as writing the accumulator, whereas before they weren't. I think the only ones (that are used on gen < 6) that generate hardware instructions that write the accumulator are FS_OPCODE_DDX FS_OPCODE_DDY FS_OPCODE_PIXEL_X FS_OPCODE_PIXEL_Y FS_OPCODE_CINTERP FS_OPCODE_LINTERP If you update this function with these and it still passes piglit on gen < 6, then this patch is Reviewed-by: Matt Turner ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [rong.r.y...@intel.com: [Intel-gfx] How user space applications load registers on HSW?]
On 05/06/2014 08:26:15 AM, Yang, Rong R wrote: > Hi, > > I am developing the HSW’s OCL driver in the linux. I encounter a LRI > problem on HSW. > > > Some gpgpu's applications, which use the shared local memory, must load > the L3CTRLREG2 and L3CTRLREG3 registers to allocate the SLM in the L3 > cache. > > So I add L3CTRLREG2 and L3CTRLREG3 to the gen7_render_regs to pass the > cmds parse when exec buffer. But it still don’t work. > > I notice that, on HSW, the commands that load the register, such as > MI_LOAD_REGISTER_IMM, will be converted to NOOP by the GPU if the batch > buffer's MI_BATCH_NON_SECURE_HSW bit is set. And after parse cmd, the > MI_BATCH_NON_SECURE_HSW still set in the kernel. So HSW don’t accept > LRI commands. > > > Can I load these registers in the user space? Or should I hack the > kernel? > > > Yang Rong I've been asking the kernel developers for the ability to LRI/LRM from userspace batches for around 1.5 years. Unfortunately, we're still waiting, and I honestly have no idea when they're going to finish it. In the meantime, you can apply the attached patch to your kernel tree to disable the hardware scanner, letting you run whatever commands you want. Obviously, we can't ship this on production systems, but it will allow you to do your development without having to wait for the kernel team. --Ken diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index a3ba9a8..86c173b 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -995,6 +995,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, return ret; flags = 0; + flags |= I915_DISPATCH_SECURE; if (args->flags & I915_EXEC_SECURE) { if (!file->is_master || !capable(CAP_SYS_ADMIN)) return -EPERM; signature.asc Description: OpenPGP digital signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 1/2] Import eglextchromium.h from Chromium.
In order to support the (currently unregistered) Chromium-specific EGL extension eglGetSyncValuesCHROMIUM on Intel systems, we need to import the Chromium header that defines it. The file was downloaded from https://chromium.googlesource.com/chromium/chromium/+/trunk/ui/gl/EGL/eglextchromium.h It is subject to the license found at https://chromium.googlesource.com/chromium/chromium/+/trunk/LICENSE I have imported the header file and added the license text to the top. The only change was to fix the include guard on the Chromium header to change the last line from a #define to a #endif, which makes the header actually compile. Signed-off-by: Sarah Sharp Reviewed-by: Chad Versace Cc: Jamey Sharp Cc: Ian Romanick Cc: Stéphane Marchesin --- v3: Add an include guard on Chromium header. include/EGL/eglext.h | 1 + include/EGL/eglextchromium.h | 60 src/egl/main/Makefile.am | 1 + 3 files changed, 62 insertions(+) create mode 100644 include/EGL/eglextchromium.h diff --git a/include/EGL/eglext.h b/include/EGL/eglext.h index 243da4a..88b39db 100644 --- a/include/EGL/eglext.h +++ b/include/EGL/eglext.h @@ -646,6 +646,7 @@ EGLAPI EGLuint64NV EGLAPIENTRY eglGetSystemTimeNV (void); #endif /* EGL_NV_system_time */ #include +#include #ifdef __cplusplus } diff --git a/include/EGL/eglextchromium.h b/include/EGL/eglextchromium.h new file mode 100644 index 000..0cc0976 --- /dev/null +++ b/include/EGL/eglextchromium.h @@ -0,0 +1,60 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +//* Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+//* Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +//* Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file contains Chromium-specific EGL extensions declarations. 
+ +#ifndef GPU_EGL_EGLEXTCHROMIUM_H_ +#define GPU_EGL_EGLEXTCHROMIUM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* EGLSyncControlCHROMIUM requires 64-bit uint support */ +#if KHRONOS_SUPPORT_INT64 +#ifndef EGL_CHROMIUM_sync_control +#define EGL_CHROMIUM_sync_control 1 +typedef khronos_uint64_t EGLuint64CHROMIUM; +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncValuesCHROMIUM( +EGLDisplay dpy, EGLSurface surface, EGLuint64CHROMIUM *ust, +EGLuint64CHROMIUM *msc, EGLuint64CHROMIUM *sbc); +#endif /* EGL_EGLEXT_PROTOTYPES */ +typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCVALUESCHROMIUMPROC) +(EGLDisplay dpy, EGLSurface surface, EGLuint64CHROMIUM *ust, + EGLuint64CHROMIUM *msc, EGLuint64CHROMIUM *sbc); +#endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif // GPU_EGL_EGLEXTCHROMIUM_H_ diff --git a/src/egl/main/Makefile.am b/src/egl/main/Makefile.am index 2858913..23207db 100644 --- a/src/egl/main/Makefile.am +++ b/src/egl/main/Makefile.am @@ -126,5 +126,6 @@ egldir = $(includedir)/EGL egl_HEADERS = \ $(top_srcdir)/include/EGL/eglext.h \ $(top_srcdir)/include/EGL/egl.h \ + $(top_srcdir)/include/EGL/eglextchromium.h \ $(top_srcdir)/include/EGL/eglmesaext.h \ $(top_srcdir)/include/EGL/eglplatform.h -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 2/2] egl: Add EGL_CHROMIUM_sync_control extension.
Chromium defined a new GL extension (that isn't registered with Khronos). We need to add an EGL extension for it, so we can migrate ChromeOS on Intel systems to use EGL instead of GLX. http://git.chromium.org/gitweb/?p=chromium/src/third_party/khronos.git;a=commitdiff;h=27cbfdab35c601f70aa150581ad1448d0401f447 The EGL_CHROMIUM_sync_control extension is similar to the GLX extension OML_sync_control, but only defines one function, eglGetSyncValuesCHROMIUM, which is equivalent to glXGetSyncValuesOML. http://www.opengl.org/registry/specs/OML/glx_sync_control.txt Signed-off-by: Sarah Sharp Cc: Chad Versace Cc: Jamey Sharp Cc: Ian Romanick Cc: Stéphane Marchesin --- v2: - Clear up confusion around extension vs functions. The new EGL extension name is CHROMIUM_sync_control and the new function name is eglGetSyncValuesCHROMIUM. - Remove all instances of #ifdef EGL_CHROMIUM_sync_control, but leave the #define in include/EGL/eglext.h. - Extensions are sorted by group, then alphabetically. Make sure to respect that when adding the EGL_CHROMIUM_sync_control extension. - Set EGL error codes where appropriate. Make sure dri2_x11_get_sync_values and eglGetSyncValuesCHROMIUM set an EGL error code if they fail. - Use the newly imported Chromium header, rather than putting the extension in eglext.h (which will be overwritten as new versions are imported from Khronos). 
v3: - unchanged from v2 src/egl/drivers/dri2/egl_dri2.c | 10 ++ src/egl/drivers/dri2/egl_dri2.h | 4 src/egl/drivers/dri2/egl_dri2_fallbacks.h | 8 src/egl/drivers/dri2/platform_android.c | 1 + src/egl/drivers/dri2/platform_drm.c | 1 + src/egl/drivers/dri2/platform_wayland.c | 1 + src/egl/drivers/dri2/platform_x11.c | 29 + src/egl/main/eglapi.c | 23 +++ src/egl/main/eglapi.h | 3 +++ src/egl/main/egldisplay.h | 2 ++ src/egl/main/eglmisc.c| 2 ++ 11 files changed, 84 insertions(+) diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c index dc541ad..e7987ee 100644 --- a/src/egl/drivers/dri2/egl_dri2.c +++ b/src/egl/drivers/dri2/egl_dri2.c @@ -1386,6 +1386,15 @@ dri2_create_image_wayland_wl_buffer(_EGLDisplay *disp, _EGLContext *ctx, } #endif +static EGLBoolean +dri2_get_sync_values_chromium(_EGLDisplay *dpy, _EGLSurface *surf, + EGLuint64KHR *ust, EGLuint64KHR *msc, + EGLuint64KHR *sbc) +{ + struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy); + return dri2_dpy->vtbl->get_sync_values(dpy, surf, ust, msc, sbc); +} + /** * Set the error code after a call to * dri2_egl_image::dri_image::createImageFromTexture. 
@@ -2177,6 +2186,7 @@ _eglBuiltInDriverDRI2(const char *args) dri2_drv->base.API.UnbindWaylandDisplayWL = dri2_unbind_wayland_display_wl; dri2_drv->base.API.QueryWaylandBufferWL = dri2_query_wayland_buffer_wl; #endif + dri2_drv->base.API.GetSyncValuesCHROMIUM = dri2_get_sync_values_chromium; dri2_drv->base.Name = "DRI2"; dri2_drv->base.Unload = dri2_unload; diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h index e62e265..44f26fb 100644 --- a/src/egl/drivers/dri2/egl_dri2.h +++ b/src/egl/drivers/dri2/egl_dri2.h @@ -138,6 +138,10 @@ struct dri2_egl_display_vtbl { struct wl_buffer* (*create_wayland_buffer_from_image)( _EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img); + + EGLBoolean (*get_sync_values)(_EGLDisplay *display, _EGLSurface *surface, + EGLuint64KHR *ust, EGLuint64KHR *msc, + EGLuint64KHR *sbc); }; struct dri2_egl_display diff --git a/src/egl/drivers/dri2/egl_dri2_fallbacks.h b/src/egl/drivers/dri2/egl_dri2_fallbacks.h index a5cf344..9cba001 100644 --- a/src/egl/drivers/dri2/egl_dri2_fallbacks.h +++ b/src/egl/drivers/dri2/egl_dri2_fallbacks.h @@ -98,3 +98,11 @@ dri2_fallback_create_wayland_buffer_from_image(_EGLDriver *drv, { return NULL; } + +static inline EGLBoolean +dri2_fallback_get_sync_values(_EGLDisplay *dpy, _EGLSurface *surf, + EGLuint64KHR *ust, EGLuint64KHR *msc, + EGLuint64KHR *sbc) +{ + return EGL_FALSE; +} diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c index 7b1db76..71948bd 100644 --- a/src/egl/drivers/dri2/platform_android.c +++ b/src/egl/drivers/dri2/platform_android.c @@ -650,6 +650,7 @@ static struct dri2_egl_display_vtbl droid_display_vtbl = { .copy_buffers = dri2_fallback_copy_buffers, .query_buffer_age = dri2_fallback_query_buffer_age, .create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image, + .get_sync_values = dri2_fallback_get_sync_values, }; EGLBoolean diff --git a/src/egl/drivers/
[Mesa-dev] [PATCH] i965/fs: Enable vector-mask in correct dword in Broadwell's 3DSTATE_PS.
--- Noticed by inspection. Not tested. It looks like this would have messed up the scratch space base pointer. src/mesa/drivers/dri/i965/gen8_ps_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 7d8f954..3006a0e 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -134,7 +134,7 @@ static void upload_ps_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; - uint32_t dw3 = 0, dw6 = 0, dw7 = 0; + uint32_t dw2 = 0, dw3 = 0, dw6 = 0, dw7 = 0; /* CACHE_NEW_SAMPLER */ BEGIN_BATCH(2); @@ -149,7 +149,7 @@ upload_ps_state(struct brw_context *brw) * incorrect for subspans where some of the pixels are unlit. We believe * the bit just didn't take effect in previous generations. */ - dw3 |= GEN7_PS_VECTOR_MASK_ENABLE; + dw2 |= GEN7_PS_VECTOR_MASK_ENABLE; /* CACHE_NEW_SAMPLER */ dw3 |= @@ -229,7 +229,7 @@ upload_ps_state(struct brw_context *brw) OUT_BATCH(brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16); else OUT_BATCH(brw->wm.base.prog_offset); - OUT_BATCH(0); + OUT_BATCH(dw2); OUT_BATCH(dw3); if (brw->wm.prog_data->total_scratch) { OUT_RELOC64(brw->wm.base.scratch_bo, -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] i965/fs: Enable vector-mask in correct dword in Broadwell's 3DSTATE_PS.
On 05/06/2014 12:59 PM, Matt Turner wrote: > --- > Noticed by inspection. Not tested. > > It looks like this would have messed up the scratch space base pointer. > > src/mesa/drivers/dri/i965/gen8_ps_state.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c > b/src/mesa/drivers/dri/i965/gen8_ps_state.c > index 7d8f954..3006a0e 100644 > --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c > +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c > @@ -134,7 +134,7 @@ static void > upload_ps_state(struct brw_context *brw) > { > struct gl_context *ctx = &brw->ctx; > - uint32_t dw3 = 0, dw6 = 0, dw7 = 0; > + uint32_t dw2 = 0, dw3 = 0, dw6 = 0, dw7 = 0; > > /* CACHE_NEW_SAMPLER */ > BEGIN_BATCH(2); > @@ -149,7 +149,7 @@ upload_ps_state(struct brw_context *brw) > * incorrect for subspans where some of the pixels are unlit. We believe > * the bit just didn't take effect in previous generations. > */ > - dw3 |= GEN7_PS_VECTOR_MASK_ENABLE; > + dw2 |= GEN7_PS_VECTOR_MASK_ENABLE; > > /* CACHE_NEW_SAMPLER */ > dw3 |= > @@ -229,7 +229,7 @@ upload_ps_state(struct brw_context *brw) >OUT_BATCH(brw->wm.base.prog_offset + > brw->wm.prog_data->prog_offset_16); > else >OUT_BATCH(brw->wm.base.prog_offset); > - OUT_BATCH(0); > + OUT_BATCH(dw2); > OUT_BATCH(dw3); > if (brw->wm.prog_data->total_scratch) { >OUT_RELOC64(brw->wm.base.scratch_bo, NAK. I don't know what docs you're looking at, but the latest ones: DW0 - header, opcode, length DW1, DW2 - kernel start pointer 0 DW3. has bit 30, "Vector Mask Enable." I'm pretty sure the code is correct as is. --Ken signature.asc Description: OpenPGP digital signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] i965/fs: Enable vector-mask in correct dword in Broadwell's 3DSTATE_PS.
On Tue, May 6, 2014 at 1:40 PM, Kenneth Graunke wrote: > NAK. I don't know what docs you're looking at, but the latest ones: > > DW0 - header, opcode, length > DW1, DW2 - kernel start pointer 0 > DW3. has bit 30, "Vector Mask Enable." Oh, you're right. The 3DSTATE_PS was 404'd this morning, so I looked at the wrong file in the XML. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] i965: Generalize the pixel_x/y workaround for all UW types.
This is the only case where a fs_reg in brw_fs_visitor is used during optimization/code generation, and it meant that optimizations had to be careful to not move pixel_x/y's register number without updating it. Additionally, it turns out we had a couple of other UW values that weren't getting this treatment (like gl_SampleID), so this more general fix is probably a good idea (though I wasn't able to replicate problems with either pixel_[xy]'s values or gl_SampleID, even when telling the register allocator to reuse registers immediately) --- src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp index c7b1f25..7969b67 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp @@ -86,10 +86,10 @@ fs_live_variables::setup_one_read(bblock_t *block, fs_inst *inst, */ int end_ip = ip; if (v->dispatch_width == 16 && (reg.stride == 0 || - ((v->pixel_x.file == GRF && - v->pixel_x.reg == reg.reg) || -(v->pixel_y.file == GRF && - v->pixel_y.reg == reg.reg { + reg.type == BRW_REGISTER_TYPE_UW || + reg.type == BRW_REGISTER_TYPE_W || + reg.type == BRW_REGISTER_TYPE_UB || + reg.type == BRW_REGISTER_TYPE_B)) { end_ip++; } -- 1.9.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] i965: Stop doing remapping of "special" regs.
Now that we aren't using pixel_[xy] in live variables, nothing is looking at these regs after the visitor stage. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 37 1 file changed, 37 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c550c41..6b1b866 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1740,34 +1740,6 @@ fs_visitor::compact_virtual_grfs() } } - /* In addition to registers used in instructions, fs_visitor keeps -* direct references to certain special values which must be patched: -*/ - struct { - fs_reg *reg; - unsigned count; - } special[] = { - { &frag_depth, 1 }, - { &pixel_x, 1 }, - { &pixel_y, 1 }, - { &pixel_w, 1 }, - { &wpos_w, 1 }, - { &dual_src_output, 1 }, - { outputs, ARRAY_SIZE(outputs) }, - { delta_x, ARRAY_SIZE(delta_x) }, - { delta_y, ARRAY_SIZE(delta_y) }, - { &sample_mask, 1 }, - { &shader_start_time, 1 }, - }; - - /* Treat all special values as used, to be conservative */ - for (unsigned i = 0; i < ARRAY_SIZE(special); i++) { - for (unsigned j = 0; j < special[i].count; j++) { - if (special[i].reg[j].file == GRF) -remap_table[special[i].reg[j].reg] = 0; - } - } - /* Compact the GRF arrays. */ int new_index = 0; for (int i = 0; i < this->virtual_grf_count; i++) { @@ -1793,15 +1765,6 @@ fs_visitor::compact_virtual_grfs() inst->src[i].reg = remap_table[inst->src[i].reg]; } } - - /* Patch all the references to special values */ - for (unsigned i = 0; i < ARRAY_SIZE(special); i++) { - for (unsigned j = 0; j < special[i].count; j++) { - fs_reg *reg = &special[i].reg[j]; - if (reg->file == GRF && remap_table[reg->reg] != -1) -reg->reg = remap_table[reg->reg]; - } - } } /* -- 1.9.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] i965: Skip recalculating URB allocations if the entry size didn't change.
We only get here if the VS/GS compiled programs change, but we can even skip it if the VS/GS size didn't change. Affects cairo runtime on glamor by -1.26471% +/- 0.674335% (n=234) --- src/mesa/drivers/dri/i965/brw_context.c | 2 +- src/mesa/drivers/dri/i965/brw_context.h | 5 +++-- src/mesa/drivers/dri/i965/gen6_urb.c| 4 ++-- src/mesa/drivers/dri/i965/gen7_urb.c| 13 + 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 17ae685..2a4620b 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -745,7 +745,7 @@ brwCreateContext(gl_api api, brw->max_gtt_map_object_size = gtt_size / 4; if (brw->gen == 6) - brw->urb.gen6_gs_previously_active = false; + brw->urb.gs_present = false; brw->prim_restart.in_progress = false; brw->prim_restart.enable_cut_index = false; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 92e1592..b450777 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1183,6 +1183,7 @@ struct brw_context */ struct { GLuint vsize;/* vertex size plus header in urb registers */ + GLuint gsize;/* GS output size in urb registers */ GLuint csize;/* constant buffer size in urb registers */ GLuint sfsize; /* setup data size in urb registers */ @@ -1205,10 +1206,10 @@ struct brw_context GLuint cs_start; GLuint size; /* Hardware URB size, in KB. */ - /* gen6: True if the most recently sent _3DSTATE_URB message allocated + /* True if the most recently sent _3DSTATE_URB message allocated * URB space for the GS. 
*/ - bool gen6_gs_previously_active; + bool gs_present; } urb; diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c index b694f5d..9197bcf 100644 --- a/src/mesa/drivers/dri/i965/gen6_urb.c +++ b/src/mesa/drivers/dri/i965/gen6_urb.c @@ -109,9 +109,9 @@ gen6_upload_urb( struct brw_context *brw ) * doesn't exist on Gen6). So for now we just do a full pipeline flush as * a workaround. */ - if (brw->urb.gen6_gs_previously_active && !brw->ff_gs.prog_active) + if (brw->urb.gs_present && !brw->ff_gs.prog_active) intel_batchbuffer_emit_mi_flush(brw); - brw->urb.gen6_gs_previously_active = brw->ff_gs.prog_active; + brw->urb.gs_present = brw->ff_gs.prog_active; } const struct brw_tracked_state gen6_urb = { diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c index 2653e9c..3c21e1e 100644 --- a/src/mesa/drivers/dri/i965/gen7_urb.c +++ b/src/mesa/drivers/dri/i965/gen7_urb.c @@ -150,6 +150,19 @@ gen7_upload_urb(struct brw_context *brw) unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1; unsigned gs_entry_size_bytes = gs_size * 64; + /* If we're just switching between programs with the same URB requirements, +* skip the rest of the logic. +*/ + if (!(brw->state.dirty.brw & BRW_NEW_CONTEXT) && + brw->urb.vsize == vs_size && + brw->urb.gs_present == gs_present && + brw->urb.gsize == gs_size) { + return; + } + brw->urb.vsize = vs_size; + brw->urb.gs_present = gs_present; + brw->urb.gsize = gs_size; + /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): * * VS Number of URB Entries must be divisible by 8 if the VS URB Entry -- 1.9.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nv50/ir/gk110: fix set with f32 dest
Should fix SGE/SSG instructions, which were previously getting integer 0/-1 values. Signed-off-by: Ilia Mirkin --- src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index 5992c54..b8d0d3e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -915,6 +915,9 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i) modNegAbsF32_3b(i, 1); } FTZ_(3a); + + if (i->dType == TYPE_F32) + code[1] |= 1 << 23; } if (i->sType == TYPE_S32) code[1] |= 1 << 19; -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] i965: Enable GL_ARB_texture_view on Broadwell.
This is a port of commit c9c08867ed07ceb10b67ffac5f0a33812710a5e8. A tiny bit of extra work was necessary to not break stencil texturing. Cc: "10.2" Signed-off-by: Kenneth Graunke --- src/mesa/drivers/dri/i965/gen8_surface_state.c | 28 ++ src/mesa/drivers/dri/i965/intel_extensions.c | 5 + 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c index 564d275..d52b32e 100644 --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c @@ -134,17 +134,20 @@ gen8_update_texture_surface(struct gl_context *ctx, struct intel_mipmap_tree *mt = intelObj->mt; struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel]; struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit); + mesa_format format = intelObj->_Format; if (tObj->Target == GL_TEXTURE_BUFFER) { brw_update_buffer_texture_surface(ctx, unit, surf_offset); return; } - if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL) + if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL) { mt = mt->stencil_mt; + format = MESA_FORMAT_S_UINT8; + } unsigned tiling_mode, pitch; - if (mt->format == MESA_FORMAT_S_UINT8) { + if (format == MESA_FORMAT_S_UINT8) { tiling_mode = GEN8_SURFACE_TILING_W; pitch = 2 * mt->pitch; } else { @@ -152,9 +155,14 @@ gen8_update_texture_surface(struct gl_context *ctx, pitch = mt->pitch; } - uint32_t tex_format = translate_tex_format(brw, - mt->format, - sampler->sRGBDecode); + /* If this is a view with restricted NumLayers, then our effective depth +* is not just the miptree depth. +*/ + uint32_t effective_depth = + (tObj->Immutable && tObj->Target != GL_TEXTURE_3D) ? 
tObj->NumLayers + : mt->logical_depth0; + + uint32_t tex_format = translate_tex_format(brw, format, sampler->sRGBDecode); uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 13 * 4, 64, surf_offset); @@ -178,11 +186,15 @@ gen8_update_texture_surface(struct gl_context *ctx, surf[2] = SET_FIELD(mt->logical_width0 - 1, GEN7_SURFACE_WIDTH) | SET_FIELD(mt->logical_height0 - 1, GEN7_SURFACE_HEIGHT); - surf[3] = SET_FIELD(mt->logical_depth0 - 1, BRW_SURFACE_DEPTH) | (pitch - 1); + surf[3] = SET_FIELD(effective_depth - 1, BRW_SURFACE_DEPTH) | (pitch - 1); - surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout); + surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout) | + SET_FIELD(tObj->MinLayer, GEN7_SURFACE_MIN_ARRAY_ELEMENT) | + SET_FIELD(effective_depth - 1, + GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT); - surf[5] = SET_FIELD(tObj->BaseLevel - mt->first_level, GEN7_SURFACE_MIN_LOD) | + surf[5] = SET_FIELD(tObj->MinLevel + tObj->BaseLevel - mt->first_level, + GEN7_SURFACE_MIN_LOD) | (intelObj->_MaxLevel - tObj->BaseLevel); /* mip count */ surf[6] = 0; diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index ade86a5..c6c76c2 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -284,6 +284,7 @@ intelInitExtensions(struct gl_context *ctx) if (brw->gen >= 7) { ctx->Extensions.ARB_conservative_depth = true; + ctx->Extensions.ARB_texture_view = true; ctx->Extensions.AMD_vertex_shader_layer = true; if (can_do_pipelined_register_writes(brw)) { ctx->Extensions.ARB_transform_feedback2 = true; @@ -302,10 +303,6 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_compute_shader = true; } - if (brw->gen == 7) { - ctx->Extensions.ARB_texture_view = true; - } - if (brw->gen >= 8) { ctx->Extensions.ARB_stencil_texturing = true; } -- 1.9.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org 
http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [Mesa-stable] [PATCH] i965: Enable GL_ARB_texture_view on Broadwell.
This patch is: Reviewed-by: Chris Forbes Spotted some other bugs (in using a view as a depth/stencil attachment) while reading around this, which I'll take care of. On Wed, May 7, 2014 at 12:03 PM, Kenneth Graunke wrote: > This is a port of commit c9c08867ed07ceb10b67ffac5f0a33812710a5e8. > A tiny bit of extra work was necessary to not break stencil texturing. > > Cc: "10.2" > Signed-off-by: Kenneth Graunke > --- > src/mesa/drivers/dri/i965/gen8_surface_state.c | 28 > ++ > src/mesa/drivers/dri/i965/intel_extensions.c | 5 + > 2 files changed, 21 insertions(+), 12 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c > b/src/mesa/drivers/dri/i965/gen8_surface_state.c > index 564d275..d52b32e 100644 > --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c > +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c > @@ -134,17 +134,20 @@ gen8_update_texture_surface(struct gl_context *ctx, > struct intel_mipmap_tree *mt = intelObj->mt; > struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel]; > struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit); > + mesa_format format = intelObj->_Format; > > if (tObj->Target == GL_TEXTURE_BUFFER) { >brw_update_buffer_texture_surface(ctx, unit, surf_offset); >return; > } > > - if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL) > + if (tObj->StencilSampling && firstImage->_BaseFormat == GL_DEPTH_STENCIL) > { >mt = mt->stencil_mt; > + format = MESA_FORMAT_S_UINT8; > + } > > unsigned tiling_mode, pitch; > - if (mt->format == MESA_FORMAT_S_UINT8) { > + if (format == MESA_FORMAT_S_UINT8) { >tiling_mode = GEN8_SURFACE_TILING_W; >pitch = 2 * mt->pitch; > } else { > @@ -152,9 +155,14 @@ gen8_update_texture_surface(struct gl_context *ctx, >pitch = mt->pitch; > } > > - uint32_t tex_format = translate_tex_format(brw, > - mt->format, > - sampler->sRGBDecode); > + /* If this is a view with restricted NumLayers, then our effective depth > +* is not just the miptree depth. 
> +*/ > + uint32_t effective_depth = > + (tObj->Immutable && tObj->Target != GL_TEXTURE_3D) ? tObj->NumLayers > + : > mt->logical_depth0; > + > + uint32_t tex_format = translate_tex_format(brw, format, > sampler->sRGBDecode); > > uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, > 13 * 4, 64, surf_offset); > @@ -178,11 +186,15 @@ gen8_update_texture_surface(struct gl_context *ctx, > surf[2] = SET_FIELD(mt->logical_width0 - 1, GEN7_SURFACE_WIDTH) | > SET_FIELD(mt->logical_height0 - 1, GEN7_SURFACE_HEIGHT); > > - surf[3] = SET_FIELD(mt->logical_depth0 - 1, BRW_SURFACE_DEPTH) | (pitch - > 1); > + surf[3] = SET_FIELD(effective_depth - 1, BRW_SURFACE_DEPTH) | (pitch - 1); > > - surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout); > + surf[4] = gen7_surface_msaa_bits(mt->num_samples, mt->msaa_layout) | > + SET_FIELD(tObj->MinLayer, GEN7_SURFACE_MIN_ARRAY_ELEMENT) | > + SET_FIELD(effective_depth - 1, > + GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT); > > - surf[5] = SET_FIELD(tObj->BaseLevel - mt->first_level, > GEN7_SURFACE_MIN_LOD) | > + surf[5] = SET_FIELD(tObj->MinLevel + tObj->BaseLevel - mt->first_level, > + GEN7_SURFACE_MIN_LOD) | > (intelObj->_MaxLevel - tObj->BaseLevel); /* mip count */ > > surf[6] = 0; > diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c > b/src/mesa/drivers/dri/i965/intel_extensions.c > index ade86a5..c6c76c2 100644 > --- a/src/mesa/drivers/dri/i965/intel_extensions.c > +++ b/src/mesa/drivers/dri/i965/intel_extensions.c > @@ -284,6 +284,7 @@ intelInitExtensions(struct gl_context *ctx) > > if (brw->gen >= 7) { >ctx->Extensions.ARB_conservative_depth = true; > + ctx->Extensions.ARB_texture_view = true; >ctx->Extensions.AMD_vertex_shader_layer = true; >if (can_do_pipelined_register_writes(brw)) { > ctx->Extensions.ARB_transform_feedback2 = true; > @@ -302,10 +303,6 @@ intelInitExtensions(struct gl_context *ctx) > ctx->Extensions.ARB_compute_shader = true; > } > > - if (brw->gen == 7) { > - 
ctx->Extensions.ARB_texture_view = true; > - } > - > if (brw->gen >= 8) { >ctx->Extensions.ARB_stencil_texturing = true; > } > -- > 1.9.1 > > ___ > mesa-stable mailing list > mesa-sta...@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-stable ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] Mixing of hardware and software renderers
I'm using some older hardware - an ATI Radeon 9200 - which can only handle up to OpenGL 1.2. I was wondering if it's possible to use the hardware renderer generally and have the driver hand off the handling of functions which my video card can't handle (such as functions from a higher OpenGL version) to the software renderer, and then have the software renderer hand control back to the hardware renderer once it's finished. If this isn't currently possible, is this perhaps a feature which might appear in the future? ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [Mesa-users] Problem with ARB_copy_buffer on Mesa 9.2.4
Hello Benjamin, thank you very much for testing this. If this issue is fixed in latest Mesa, then I don't have to issue a bug report after all. It's also not critical for me to have this issue fixed on my distribution as I can easily work around it; I just wanted to make sure that the fix for this is available in a higher Mesa version so I'll eventually get it via upgrading. Thanks a lot, Jonas 2014-05-06 20:01 GMT+02:00 Benjamin Bellec : > Hello Jonas, > > I tested your program and on my system (Fedora 19) on an Evergreen (Radeon > HD 5850) and I have the same issue indeed. > Here is my result : > > Mesa 9.2.4 (from F19 repo) => Data does NOT match up! > Mesa 9.2.5 => Data does NOT match up! > Mesa 10.0.5 => Data does NOT match up! > Mesa 10.1.0 => Data matches. > > So this is fixed in newer version. > That said, Mesa 9.2 is not supported anymore and I really don't know if > there will be a new Mesa 10.0.x release given the imminence of Mesa 10.2. > If yes, I can bisect and you can open a bug. > > Mesa-dev, any new 10.0.x release planned ? > > Regards. > > Benjamin > > Le 06/05/2014 13:40, Jonas Kulla a écrit : > > Hello list, > > after about 3 days of debugging, I was able to isolate a rather weird > behavior in Mesa GL. > The gist of it is the following: When I create a buffer object and > allocate uninitilaized > memory for it (glBufferData() with nullptr as 'data'), > then glCopyBufferSubData() data into > it from another buffer object, then subsequently fill a part of it with > glBufferSubData(), > this new data isn't visible to the buffer object. In fact, it seems that > the SubData'ed bytes > are completely lost. Any further data uploads however work as expected. I > will attach > a small C test case below that demonstrates this behavior. > > I realize that I am working with an old Mesa release (on Fedora 19), but > I'm afraid of > upgrading my system to the newest distro release as I might break my > working environment. 
> That's why I would like to kindly ask if someone could verify that this > problem still persists > on the newest Mesa code, in which case I would go ahead and file a bug > report. At the > same time, maybe someone could spot a critical mistake in my code that > would explain > this strange behavior I'm seeing. I think the code paths I'm hitting here > in the driver are > sufficiently obscure though. > > I should probably mention that my card is a Mobility Radeon HD 3650 (ie. > r600). > > Here's the code sample (you can replace the GL setup code with your own): > > #include > #include > > #include > #include > > static SDL_Window *win; > static SDL_GLContext *ctx; > > void setupGL() > { > SDL_Init(SDL_INIT_VIDEO); > win = SDL_CreateWindow("CopyBufferBug", SDL_WINDOWPOS_UNDEFINED, > SDL_WINDOWPOS_UNDEFINED, 64, 64, SDL_WINDOW_OPENGL); > ctx = SDL_GL_CreateContext(win); > glewInit(); > } > > static void teardownGL() > { > SDL_GL_DeleteContext(ctx); > SDL_DestroyWindow(win); > > SDL_Quit(); > } > > int main(int argc, char *argv[]) > { > setupGL(); > > /* These don't matter I think */ > #define BLOCK_SIZE 128 > #define BUFFER1_SIZE BLOCK_SIZE > #define BUFFER2_SIZE BLOCK_SIZE > #define BUFFER1_TARGET GL_COPY_READ_BUFFER > #define BUFFER2_TARGET GL_COPY_WRITE_BUFFER > #define BUFFER1_USAGE GL_DYNAMIC_DRAW > #define BUFFER2_USAGE GL_DYNAMIC_DRAW > > GLuint buffers[2]; > glGenBuffers(2, buffers); > > /* We allocate both buffers with undefined memory */ > glBindBuffer(BUFFER1_TARGET, buffers[0]); > glBufferData(BUFFER1_TARGET, BUFFER1_SIZE, 0, BUFFER1_USAGE); > > glBindBuffer(BUFFER2_TARGET, buffers[1]); > glBufferData(BUFFER2_TARGET, BUFFER2_SIZE, 0, BUFFER2_USAGE); > > /* Then copy (undefined) bytes from the first into the second > buffer */ > /* Note: If I comment this line out, everything works */ > glCopyBufferSubData(BUFFER1_TARGET, BUFFER2_TARGET, 0, 0, > BUFFER1_SIZE); > > /* Generate random string */ > FILE *rand = fopen("/dev/urandom", "r"); > char 
data[BLOCK_SIZE]; > fread(data, 1, sizeof(data), rand); > fclose(rand); > > /* We fill the second buffer with defined data */ > /* Note: If I execute this call twice (just copy paste the line), > everything works */ > glBufferSubData(BUFFER2_TARGET, 0, sizeof(data), data); > > /* Then download it again to compare its contents against our test > string */ > char data2[BLOCK_SIZE]; > glGetBufferSubData(BUFFER2_TARGET, 0, sizeof(data2), data2); > > if (memcmp(data, data2, sizeof(data))) > printf("Data does NOT match up!\n"); > else > printf("Data matches.\n"); > > glDeleteBuffers(2, buffers); > > teardownGL(); > > return 0; > } > > Thank you very much for
Re: [Mesa-dev] Mixing of hardware and software renderers
On 6 May 2014 14:51, Patrick McMunn wrote: > I'm using some older hardware - an ATI Radeon 9200 - which can only handle > up to OpenGL 1.2. I was wondering if it's possible to use the hardware > renderer generally and have the driver hand off the handling of functions > which my video card can't handle (such as functions from a higher OpenGL > version) to the software render and then the software render hand control > back to the hardware renderer once it's finished. If this isn't currently > possible, is this perhaps a feature which might appear in the future? It generally isn't possible and isn't worth it. Ping-ponging between hw and sw renderers, and the fact that sw renderers are slow, make it unlikely you could do something like this practically. Either run a sw renderer or get a better GPU. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/8] i965/fs: Add plumbing for communicating single program flow.
And do blorp at the same time. --- src/mesa/drivers/dri/i965/brw_blorp.h | 6 ++ src/mesa/drivers/dri/i965/brw_blorp_blit.cpp| 2 +- src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp | 10 +++--- src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h | 3 ++- src/mesa/drivers/dri/i965/brw_context.h | 1 + src/mesa/drivers/dri/i965/brw_fs.cpp| 6 -- src/mesa/drivers/dri/i965/brw_fs.h | 9 ++--- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 +--- src/mesa/drivers/dri/i965/gen8_fs_generator.cpp | 10 ++ 9 files changed, 38 insertions(+), 17 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h index 15a7a0b..b217451 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp.h +++ b/src/mesa/drivers/dri/i965/brw_blorp.h @@ -202,6 +202,12 @@ struct brw_blorp_prog_data unsigned int first_curbe_grf; /** +* True if the WM program contains control flow instructions. Used to +* enable single program flow. +*/ + bool has_control_flow; + + /** * True if the WM program should be run in MSDISPMODE_PERSAMPLE with more * than one sample per pixel. 
*/ diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp index 300ff5c..3f1a7bc 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp @@ -901,7 +901,7 @@ brw_blorp_blit_program::compile(struct brw_context *brw, */ render_target_write(); - return get_program(program_size, dump_file); + return get_program(program_size, &prog_data.has_control_flow, dump_file); } void diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp index 38969d8..4063c63 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp @@ -37,16 +37,20 @@ brw_blorp_eu_emitter::~brw_blorp_eu_emitter() } const unsigned * -brw_blorp_eu_emitter::get_program(unsigned *program_size, FILE *dump_file) +brw_blorp_eu_emitter::get_program(unsigned *program_size, + bool *has_control_flow, + FILE *dump_file) { const unsigned *res; if (unlikely(INTEL_DEBUG & DEBUG_BLORP)) { fprintf(stderr, "Native code for BLORP blit:\n"); - res = generator.generate_assembly(NULL, &insts, program_size, dump_file); + res = generator.generate_assembly(NULL, &insts, program_size, +has_control_flow, dump_file); fprintf(stderr, "\n"); } else { - res = generator.generate_assembly(NULL, &insts, program_size); + res = generator.generate_assembly(NULL, &insts, program_size, +has_control_flow); } return res; diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h index c10695e..386ddbb 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h @@ -33,7 +33,8 @@ protected: explicit brw_blorp_eu_emitter(struct brw_context *brw); ~brw_blorp_eu_emitter(); - const unsigned *get_program(unsigned *program_size, FILE *dump_file); + const unsigned *get_program(unsigned *program_size, bool *has_control_flow, + FILE *dump_file); void 
emit_kill_if_outside_rect(const struct brw_reg &x, const struct brw_reg &y, diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 82b38fc..18149b5 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -395,6 +395,7 @@ struct brw_wm_prog_data { bool dual_src_blend; bool uses_pos_offset; bool uses_omask; + bool has_control_flow; uint32_t prog_offset_16; /** diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c550c41..8b7a77f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3200,11 +3200,13 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c, if (brw->gen >= 8) { gen8_fs_generator g(brw, c, prog, fp, v.do_dual_src); assembly = g.generate_assembly(&v.instructions, simd16_instructions, - final_assembly_size); + final_assembly_size, + &c->prog_data.has_control_flow); } else { fs_generator g(brw, c, prog, fp, v.do_dual_src); assembly = g.generate_assembly(&v.instructions, simd16_instructions, - final_assembly_size); + final_assembly_size, + &c->prog_data.has_control_flow); } if
[Mesa-dev] i965: Single program flow for shaders with no control flow.
The docs say that flipping this bit on for shaders that don't do SIMD branching (i.e., non-uniform control flow) will save us some power. An easy first step is turning this on when we don't see control flow. In the future, with more infrastructure in place, we can determine if all branching conditions are uniformly constant and turn on single program flow (SPF). Hopefully this saves some power and extends battery life, but I'm not sure how to accurately quantify this, short of printing i915_energy_uJ before and after some workload. Even then, I don't have any expectation for how much energy the GPU would use for, say, a piglit run. Is 200 ~ 300 Joules reasonable (over 220 seconds)? ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/8] i965/fs: Set has_control_flow to true on an IF or WHILE instruction.
All of the other control flow instructions are dependent on the existence of an IF or WHILE instruction. --- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 2 ++ src/mesa/drivers/dri/i965/gen8_fs_generator.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index ae89a50..651b708 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -1606,6 +1606,7 @@ fs_generator::generate_code(exec_list *instructions, bool *has_control_flow, break; case BRW_OPCODE_IF: + *has_control_flow = true; if (inst->src[0].file != BAD_FILE) { /* The instruction has an embedded compare (only allowed on gen6) */ assert(brw->gen == 6); @@ -1640,6 +1641,7 @@ fs_generator::generate_code(exec_list *instructions, bool *has_control_flow, break; case BRW_OPCODE_WHILE: + *has_control_flow = true; brw_WHILE(p); break; diff --git a/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp b/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp index 7009c6b..086c84c 100644 --- a/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/gen8_fs_generator.cpp @@ -1057,6 +1057,7 @@ gen8_fs_generator::generate_code(exec_list *instructions, break; case BRW_OPCODE_IF: + *has_control_flow = true; IF(BRW_PREDICATE_NORMAL); break; @@ -1081,6 +1082,7 @@ gen8_fs_generator::generate_code(exec_list *instructions, break; case BRW_OPCODE_WHILE: + *has_control_flow = true; WHILE(); break; -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 7/8] i965/vs: Enable SPF when the shader contains no control flow.
--- src/mesa/drivers/dri/i965/gen6_vs_state.c | 3 ++- src/mesa/drivers/dri/i965/gen7_vs_state.c | 3 ++- src/mesa/drivers/dri/i965/gen8_vs_state.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c index 0af87d1..bdfb9b5 100644 --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c @@ -167,7 +167,8 @@ upload_vs_state(struct brw_context *brw) OUT_BATCH(floating_point_mode | ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 0)); if (brw->vs.prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c index b5fc871..f9c9abc 100644 --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c @@ -97,7 +97,8 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | + (!brw->vs.prog_data->base.has_control_flow ? 
GEN6_VS_SPF_MODE : 0)); if (brw->vs.prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c index 373cfe4..a83d78b 100644 --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c @@ -85,7 +85,8 @@ upload_vs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4) / 4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : 0)); if (prog_data->total_scratch) { OUT_RELOC64(stage_state->scratch_bo, -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/8] i965/fs: Enable SPF when the shader contains no control flow.
--- src/mesa/drivers/dri/i965/gen6_wm_state.c | 5 + src/mesa/drivers/dri/i965/gen7_wm_state.c | 5 + src/mesa/drivers/dri/i965/gen8_ps_state.c | 5 + 3 files changed, 15 insertions(+) diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 22e0925..0c7e12b 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -151,6 +151,11 @@ upload_wm_state(struct brw_context *brw) dw2 |= ((brw->wm.prog_data->base.binding_table.size_bytes / 4) << GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT); + /* Enable single program flow mode to save power if the program doesn't +* contain any control flow instructions. +*/ + dw2 |= !brw->wm.prog_data->has_control_flow ? GEN6_WM_SPF_MODE : 0; + dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; /* CACHE_NEW_WM_PROG */ diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 71535a5..575d321 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -171,6 +171,11 @@ upload_ps_state(struct brw_context *brw) if (ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT] == NULL) dw2 |= GEN7_PS_FLOATING_POINT_MODE_ALT; + /* Enable single program flow mode to save power if the program doesn't +* contain any control flow instructions. +*/ + dw2 |= !brw->wm.prog_data->has_control_flow ? GEN7_PS_SPF_MODE : 0; + /* Haswell requires the sample mask to be set in this packet as well as * in 3DSTATE_SAMPLE_MASK; the values should match. 
*/ /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 7d8f954..63883f8 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -145,6 +145,11 @@ upload_ps_state(struct brw_context *brw) /* CACHE_NEW_WM_PROG */ gen8_upload_constant_state(brw, &brw->wm.base, true, _3DSTATE_CONSTANT_PS); + /* Enable single program flow mode to save power if the program doesn't +* contain any control flow instructions. +*/ + dw3 |= !brw->wm.prog_data->has_control_flow ? GEN7_PS_SPF_MODE : 0; + /* Initialize the execution mask with VMask. Otherwise, derivatives are * incorrect for subspans where some of the pixels are unlit. We believe * the bit just didn't take effect in previous generations. -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/8] i965/blorp: Enable SPF when the shader contains no control flow.
--- src/mesa/drivers/dri/i965/gen6_blorp.cpp | 1 + src/mesa/drivers/dri/i965/gen7_blorp.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp index 4222fa8..5d7be60 100644 --- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp @@ -695,6 +695,7 @@ gen6_blorp_emit_wm_config(struct brw_context *brw, dw6 |= 0 << GEN6_WM_NUM_SF_OUTPUTS_SHIFT; /* No inputs from SF */ if (params->use_wm_prog) { dw2 |= 1 << GEN6_WM_SAMPLER_COUNT_SHIFT; /* Up to 4 samplers */ + dw2 |= !prog_data->has_control_flow ? GEN6_WM_SPF_MODE : 0; dw4 |= prog_data->first_curbe_grf << GEN6_WM_DISPATCH_START_GRF_SHIFT_0; dw5 |= GEN6_WM_16_DISPATCH_ENABLE; dw5 |= GEN6_WM_KILL_ENABLE; /* TODO: temporarily smash on */ diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp index 4bf9396..3dbe174 100644 --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp @@ -583,6 +583,7 @@ gen7_blorp_emit_ps_config(struct brw_context *brw, dw4 |= SET_FIELD(1, HSW_PS_SAMPLE_MASK); /* 1 sample for now */ if (params->use_wm_prog) { dw2 |= 1 << GEN7_PS_SAMPLER_COUNT_SHIFT; /* Up to 4 samplers */ + dw2 |= !prog_data->has_control_flow ? GEN7_PS_SPF_MODE : 0; dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE; dw5 |= prog_data->first_curbe_grf << GEN7_PS_DISPATCH_START_GRF_SHIFT_0; } -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 6/8] i965/vec4: Set has_control_flow to true on an IF or WHILE instruction.
All of the other control flow instructions are dependent on the existence of an IF or WHILE instruction. --- src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 3 +++ src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index ba8d26d..89656d1 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1318,6 +1318,9 @@ vec4_generator::generate_code(exec_list *instructions, bool *has_control_flow) unsigned pre_emit_nr_insn = p->nr_insn; + if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_WHILE) + *has_control_flow = true; + generate_vec4_instruction(inst, dst, src); if (inst->no_dd_clear || inst->no_dd_check) { diff --git a/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp b/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp index 222e81a..42d025e 100644 --- a/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp @@ -850,6 +850,9 @@ gen8_vec4_generator::generate_code(exec_list *instructions, const unsigned pre_emit_nr_inst = nr_inst; + if (ir->opcode == BRW_OPCODE_IF || ir->opcode == BRW_OPCODE_WHILE) + *has_control_flow = true; + generate_vec4_instruction(ir, dst, src); if (ir->no_dd_clear || ir->no_dd_check) { -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 5/8] i965/vec4: Add plumbing for communicating single program flow.
--- src/mesa/drivers/dri/i965/brw_context.h | 2 ++ src/mesa/drivers/dri/i965/brw_vec4.cpp| 6 -- src/mesa/drivers/dri/i965/brw_vec4.h | 10 ++ src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 7 --- src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 6 -- src/mesa/drivers/dri/i965/gen8_vec4_generator.cpp | 8 +--- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 18149b5..08760de 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -598,6 +598,8 @@ struct brw_vec4_prog_data { * is the size of the URB entry used for output. */ GLuint urb_entry_size; + + bool has_control_flow; }; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index daff364..9e68ebc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1831,11 +1831,13 @@ brw_vs_emit(struct brw_context *brw, if (brw->gen >= 8) { gen8_vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base, mem_ctx, INTEL_DEBUG & DEBUG_VS); - assembly = g.generate_assembly(&v.instructions, final_assembly_size); + assembly = g.generate_assembly(&v.instructions, final_assembly_size, + &prog_data->base.has_control_flow); } else { vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base, mem_ctx, INTEL_DEBUG & DEBUG_VS); - assembly = g.generate_assembly(&v.instructions, final_assembly_size); + assembly = g.generate_assembly(&v.instructions, final_assembly_size, + &prog_data->base.has_control_flow); } if (unlikely(brw->perf_debug) && shader) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index ebe707f..e895659 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -647,10 +647,11 @@ public: bool debug_flag); ~vec4_generator(); - const unsigned *generate_assembly(exec_list *insts, unsigned 
*asm_size); + const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size, + bool *has_control_flow); private: - void generate_code(exec_list *instructions); + void generate_code(exec_list *instructions, bool *has_control_flow); void generate_vec4_instruction(vec4_instruction *inst, struct brw_reg dst, struct brw_reg *src); @@ -748,10 +749,11 @@ public: bool debug_flag); ~gen8_vec4_generator(); - const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size); + const unsigned *generate_assembly(exec_list *insts, unsigned *asm_size, + bool *has_control_flow); private: - void generate_code(exec_list *instructions); + void generate_code(exec_list *instructions, bool *has_control_flow); void generate_vec4_instruction(vec4_instruction *inst, struct brw_reg dst, struct brw_reg *src); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index bcacde9..ba8d26d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1260,7 +1260,7 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction, } void -vec4_generator::generate_code(exec_list *instructions) +vec4_generator::generate_code(exec_list *instructions, bool *has_control_flow) { int last_native_insn_offset = 0; const char *last_annotation_string = NULL; @@ -1359,10 +1359,11 @@ vec4_generator::generate_code(exec_list *instructions) const unsigned * vec4_generator::generate_assembly(exec_list *instructions, - unsigned *assembly_size) + unsigned *assembly_size, + bool *has_control_flow) { brw_set_access_mode(p, BRW_ALIGN_16); - generate_code(instructions); + generate_code(instructions, has_control_flow); return brw_get_program(p, assembly_size); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index 1321a94..428ed60 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ 
b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -570,11 +570,13 @@ generate_assembly(struct brw_context *brw, if (brw->gen >= 8) { gen8_vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx, INTEL_DEBUG & DEBUG_GS); - return g.generate_assembly(instructions, final
[Mesa-dev] [PATCH 8/8] i965/gs: Enable SPF when the shader contains no control flow.
--- src/mesa/drivers/dri/i965/gen7_gs_state.c | 4 +++- src/mesa/drivers/dri/i965/gen8_gs_state.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c index d18ae15..d7ba4a0 100644 --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c @@ -96,7 +96,9 @@ upload_gs_state(struct brw_context *brw) OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((brw->gs.prog_data->base.base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | +(!brw->gs.prog_data->base.has_control_flow ? GEN6_GS_SPF_MODE : 0)); + if (brw->gs.prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c index 97fbf84..e5260db 100644 --- a/src/mesa/drivers/dri/i965/gen8_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c @@ -61,7 +61,8 @@ gen8_upload_gs_state(struct brw_context *brw) ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << - GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | +(!brw->gs.prog_data->base.has_control_flow ? GEN6_GS_SPF_MODE : 0)); if (brw->gs.prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, -- 1.8.3.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 7/8] i965/vs: Enable SPF when the shader contains no control flow.
On Wed, May 7, 2014 at 9:38 AM, Matt Turner wrote: > --- > src/mesa/drivers/dri/i965/gen6_vs_state.c | 3 ++- > src/mesa/drivers/dri/i965/gen7_vs_state.c | 3 ++- > src/mesa/drivers/dri/i965/gen8_vs_state.c | 3 ++- > 3 files changed, 6 insertions(+), 3 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c > b/src/mesa/drivers/dri/i965/gen6_vs_state.c > index 0af87d1..bdfb9b5 100644 > --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c > +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c > @@ -167,7 +167,8 @@ upload_vs_state(struct brw_context *brw) > OUT_BATCH(floating_point_mode | > ((ALIGN(stage_state->sampler_count, 4)/4) << > GEN6_VS_SAMPLER_COUNT_SHIFT) | > ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << > - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); > + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | > + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : > 0)); The doc says bit 31 is "Single Vertex Dispatch". When this bit is set, I vaguely remember VS_INVOCATION_COUNT does get doubled for the same workload, and the performance is hurt. > > if (brw->vs.prog_data->base.total_scratch) { >OUT_RELOC(stage_state->scratch_bo, > diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c > b/src/mesa/drivers/dri/i965/gen7_vs_state.c > index b5fc871..f9c9abc 100644 > --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c > +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c > @@ -97,7 +97,8 @@ upload_vs_state(struct brw_context *brw) > ((ALIGN(stage_state->sampler_count, 4)/4) << >GEN6_VS_SAMPLER_COUNT_SHIFT) | > ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << > - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); > + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | > + (!brw->vs.prog_data->base.has_control_flow ? 
GEN6_VS_SPF_MODE : > 0)); > > if (brw->vs.prog_data->base.total_scratch) { >OUT_RELOC(stage_state->scratch_bo, > diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c > b/src/mesa/drivers/dri/i965/gen8_vs_state.c > index 373cfe4..a83d78b 100644 > --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c > +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c > @@ -85,7 +85,8 @@ upload_vs_state(struct brw_context *brw) > ((ALIGN(stage_state->sampler_count, 4) / 4) << > GEN6_VS_SAMPLER_COUNT_SHIFT) | > ((prog_data->base.binding_table.size_bytes / 4) << > - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); > + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | > + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE : > 0)); > > if (prog_data->total_scratch) { >OUT_RELOC64(stage_state->scratch_bo, > -- > 1.8.3.2 > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev -- o...@lunarg.com ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 7/8] i965/vs: Enable SPF when the shader contains no control flow.
On Tue, May 6, 2014 at 7:14 PM, Chia-I Wu wrote: > On Wed, May 7, 2014 at 9:38 AM, Matt Turner wrote: >> --- >> src/mesa/drivers/dri/i965/gen6_vs_state.c | 3 ++- >> src/mesa/drivers/dri/i965/gen7_vs_state.c | 3 ++- >> src/mesa/drivers/dri/i965/gen8_vs_state.c | 3 ++- >> 3 files changed, 6 insertions(+), 3 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c >> b/src/mesa/drivers/dri/i965/gen6_vs_state.c >> index 0af87d1..bdfb9b5 100644 >> --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c >> +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c >> @@ -167,7 +167,8 @@ upload_vs_state(struct brw_context *brw) >> OUT_BATCH(floating_point_mode | >> ((ALIGN(stage_state->sampler_count, 4)/4) << >> GEN6_VS_SAMPLER_COUNT_SHIFT) | >> ((brw->vs.prog_data->base.base.binding_table.size_bytes / 4) << >> - GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); >> + GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT) | >> + (!brw->vs.prog_data->base.has_control_flow ? GEN6_VS_SPF_MODE >> : 0)); > The doc says bit 31 is "Single Vertex Dispatch". When this bit is > set, I vaguely remember VS_INVOCATION_COUNT does get doubled for the > same workload, and the performance is hurt. Yeah. Chris Forbes mentioned this to me too. I'll drop the last four patches. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 20/21] mesa: add support for threaded glCompileShader
On Mon, May 5, 2014 at 5:37 AM, Chia-I Wu wrote: > On Sat, May 3, 2014 at 1:59 AM, Ian Romanick wrote: >> On 04/22/2014 01:58 AM, Chia-I Wu wrote: >>> From: Chia-I Wu >>> >>> Threaded glCompileShader can be enabled for a context by calling >>> _mesa_enable_glsl_threadpool. It will initialize the singleton GLSL thread >>> pool and defer glCompileShader calls to the thread pool. >>> >>> For applications to benefit from threaded glCompileShader, they have to >>> compile shaders in this fashion >>> >>> for (i = 0; i < num_shaders; i++) >>>glCompileShader(shaders[i]); >>> for (i = 0; i < num_shaders; i++) >>>glGetShaderiv(shaders[i], GL_COMPILE_STATUS, &val); >> >> I think when you try this series on some real applications, you will be >> disappointed. Eric had a pretty similar branch >> (http://cgit.freedesktop.org/~anholt/mesa/log/?h=compiler-threads), but >> it saw no benefit on any applications... because everybody does >> >> for (i = 0; i < num_shaders; i++) { >> glCompileShader(shaders[i]); >> glGetShaderiv(shaders[i], GL_COMPILE_STATUS, &val); >> } >> >> or >> >> for (i = 0; i < num_shaders; i++) { >> glCompileShader(shaders[i]); >> glAttachShader(prog, shaders[i]); >> } >> >> glLinkProgram(prog); > Yeah, I am aware of the situation with real-world applications. Only > applications that are modified to not immediately check compilation results > will get the speed up in compile times. That is why this feature needs to be > enabled through drirc. We, at LunarG, are working with major game engine > vendors to ensure this performance benefit is realized. > >> I'm also curious about your test case... did you link the shaders? As >> far as I'm aware, the bulk of time spent in the compiler happens during >> linking (final optimizations and register allocation). Eric's data >> (http://lists.freedesktop.org/archives/mesa-dev/2014-April/057494.html) >> says we spend more than 2x time in linking than in compiling. > No, I did not. 
In my other experiment with Unigine Tropics, the > distribution of time was more like > > glCompileShader: 50% > glLinkProgram FE: 25% > glLinkProgram BE: 25% I've rerun the test (source attached). The numbers from compiling and linking Unigine Tropics shaders are _mesa_CompileShader: 54.8% link_shaders: 17.1% brw_link_shaders: 27.9% The numbers from running on another set of shaders (took about 100 seconds) are _mesa_CompileShader: 50.4% link_shaders: 5.6% brw_link_shaders: 43.8% -- o...@lunarg.com #include #include #include #include #include #include #include #include #define MAX_PROGRAMS 4100 struct { GLuint prog; long long prog_time; long long shader_times[2]; } progs[MAX_PROGRAMS]; static GLuint create_shader(int id, GLenum type) { char filename[32]; size_t size; FILE *fp; char *buf; GLuint sh; switch (type) { case GL_VERTEX_SHADER: snprintf(filename, sizeof(filename), "%d.vert", id); break; case GL_FRAGMENT_SHADER: snprintf(filename, sizeof(filename), "%d.frag", id); break; default: return 0; break; } fp = fopen(filename, "rb"); if (!fp) return 0; fseek(fp, 0, SEEK_END); size = ftell(fp); if (!size) { fclose(fp); return 0; } fseek(fp, 0, SEEK_SET); buf = malloc(size + 1); if (!buf) { fclose(fp); return 0; } if (fread(buf, 1, size, fp) != size) { printf("error reading %s\n", filename); fclose(fp); return 0; } buf[size] = '\0'; fclose(fp); sh = glCreateShader(type); glShaderSource(sh, 1, (void*)&buf, NULL); return sh; } static void compile_all_shaders(void) { struct timeval start, end; GLint val; int i; for (i = 0; i < MAX_PROGRAMS; i++) { GLuint prog; int num_shaders, j; prog = glCreateProgram(); num_shaders = 0; for (j = 0; j < 2; j++) { GLenum type = (j == 0) ? 
GL_VERTEX_SHADER : GL_FRAGMENT_SHADER; GLuint sh; sh = create_shader(i, type); if (!sh) continue; num_shaders++; gettimeofday(&start, NULL); glCompileShader(sh); gettimeofday(&end, NULL); glGetShaderiv(sh, GL_COMPILE_STATUS, &val); if (!val) { char buf[1024]; GLsizei len; glGetShaderInfoLog(sh, sizeof(buf), &len, buf); printf("%d.%d: %*s\n", i, j, len, buf); } progs[i].shader_times[j] = 100ll * (end.tv_sec - start.tv_sec) + ((long long) end.tv_usec - start.tv_usec); glAttachShader(prog, sh); glDeleteShader(sh); } if (num_shaders < j) { if (num_shaders) { ;//glProgramParameteri(prog, GL_PROGRAM_SEPARABLE, GL_TRUE); } else { glDeleteProgram(prog); continue; } } gettimeofday(&start, NULL); glLinkProgram(prog); gettimeofday(&end, NULL); glGetProgramiv(prog, GL_LINK_STATUS, &val); if (!val) { char buf[1024]; GLsizei len; glGetProgramInfoLog(prog, sizeof(buf), &len, buf); printf("%d: %*s\n", i, len, buf); } progs[i].prog = prog; progs[i].prog_time = 100ll * (end.tv_sec - start.tv_sec) + ((long long) end.t