Re: [Mesa-dev] [PATCH 5/6] nvc0/ir: add support for PK2H/UP2H
Reviewed-by: Samuel Pitoiset On 01/03/2016 01:38 AM, Ilia Mirkin wrote: Signed-off-by: Ilia Mirkin --- .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 1 + .../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 5 - .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 23 ++ src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 2 +- 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index e9ddd36..ec74e7a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -740,6 +740,7 @@ CodeEmitterGM107::emitF2F() emitCC (0x2f); emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg()); emitFMZ (0x2c, 1); + emitField(0x29, 1, insn->subOp); emitRND (0x27, rnd, 0x2a); emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType))); emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType))); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 1d4f0d9..0b28047 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1030,7 +1030,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i) // for 8/16 source types, the byte/word is in subOp. word 1 is // represented as 2. 
- code[1] |= i->subOp << 0x17; + if (!isFloatType(i->sType)) + code[1] |= i->subOp << 0x17; + else + code[1] |= i->subOp << 0x18; if (sat) code[0] |= 0x20; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index beb67fe..e0b9435 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -319,6 +319,10 @@ unsigned int Instruction::srcMask(unsigned int s) const x |= 2; return x; } + case TGSI_OPCODE_PK2H: + return 0x3; + case TGSI_OPCODE_UP2H: + return 0x1; default: break; } @@ -452,6 +456,7 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_ATOMUMAX: case TGSI_OPCODE_UBFE: case TGSI_OPCODE_UMSB: + case TGSI_OPCODE_UP2H: return nv50_ir::TYPE_U32; case TGSI_OPCODE_I2F: case TGSI_OPCODE_I2D: @@ -516,10 +521,12 @@ nv50_ir::DataType Instruction::inferDstType() const case TGSI_OPCODE_DSGE: case TGSI_OPCODE_DSLT: case TGSI_OPCODE_DSNE: + case TGSI_OPCODE_PK2H: return nv50_ir::TYPE_U32; case TGSI_OPCODE_I2F: case TGSI_OPCODE_U2F: case TGSI_OPCODE_D2F: + case TGSI_OPCODE_UP2H: return nv50_ir::TYPE_F32; case TGSI_OPCODE_I2D: case TGSI_OPCODE_U2D: @@ -2807,6 +2814,22 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c)); break; + case TGSI_OPCODE_PK2H: + val0 = getScratch(); + val1 = getScratch(); + mkCvt(OP_CVT, TYPE_F16, val0, TYPE_F32, fetchSrc(0, 0)); + mkCvt(OP_CVT, TYPE_F16, val1, TYPE_F32, fetchSrc(0, 1)); + mkOp3(OP_INSBF, TYPE_U32, dst0[0], val1, mkImm(0x1010), val0); + break; + case TGSI_OPCODE_UP2H: + src0 = fetchSrc(0, 0); + if (dst0[0]) + mkCvt(OP_CVT, TYPE_F32, dst0[0], TYPE_F16, src0); + if (dst0[1]) { + geni = mkCvt(OP_CVT, TYPE_F32, dst0[1], TYPE_F16, src0); + geni->subOp = 1; + } + break; case TGSI_OPCODE_EMIT: /* export the saved viewport index */ if (viewport != NULL) 
{ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 58b712e..43f6164 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -197,6 +197,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_DRAW_PARAMETERS: case PIPE_CAP_MULTI_DRAW_INDIRECT: case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; @@ -219,7 +220,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: return 0; case PIPE_CAP_VENDOR_ID: ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 5/9] gallium/radeon: always add +DumpCode to the LLVM target machine for LLVM <= 3.5
On Sat, Jan 2, 2016 at 11:01 PM, Nicolai Hähnle wrote: > What's the reason for always having +DumpCode? Generating the assembly is > some overhead that's usually unnecessary. Even if it's a small part of the > profiles I've seen, it still seems like a natural thing to just skip. From > what I can tell it should be dependent on any of the shader dumping flags + > DBG_CHECK_VM being set. In any case, I suppose that would be for a separate > commit. Yeah, I agree that we shouldn't always generate the assembly string. However, there is one case where we probably want to dump it always: when a shader cache is used. We'll have only one chance to compile a shader with a shader cache. If we decide not to generate the assembly, we won't have it for all subsequent uses of the shader (even in other processes). Marek ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 75165] compute.c:464:49: error: function definition is not allowed here
https://bugs.freedesktop.org/show_bug.cgi?id=75165 Samuel Pitoiset changed: What|Removed |Added Status|NEW |RESOLVED Resolution|--- |FIXED --- Comment #4 from Samuel Pitoiset --- Fixed with "gallium/tests: fix build with clang compiler". http://cgit.freedesktop.org/mesa/mesa/commit/?id=6a49fcfb1f28b563b89f2b37e82d9f87c0671228 -- You are receiving this mail because: You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch
On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger wrote: > Am 31.12.2015 um 10:30 schrieb Oded Gabbay: >> On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger >> wrote: >>> Am 30.12.2015 um 10:46 schrieb Oded Gabbay: On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger wrote: > > So, if I see that right, you will automatically generate binaries using > power8 instructions if compiled on power8 capable box, which then won't > run on boxes not supporting power8? Is that really what you want? > Maybe some runtime detection would be a good idea (though I don't know > if anyone cares about power7)? The problem is I don't think I can eliminate the build time check (although I would very much like to) because I need: 1. To pass a special flag to the GCC compiler: -mpower8-vector 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic Without those two things, I won't be able to use vec_vgbbd which I need to implement the _mm_movemask_epi8 efficiently, and without that, all this patch series can be thrown out the window. The emulation of _mm_movemask_epi8 using regular instructions is just horrible. You are correct that once you build a binary with this flag on power8 machine, that binary won't run on power7 machine. You get "cannot execute binary file" Unfortunately, I don't see a way around this because even if I condition the use of vec_vgbbd on a runtime check/define, the library still won't be executable because it was built with -mpower8-vector. Having said that, because I *assume* IBM right now mostly cares about Linux running on POWER8 with little-endian, I think it is a fair compromise. >>> >>> Note I don't have anything against a build time check. My concern here >>> is something along the lines of unsuspecting distros shipping binaries >>> which won't work, as it looks to me like this will get picked up >>> automatically. That is different to how for instance sse41 is handled.
>>> That is I believe this should only get enabled if someone has specified >>> some -mcpu=power8 or whatever flag explicitly somewhere already. >>> >>> Roland >> >> I understand and I share your concern. Maybe we should add >> "--disable-pwr8-inst" to mesa's configure ? if that flag is given to >> configure, it would disable the optimization code (won't add >> _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags). >> >> What do you think ? > If the generated code with all automatically picked up compile options > really doesn't run on power7 just because of this, I think it would be > nicer if this were an explicit enable. > > Roland > So the problem is a bit worse than that and requires a harsher solution. Apparently, when that compiler flag (power8-vector) is given to GCC, GCC uses POWER8-only instructions in other places as well! What I have seen so far, is that it uses such instructions in the implementation of exp2() and/or log2() (in f.cpp) and also saw it in __glXInitializeVisualConfigFromTags(). The instructions used are not vector instructions, but floating point instructions, which were added only in PowerISA 2.07. Therefore, I think that for now, I will limit the entire optimization code to POWER8 *and* Little-Endian. Because ppc64le packages can *only* run on POWER8 systems, and because you can't transfer binaries between LE and BE machines, this workaround eliminates the danger of crashing on "illegal instruction". In addition, there is no more need for runtime checks. I hope you agree that with this change, it is better to enable the power8-vector by default when building on POWER8 machine installed with Linux LE. For all other archs it will be disabled by default. I will try to contact IBM GCC devs to see how we can overcome this problem (or if they even care) so I can expand these optimizations to BE as well. I will send the revised patches shortly.
Oded > > > >> >> Oded >> >>> Oded > So far we didn't bother with that for SSE > but it has to be said SSE2 is a really low bar (and the manual assembly > stuff doesn't use anything more advanced, even though clearly things > like the emulated mm_mullo_epi32 are suboptimal if your cpu supports > sse41). And even then on non-x86 you actually might not get > PIPE_ARCH_SSE if you didn't set gcc's compile flags accordingly. > > Roland > > > Am 29.12.2015 um 17:12 schrieb Oded Gabbay: >> To determine if we could use special POWER8 assembly directives, we first >> need to detect whether we are running on POWER8 architecture. This patch >> adds this detection to configure.ac and adds the necessary compilation >> flags accordingly. >> >> Signed-off-by: Oded Gabbay >> --- >> configure.ac | 30 ++ >> 1 file changed, 30 insertions(+) >> >> diff --git a/configure.ac b/configur
[Mesa-dev] [PATCH v2 4/5] llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8
This patch converts the SSE-optimized build_mask_32() and build_mask_linear_32() to VMX/VSX. I measured the results on POWER8 machine with 32 cores at 3.4GHz and 16GB of RAM. FPS/Score NameBefore AfterDelta glmark2 (score) 139.8 142.72.07% openarena and xonotic didn't show a significant (more than 1%) difference. v2: Make sure code is build only on POWER8 LE machine Signed-off-by: Oded Gabbay --- src/gallium/drivers/llvmpipe/lp_rast_tri.c | 150 + 1 file changed, 110 insertions(+), 40 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index c9b9221..09a182a 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task, lp_rast_triangle_4(task, arg2); } -#if !defined(PIPE_ARCH_SSE) +#if defined(PIPE_ARCH_SSE) -void -lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - union lp_rast_cmd_arg arg2; - arg2.triangle.tri = arg.triangle.tri; - arg2.triangle.plane_mask = (1<<3)-1; - lp_rast_triangle_32_3(task, arg2); -} - -void -lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - union lp_rast_cmd_arg arg2; - arg2.triangle.tri = arg.triangle.tri; - arg2.triangle.plane_mask = (1<<4)-1; - lp_rast_triangle_32_4(task, arg2); -} - -void -lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - lp_rast_triangle_32_3_16(task, arg); -} - -#else #include #include "util/u_sse.h" @@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff) #define NR_PLANES 3 - - - - - - void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 0x & ~out[i].mask); } - - - - void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -471,6 
+433,114 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, } #undef NR_PLANES + +#else + +#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) + +#include +#include "util/u_pwr8.h" + +static inline void +build_masks_32(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = (__m128i) vec_splats(dcdy); + + /* Get values across the quad +*/ + __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); + __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); + __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); + + { + __m128i cstep01, cstep23, result; + + cstep01 = vec_packs_epi32(cstep0, cstep1); + cstep23 = vec_packs_epi32(cstep2, cstep3); + result = vec_packs_epi16(cstep01, cstep23); + + *outmask |= vec_movemask_epi8(result); + } + + + { + __m128i cio4 = (__m128i) vec_splats(cdiff); + __m128i cstep01, cstep23, result; + + cstep0 = vec_add_epi32(cstep0, cio4); + cstep1 = vec_add_epi32(cstep1, cio4); + cstep2 = vec_add_epi32(cstep2, cio4); + cstep3 = vec_add_epi32(cstep3, cio4); + + cstep01 = vec_packs_epi32(cstep0, cstep1); + cstep23 = vec_packs_epi32(cstep2, cstep3); + result = vec_packs_epi16(cstep01, cstep23); + + *partmask |= vec_movemask_epi8(result); + } +} + +static inline unsigned +build_mask_linear_32(int c, int dcdx, int dcdy) +{ + __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = (__m128i) vec_splats(dcdy); + + /* Get values across the quad +*/ + __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); + __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); + __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); + + /* pack pairs of results into epi16 +*/ + __m128i cstep01 = vec_packs_epi32(cstep0, cstep1); + __m128i cstep23 = vec_packs_epi32(cstep2, cstep3); + + /* pack into epi8, preserving sign bits +*/ + __m128i result = vec_packs_epi16(cstep01, cstep23); + + /* extract sign bits to create mask +*/ + return vec_movemask_epi8(result); 
+} + +#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */ + +void +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<3)-1; + lp_rast_triangle_32_3(task, arg2); +} + +void +lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, + const union lp
[Mesa-dev] [PATCH v2 0/5] Optimizing llvmpipe for POWER8 architecture
Hi, Here is the 2nd version of the patch series. The main change is that the new code is limited to POWER8 Little-Endian machines, due to a special compiler flag (power8-vector) that must be turned on but creates code that can't run on POWER7 machines. As only POWER8 has an LE mode (ppc64le), the code can be enclosed with: #if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) and because an LE binary can't be run on a BE machine anyway, we can safely enable this code by default for the ppc64le architecture. A few more changes are detailed in each commit message. Thanks, - Oded Oded Gabbay (5): configure.ac: Detect if running on POWER8 arch llvmpipe: add POWER8 portability file - u_pwr8.h llvmpipe: Optimize do_triangle_ccw for POWER8 llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8 llvmpipe: Optimize lp_rast_triangle_32_3_16 for POWER8 configure.ac| 55 + src/gallium/auxiliary/util/u_pwr8.h | 310 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 290 ++ src/gallium/drivers/llvmpipe/lp_setup_tri.c | 100 + 4 files changed, 715 insertions(+), 40 deletions(-) create mode 100644 src/gallium/auxiliary/util/u_pwr8.h -- 2.5.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 2/5] llvmpipe: add POWER8 portability file - u_pwr8.h
This file provides a portability layer that will make it easier to convert SSE-based functions to VMX/VSX-based functions. All the functions implemented in this file are prefixed using "vec_". Therefore, when converting from an SSE-based function, one needs to simply replace the "_mm_" prefix of the SSE function being called with "vec_". Having said that, not all functions could be converted as such, due to the differences between the architectures. So, where such a conversion hurt performance, I preferred to implement a more ad-hoc solution. For example, converting the _mm_shuffle_epi32 needed to be done using ad-hoc masks instead of a generic function. All the functions in this file support both little-endian and big-endian but currently the file is built only on POWER8 LE machines. All of the functions are implemented using the Altivec/VMX intrinsics, except one where I needed to use inline assembly (due to missing intrinsic). v2: - Use vec_vgbbd instead of __builtin_vec_vgbbd - Add an aligned load function - Don't use typeof() - Make file build only on POWER8 LE machine Signed-off-by: Oded Gabbay --- src/gallium/auxiliary/util/u_pwr8.h | 310 1 file changed, 310 insertions(+) create mode 100644 src/gallium/auxiliary/util/u_pwr8.h diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h new file mode 100644 index 000..1eca6d6 --- /dev/null +++ b/src/gallium/auxiliary/util/u_pwr8.h @@ -0,0 +1,310 @@ +/* + * Copyright 2015 Red Hat Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Author: Oded Gabbay + */ + +/** + * @file + * POWER8 intrinsics portability header. 
+ * + */ + +#ifndef U_PWR8_H_ +#define U_PWR8_H_ + +#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) + +#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16))) + +typedef VECTOR_ALIGN_16 vector unsigned char __m128i; + +typedef VECTOR_ALIGN_16 union m128i { + __m128i m128i; + vector signed int m128si; + vector unsigned int m128ui; + ubyte ub[16]; + ushort us[8]; + int i[4]; + uint ui[4]; +} __m128i_union; + +static inline __m128i +vec_set_epi32 (int i3, int i2, int i1, int i0) +{ + __m128i_union vdst; + +#ifdef PIPE_ARCH_LITTLE_ENDIAN + vdst.i[0] = i0; + vdst.i[1] = i1; + vdst.i[2] = i2; + vdst.i[3] = i3; +#else + vdst.i[3] = i0; + vdst.i[2] = i1; + vdst.i[1] = i2; + vdst.i[0] = i3; +#endif + + return (__m128i) vdst.m128si; +} + +static inline __m128i +vec_setr_epi32 (int i0, int i1, int i2, int i3) +{ + return vec_set_epi32 (i3, i2, i1, i0); +} + +static inline __m128i +vec_unpacklo_epi32 (__m128i even, __m128i odd) +{ + static const __m128i perm_mask = +#ifdef PIPE_ARCH_LITTLE_ENDIAN + { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23}; +#else + {24, 25, 26, 27, 8, 9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15}; +#endif + + return vec_perm (even, odd, perm_mask); +} + +static inline __m128i +vec_unpackhi_epi32 (__m128i even, __m128i odd) +{ + static const __m128i perm_mask = +#ifdef PIPE_ARCH_LITTLE_ENDIAN + { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31}; +#else + {16, 17, 18, 19, 0, 1, 2, 3, 20, 21, 22, 23, 4, 5, 6, 7}; +#endif + + return vec_perm (even, odd, perm_mask); +} + +static inline __m128i +vec_unpacklo_epi64 (__m128i even, __m128i odd) +{ + static const __m128i perm_mask = +#ifdef PIPE_ARCH_LITTLE_ENDIAN + { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; +#else + {24, 25, 26, 27, 28, 29, 30, 31, 8, 9, 10, 11, 12, 13, 14, 15}; +#endif + + return vec_perm (even, odd, perm_mask); +} + +static inline __m128i +vec_unpackhi_epi64 (__m128i even, __m128i odd) +{ + static const __m128i perm_mask = +#ifdef 
PIPE_ARCH_LITTLE_ENDIAN + { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; +#else + {16, 17, 18, 19, 20,
[Mesa-dev] [PATCH v2 3/5] llvmpipe: Optimize do_triangle_ccw for POWER8
This patch converts the SSE optimization done in do_triangle_ccw to VMX/VSX. I measured the results on POWER8 machine with 32 cores at 3.4GHz and 16GB of RAM. FPS/Score NameBefore AfterDelta glmark2 (score) 136.6 139.82.34% openarena 16.14 16.351.30% xonotic 4.655 4.7071.11% v2: - Convert loads to use aligned loads - Make sure code is build only on POWER8 LE machine Signed-off-by: Oded Gabbay --- src/gallium/drivers/llvmpipe/lp_setup_tri.c | 100 1 file changed, 100 insertions(+) diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index b1671dd..0ff10a2 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -46,6 +46,9 @@ #if defined(PIPE_ARCH_SSE) #include +#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) +#include +#include "util/u_pwr8.h" #endif static inline int @@ -462,6 +465,103 @@ do_triangle_ccw(struct lp_setup_context *setup, STORE_PLANE(plane[2], p2); #undef STORE_PLANE } else +#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) + /* +* XXX this code is effectively disabled for all practical purposes, +* as the allowed fb size is tiny if FIXED_ORDER is 8. 
+*/ + if (setup->fb.width <= MAX_FIXED_LENGTH32 && + setup->fb.height <= MAX_FIXED_LENGTH32 && + (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && + (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) { + unsigned int bottom_edge; + __m128i vertx, verty; + __m128i shufx, shufy; + __m128i dcdx, dcdy, c; + __m128i unused; + __m128i dcdx_neg_mask; + __m128i dcdy_neg_mask; + __m128i dcdx_zero_mask; + __m128i top_left_flag; + __m128i c_inc_mask, c_inc; + __m128i eo, p0, p1, p2; + __m128i_union vshuf_mask; + __m128i zero = vec_splats((unsigned char) 0); + PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; + +#ifdef PIPE_ARCH_LITTLE_ENDIAN + vshuf_mask.i[0] = 0x07060504; + vshuf_mask.i[1] = 0x0B0A0908; + vshuf_mask.i[2] = 0x03020100; + vshuf_mask.i[3] = 0x0F0E0D0C; +#else + vshuf_mask.i[0] = 0x00010203; + vshuf_mask.i[1] = 0x0C0D0E0F; + vshuf_mask.i[2] = 0x04050607; + vshuf_mask.i[3] = 0x08090A0B; +#endif + + /* vertex x coords */ + vertx = vec_load_si128((const uint32_t *) position->x); + /* vertex y coords */ + verty = vec_load_si128((const uint32_t *) position->y); + + shufx = vec_perm (vertx, vertx, vshuf_mask.m128i); + shufy = vec_perm (verty, verty, vshuf_mask.m128i); + + dcdx = vec_sub_epi32(verty, shufy); + dcdy = vec_sub_epi32(vertx, shufx); + + dcdx_neg_mask = vec_srai_epi32(dcdx, 31); + dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero); + dcdy_neg_mask = vec_srai_epi32(dcdy, 31); + + bottom_edge = (setup->bottom_edge_rule == 0) ? 
~0 : 0; + top_left_flag = (__m128i) vec_splats(bottom_edge); + + c_inc_mask = vec_or(dcdx_neg_mask, +vec_and(dcdx_zero_mask, + vec_xor(dcdy_neg_mask, +top_left_flag))); + + c_inc = vec_srli_epi32(c_inc_mask, 31); + + c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx), +vec_mullo_epi32(dcdy, verty)); + + c = vec_add_epi32(c, c_inc); + + /* Scale up to match c: + */ + dcdx = vec_slli_epi32(dcdx, FIXED_ORDER); + dcdy = vec_slli_epi32(dcdy, FIXED_ORDER); + + /* Calculate trivial reject values: + */ + eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy), + vec_and(dcdx_neg_mask, dcdx)); + + /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ + + /* Pointless transpose which gets undone immediately in + * rasterization: + */ + transpose4_epi32(&c, &dcdx, &dcdy, &eo, + &p0, &p1, &p2, &unused); + +#define STORE_PLANE(plane, vec) do { \ + vec_store_si128((uint32_t *)&temp_vec, vec); \ + plane.c= (int64_t)temp_vec[0]; \ + plane.dcdx = temp_vec[1];\ + plane.dcdy = temp_vec[2];\ + plane.eo = temp_vec[3];\ + } while(0) + + STORE_PLANE(plane[0], p0); + STORE_PLANE(plane[1], p1); + STORE_PLANE(plane[2], p2); +#undef STORE_PLANE + } else #endif { int i; -- 2.5.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v2 5/5] llvmpipe: Optimize lp_rast_triangle_32_3_16 for POWER8
This patch converts the SSE-optimized lp_rast_triangle_32_3_16() to VMX/VSX. I measured the results on POWER8 machine with 32 cores at 3.4GHz and 16GB of RAM. FPS/Score NameBefore AfterDelta openarena16.35 16.7 2.14% xonotic 4.707 4.97 5.57% glmark2 didn't show a significant (more than 1%) difference. v2: Make sure code is build only on POWER8 LE machine Signed-off-by: Oded Gabbay --- src/gallium/drivers/llvmpipe/lp_rast_tri.c | 142 - 1 file changed, 141 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 09a182a..232c859 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -512,7 +512,145 @@ build_mask_linear_32(int c, int dcdx, int dcdy) return vec_movemask_epi8(result); } -#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */ +static inline __m128i +lp_plane_to_m128i(const struct lp_rast_plane *plane) +{ + return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, + (int32_t)plane->dcdy, (int32_t)plane->eo); +} + +#define NR_PLANES 3 + +void +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + int x = (arg.triangle.plane_mask & 0xff) + task->x; + int y = (arg.triangle.plane_mask >> 8) + task->y; + unsigned i, j; + + struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; + unsigned nr = 0; + + __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ + __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i zero = vec_splats((unsigned char) 0); + + __m128i c; + __m128i dcdx; + __m128i dcdy; + __m128i rej4; + + __m128i dcdx2; + __m128i dcdx3; + + __m128i span_0;/* 0,dcdx,2dcdx,3dcdx for plane 0 */ + __m128i span_1;/* 0,dcdx,2dcdx,3dcdx for plane 1 */ + __m128i 
span_2;/* 0,dcdx,2dcdx,3dcdx for plane 2 */ + __m128i unused; + + __m128i vshuf_mask0; + __m128i vshuf_mask1; + __m128i vshuf_mask2; + +#ifdef PIPE_ARCH_LITTLE_ENDIAN + vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100); + vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504); + vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908); +#else + vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F); + vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B); + vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607); +#endif + + transpose4_epi32(&p0, &p1, &p2, &zero, +&c, &dcdx, &dcdy, &rej4); + + /* Adjust dcdx; +*/ + dcdx = vec_sub_epi32(zero, dcdx); + + c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x))); + c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y))); + rej4 = vec_slli_epi32(rej4, 2); + + /* +* Adjust so we can just check the sign bit (< 0 comparison), +* instead of having to do a less efficient <= 0 comparison +*/ + c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1)); + rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1)); + + dcdx2 = vec_add_epi32(dcdx, dcdx); + dcdx3 = vec_add_epi32(dcdx2, dcdx); + + transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, +&span_0, &span_1, &span_2, &unused); + + for (i = 0; i < 4; i++) { + __m128i cx = c; + + for (j = 0; j < 4; j++) { + __m128i c4rej = vec_add_epi32(cx, rej4); + __m128i rej_masks = vec_srai_epi32(c4rej, 31); + + /* if (is_zero(rej_masks)) */ + if (vec_movemask_epi8(rej_masks) == 0) { +__m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0); +__m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1); +__m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2); + +__m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0); + +__m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0)); +__m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1)); +__m128i c2_1 = 
vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2)); + +__m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1); +__m128i c_01 = vec_packs_epi32(c_0, c_1); + +__m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0)); +__m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1)); +__m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2)); + +__m128i c_2 = vec_or(vec_or(c0_
[Mesa-dev] [PATCH v2 1/5] configure.ac: Detect if running on POWER8 arch
To determine if we could use special POWER8 assembly directives, we first need to detect whether we are running on POWER8 architecture. This patch adds this detection to configure.ac and adds the necessary compilation flags accordingly. v2: - Add option to disable POWER8 instructions generation - Detect whether building on BE or LE machine and build with -mpower8-vector only on LE machine - Make the printed messages more standard Signed-off-by: Oded Gabbay --- configure.ac | 55 +++ 1 file changed, 55 insertions(+) diff --git a/configure.ac b/configure.ac index f8a70be..b1c1d7d 100644 --- a/configure.ac +++ b/configure.ac @@ -396,6 +396,61 @@ fi AM_CONDITIONAL([SSE41_SUPPORTED], [test x$SSE41_SUPPORTED = x1]) AC_SUBST([SSE41_CFLAGS], $SSE41_CFLAGS) +dnl Check for Endianness +AC_C_BIGENDIAN( + little_endian=no, + little_endian=yes, + little_endian=no, + little_endian=no +) + +dnl Check for POWER8 Architecture +PWR8_CFLAGS="-mpower8-vector" +have_pwr8_intrinsics=no +AC_MSG_CHECKING(whether gcc supports -mpower8-vector) +save_CFLAGS=$CFLAGS +CFLAGS="$PWR8_CFLAGS $CFLAGS" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)) +#error "Need GCC >= 4.8 for sane POWER8 support" +#endif +#include +int main () { +vector unsigned char r; +vector unsigned int v = vec_splat_u32 (1); +r = __builtin_vec_vgbbd ((vector unsigned char) v); +return 0; +}]])], have_pwr8_intrinsics=yes) +CFLAGS=$save_CFLAGS + +AC_ARG_ENABLE(pwr8, + [AC_HELP_STRING([--disable-pwr8-inst], + [disable POWER8-specific instructions])], + [enable_pwr8=$enableval], [enable_pwr8=auto]) + +if test "x$enable_pwr8" = xno ; then + have_pwr8_intrinsics=disabled +fi + +if test $have_pwr8_intrinsics = yes && test $little_endian = yes ; then + DEFINES="$DEFINES -D_ARCH_PWR8" + CXXFLAGS="$CXXFLAGS $PWR8_CFLAGS" + CFLAGS="$CFLAGS $PWR8_CFLAGS" +else + PWR8_CFLAGS= +fi + +AC_MSG_RESULT($have_pwr8_intrinsics) +if test "x$enable_pwr8" = xyes && test 
$have_pwr8_intrinsics = no ; then + AC_MSG_ERROR([POWER8 compiler support not detected]) +fi + +if test $have_pwr8_intrinsics = yes && test $little_endian = no ; then + AC_MSG_WARN([POWER8 optimization is enabled only on POWER8 Little-Endian]) +fi + +AC_SUBST([PWR8_CFLAGS], $PWR8_CFLAGS) + dnl Can't have static and shared libraries, default to static if user dnl explicitly requested. If both disabled, set to static since shared dnl was explicitly requested. -- 2.5.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 2/6] i965/gen8+: Invalidate color calc state when switching to the GPGPU pipeline.
On Sun, Jan 3, 2016 at 1:48 AM, Francisco Jerez wrote: > This hardware bug can cause a hang on context restore while the > current pipeline is set to GPGPU (BDWGFX HSD 1909593). In addition to > clearing the valid bit, mark the CC state as dirty to make sure that > the CC indirect state pointer is re-emitted when we switch back to the > 3D pipeline. > --- > src/mesa/drivers/dri/i965/brw_misc_state.c | 20 > 1 file changed, 20 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c > b/src/mesa/drivers/dri/i965/brw_misc_state.c > index cf6ba5b..7d53d18 100644 > --- a/src/mesa/drivers/dri/i965/brw_misc_state.c > +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c > @@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum > brw_pipeline pipeline) > const uint32_t _3DSTATE_PIPELINE_SELECT = >is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45; > > + if (brw->gen >= 8) { > + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] > + * PIPELINE_SELECT [DevBWR+]": > + * > + * Project: BDW, SKL I think we should restrict this block to brw->gen == 8 || brw->gen == 9 in that case? I can't find evidence that the workaround applies to later hardware (and in fact the page cited has a different workaround for a later generation). ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 0/6] i965: GPGPU/3D pipeline switching fixes.
On Sun, Jan 3, 2016 at 1:47 AM, Francisco Jerez wrote: > The PIPELINE_SELECT command has a number of awkward restrictions we > don't currently take into account while switching between the GPGPU > and 3D pipeline, what in some cases can lead to corruption or hangs. > This series should implement all workarounds mentioned in the hardware > spec ("BXML » GT » MI » vol1a GPU Overview » [Instruction] > PIPELINE_SELECT [DevBWR+]") that seem to be relevant to us. I had a question about patch 2, and with that sorted out the series is Reviewed-by: Matt Turner ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 Bug ID: 93570 Summary: the image of llvmpipe has a low quality on arm (with too many points on it) Product: Mesa Version: 11.0 Hardware: ARM OS: Linux (All) Status: NEW Severity: normal Priority: medium Component: Drivers/X11 Assignee: mesa-dev@lists.freedesktop.org Reporter: icenowy...@gmail.com QA Contact: mesa-dev@lists.freedesktop.org Created attachment 120782 --> https://bugs.freedesktop.org/attachment.cgi?id=120782&action=edit The result on arm, the points are shown I've built a llvmpipe-enabled mesa on my Allwinner A33 (Quad Cortex-A7) device. I tried to run glmark2 on it, but I found that the image has a very very low quality. On my laptop (i5-3230m), when I forced LIBGL_ALWAYS_SOFTWARE, this problem does not occur. -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #1 from Icenowy Zheng --- The attachment is a JPEG, shot with my mobile phone -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
Am 03.01.2016 um 01:37 schrieb Ilia Mirkin: > Signed-off-by: Ilia Mirkin > --- > src/gallium/docs/source/tgsi.rst | 10 -- > 1 file changed, 8 insertions(+), 2 deletions(-) > > diff --git a/src/gallium/docs/source/tgsi.rst > b/src/gallium/docs/source/tgsi.rst > index 955ece8..f69998f 100644 > --- a/src/gallium/docs/source/tgsi.rst > +++ b/src/gallium/docs/source/tgsi.rst > @@ -458,7 +458,9 @@ while DDY is allowed to be the same for the entire 2x2 > quad. > > .. opcode:: PK2H - Pack Two 16-bit Floats > > - TBD > +.. math:: > + > + dst.x = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16 This doesn't quite match the tgsi info description (which says that the result is replicated). If you don't want channel replication probably should make that CHAN there instead. > > .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars > @@ -615,7 +617,11 @@ This instruction replicates its result. > > .. opcode:: UP2H - Unpack Two 16-Bit Floats > > - TBD > +.. math:: > + > + dst.x = f16\_to\_f32(src0.x \& 0xffff) > + > + dst.y = f16\_to\_f32(src0.x >> 16) > I'm certainly ok with that, albeit (just like PK2H unless you do replication) it's not what the original source for this opcode does (which would have been NV_fragment_program). For the series (with the first point addressed either way,though a tgsi exec implementation which should be trivial wouldn't hurt neither) Reviewed-by: Roland Scheidegger ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #2 from Icenowy Zheng --- icenowy [ ~ ] ! glmark2 -b build libGL error: unable to load driver: mali_drm_dri.so libGL error: driver pointer missing libGL error: failed to load driver: mali_drm ** GLX does not support GLX_EXT_swap_control or GLX_MESA_swap_control! ** Failed to set swap interval. Results may be bounded above by refresh rate. === glmark2 2014.03 === OpenGL Information GL_VENDOR: VMware, Inc. GL_RENDERER: Gallium 0.4 on llvmpipe (LLVM 3.7, 128 bits) GL_VERSION:3.0 Mesa 11.1.0 === ** GLX does not support GLX_EXT_swap_control or GLX_MESA_swap_control! ** Failed to set swap interval. Results may be bounded above by refresh rate. [build] : FPS: 13 FrameTime: 76.923 ms === glmark2 Score: 13 === Here's some log -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] st/mesa: fix parameter names for tesseval/tessctrl prototypes
Cc: Ilia Mirkin Signed-off-by: Samuel Pitoiset --- src/mesa/state_tracker/st_program.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index a8571f0..a745315 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -405,12 +405,12 @@ st_get_gp_variant(struct st_context *st, extern struct st_tcp_variant * st_get_tcp_variant(struct st_context *st, - struct st_tessctrl_program *stgp, + struct st_tessctrl_program *sttcp, const struct st_tcp_variant_key *key); extern struct st_tep_variant * st_get_tep_variant(struct st_context *st, - struct st_tesseval_program *stgp, + struct st_tesseval_program *sttep, const struct st_tep_variant_key *key); extern void @@ -427,11 +427,11 @@ st_release_gp_variants(struct st_context *st, extern void st_release_tcp_variants(struct st_context *st, -struct st_tessctrl_program *stgp); +struct st_tessctrl_program *sttcp); extern void st_release_tep_variants(struct st_context *st, -struct st_tesseval_program *stgp); +struct st_tesseval_program *sttep); extern void st_destroy_program_variants(struct st_context *st); -- 2.6.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
On Sun, Jan 3, 2016 at 12:33 PM, Roland Scheidegger wrote: > Am 03.01.2016 um 01:37 schrieb Ilia Mirkin: >> Signed-off-by: Ilia Mirkin >> --- >> src/gallium/docs/source/tgsi.rst | 10 -- >> 1 file changed, 8 insertions(+), 2 deletions(-) >> >> diff --git a/src/gallium/docs/source/tgsi.rst >> b/src/gallium/docs/source/tgsi.rst >> index 955ece8..f69998f 100644 >> --- a/src/gallium/docs/source/tgsi.rst >> +++ b/src/gallium/docs/source/tgsi.rst >> @@ -458,7 +458,9 @@ while DDY is allowed to be the same for the entire 2x2 >> quad. >> >> .. opcode:: PK2H - Pack Two 16-bit Floats >> >> - TBD >> +.. math:: >> + >> + dst.x = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16 > This doesn't quite match the tgsi info description (which says that the > result is > replicated). If you don't want channel replication probably should make > that CHAN > there instead. I'll add the replication to the docs. Looks like NV_fragment_program also wanted this: tmp0 = VectorLoad(op0); /* result obtained by combining raw bits of tmp0.x, tmp0.y */ result.x = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); result.y = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); result.z = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); result.w = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); But looks like it's just packing, not actually converting. And it's unclear whether UP2H is converting or not... let's assume that they do the conversions or else this is going to be useless. > > > >> >> .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars >> @@ -615,7 +617,11 @@ This instruction replicates its result. >> >> .. opcode:: UP2H - Unpack Two 16-Bit Floats >> >> - TBD >> +.. math:: >> + >> + dst.x = f16\_to\_f32(src0.x \& 0x) >> + >> + dst.y = f16\_to\_f32(src0.x >> 16) >> > I'm certainly ok with that, albeit (just like PK2H unless you do > replication) it's not what the original source for this opcode does > (which would have been NV_fragment_program). 
tmp = ScalarLoad(op0); result.x = (fp16) (RawBits(tmp) & 0xFFFF); result.y = (fp16) ((RawBits(tmp) >> 16) & 0xFFFF); result.z = (fp16) (RawBits(tmp) & 0xFFFF); result.w = (fp16) ((RawBits(tmp) >> 16) & 0xFFFF); Happy to add the .zw = .xy bit here as well. I was previously not aware that these ops came from NV_fragment_program, and instead assumed that they came from some incomplete attempt to do... something. (I guess it was for implementing NV_fragment_program ;) ) > > For the series (with the first point addressed either way,though a tgsi > exec implementation which should be trivial wouldn't hurt neither) > Reviewed-by: Roland Scheidegger Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I won't be able to enable the cap since it will still use gallivm by default for vertices. I have a gallivm implementation as well, but it hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or another, but it must have been on another box with a more recent LLVM. -ilia ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] st/mesa: fix parameter names for tesseval/tessctrl prototypes
Reviewed-by: Ilia Mirkin Can you guess where I copy-pasted these from? :) On Sun, Jan 3, 2016 at 12:47 PM, Samuel Pitoiset wrote: > Cc: Ilia Mirkin > Signed-off-by: Samuel Pitoiset > --- > src/mesa/state_tracker/st_program.h | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/src/mesa/state_tracker/st_program.h > b/src/mesa/state_tracker/st_program.h > index a8571f0..a745315 100644 > --- a/src/mesa/state_tracker/st_program.h > +++ b/src/mesa/state_tracker/st_program.h > @@ -405,12 +405,12 @@ st_get_gp_variant(struct st_context *st, > > extern struct st_tcp_variant * > st_get_tcp_variant(struct st_context *st, > - struct st_tessctrl_program *stgp, > + struct st_tessctrl_program *sttcp, > const struct st_tcp_variant_key *key); > > extern struct st_tep_variant * > st_get_tep_variant(struct st_context *st, > - struct st_tesseval_program *stgp, > + struct st_tesseval_program *sttep, > const struct st_tep_variant_key *key); > > extern void > @@ -427,11 +427,11 @@ st_release_gp_variants(struct st_context *st, > > extern void > st_release_tcp_variants(struct st_context *st, > -struct st_tessctrl_program *stgp); > +struct st_tessctrl_program *sttcp); > > extern void > st_release_tep_variants(struct st_context *st, > -struct st_tesseval_program *stgp); > +struct st_tesseval_program *sttep); > > extern void > st_destroy_program_variants(struct st_context *st); > -- > 2.6.4 > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] st/mesa: fix parameter names for tesseval/tessctrl prototypes
On 01/03/2016 07:03 PM, Ilia Mirkin wrote: Reviewed-by: Ilia Mirkin Can you guess where I copy-pasted these from? :) Two lines above? :-) On Sun, Jan 3, 2016 at 12:47 PM, Samuel Pitoiset wrote: Cc: Ilia Mirkin Signed-off-by: Samuel Pitoiset --- src/mesa/state_tracker/st_program.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index a8571f0..a745315 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -405,12 +405,12 @@ st_get_gp_variant(struct st_context *st, extern struct st_tcp_variant * st_get_tcp_variant(struct st_context *st, - struct st_tessctrl_program *stgp, + struct st_tessctrl_program *sttcp, const struct st_tcp_variant_key *key); extern struct st_tep_variant * st_get_tep_variant(struct st_context *st, - struct st_tesseval_program *stgp, + struct st_tesseval_program *sttep, const struct st_tep_variant_key *key); extern void @@ -427,11 +427,11 @@ st_release_gp_variants(struct st_context *st, extern void st_release_tcp_variants(struct st_context *st, -struct st_tessctrl_program *stgp); +struct st_tessctrl_program *sttcp); extern void st_release_tep_variants(struct st_context *st, -struct st_tesseval_program *stgp); +struct st_tesseval_program *sttep); extern void st_destroy_program_variants(struct st_context *st); -- 2.6.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 2/6] i965/gen8+: Invalidate color calc state when switching to the GPGPU pipeline.
Matt Turner writes: > On Sun, Jan 3, 2016 at 1:48 AM, Francisco Jerez wrote: >> This hardware bug can cause a hang on context restore while the >> current pipeline is set to GPGPU (BDWGFX HSD 1909593). In addition to >> clearing the valid bit, mark the CC state as dirty to make sure that >> the CC indirect state pointer is re-emitted when we switch back to the >> 3D pipeline. >> --- >> src/mesa/drivers/dri/i965/brw_misc_state.c | 20 >> 1 file changed, 20 insertions(+) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c >> b/src/mesa/drivers/dri/i965/brw_misc_state.c >> index cf6ba5b..7d53d18 100644 >> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c >> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c >> @@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum >> brw_pipeline pipeline) >> const uint32_t _3DSTATE_PIPELINE_SELECT = >>is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45; >> >> + if (brw->gen >= 8) { >> + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] >> + * PIPELINE_SELECT [DevBWR+]": >> + * >> + * Project: BDW, SKL > > I think we should restrict this block to brw->gen == 8 || brw->gen == > 9 in that case? > > I can't find evidence that the workaround applies to later hardware > (and in fact the page cited has a different workaround for a later > generation). Yeah, Gen10 will need a different workaround but I wasn't sure we could release the details already. Anyway I've changed the above locally to be limited to pre-Gen10 for now. Thanks. signature.asc Description: PGP signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 Ilia Mirkin changed: What|Removed |Added Attachment #120782|text/plain |image/jpeg mime type|| -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch
Am 03.01.2016 um 13:50 schrieb Oded Gabbay: > On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger > wrote: >> Am 31.12.2015 um 10:30 schrieb Oded Gabbay: >>> On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger >>> wrote: Am 30.12.2015 um 10:46 schrieb Oded Gabbay: > On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger > wrote: >> >> So, if I see that right, you will automatically generate binaries using >> power8 instructions if compiled on power8 capable box, which then won't >> run on boxes not supporting power8? Is that really what you want? >> Maybe some runtime detection would be a good idea (though I don't know >> if anyone cares about power7)? > > The problem is I don't think I can eliminate the build time check > (although I would very much like to) because I need: > 1. To pass a special flag to the GCC compiler: -mpower8-vector > 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic > > Without those two things, I won't be able to use vec_vgbbd which I > need to implement the _mm_movemask_epi8 efficiently, and without that, > all this patch series can be thrown out the window. The emulation of > _mm_movemask_epi8 using regular instructions is just horrible. > > You are correct that once you build a binary with this flag on power8 > machine, that binary won't run on power7 machine. You get "cannot > execute binary file" > > Unfortunately, I don't see a way around this because even if I > condition the use of vec_vgbbd on a runtime check/define, the library > still won't be executable because it was built with -mpower8-vector. > > Having said that, because I *assume* IBM right now mostly cares about > Linux running on POWER8 with little-endian, I think it is a fair > compromise. Note I don't have anything against a build time check. My concern here is something along the lines of unsuspecting distros shipping binaries which won't work, as it looks to me like this will get picked up automatically. 
That is different to how for instance sse41 is handled. That is I believe this should only get enabled if someone has specified some -mcpu=power8 or whatever flag explicitly somewhere already. Roland >>> >>> I understand and I share your concern. Maybe we should add >>> "--disable-pwr8-inst" to mesa's configure ? if that flag is given to >>> configure, it would disable the optimization code (won't add >>> _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags). >>> >>> What do you think ? >> If the generated code with all automatically picked up compile options >> really doesn't run on power7 just because of this, I think it would be >> nicer if this were an explicit enable. >> >> Roland >> > > So the problem is a bit worse then that and requires a harsher solution. > Apparently, when that compiler flag (power8-vector) is given to GCC, > GCC uses POWER8-only instructions in other places as well! What I have > seen so far, is that it uses such instructions in the implementation > of exp2() and/or log2() (in f.cpp) and also saw it in > __glXInitializeVisualConfigFromTags(). The instructions used are not > vector instructions, but floating point instructions, which were added > only in PowerISA 2.07 > > Therefore, I think that for now, I will limit the entire optimization > code to POWER8 *and* Little-Endian. Because ppc64le packages can > *only* run on POWER8 systems, and because you can't transfer binaries > between LE and BE machines, this workaround eliminates the danger of > crashing on "illegal instruction". In addition, there is no more need > for runtime checks. > > I hope you agree that with this change, it is better to enable the > power8-vector by default when building on POWER8 machine installed > with Linux LE. For all other archs it will be disabled by default. Yes, that looks reasonable. > I will try to contact IBM GCC devs to see how we can overcome this > problem (or if they even care) so I can expand these optimizations to > BE as well. 
IIRC this is problematic for things like sse41 etc. as well, it is just how gcc works. I think the typical workaround is to move code using intrinsics which need special compile flags to their own file (or rather compile unit), which you then can compile with those flags. (This implies of course separate functions, that is you can't have any run-time check plus the assembly in the same file or function.) This is how mesa handles _mesa_streaming_load_memcpy for the i965 driver. Roland > > I will send the revised patches shortly. > >Oded > >> >> >> >>> >>> Oded >>> > > Oded > >> So far we didn't bother with that for SSE >> but it has to be said SSE2 is a really low bar (and the manual assembly >> stuff doesn't use anything more advanced, even though clearly things >> like the emulated mm_mullo_epi32 are suboptimal if your c
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #3 from Ilia Mirkin --- A fairer comparison would be LIBGL_ALWAYS_SOFTWARE=1 LP_NATIVE_VECTOR_WIDTH=128 Since I assume it will otherwise use 256-bit wide vectors on your intel CPU. But it appears that the rendering is actually incorrect there. My personal guess is that llvm on arm has some issues, but this is based purely on the fact that you're seeing misrendering :) It may be worthwhile to build mesa-git and llvm-svn and see if the issue persists. (mesa-git should be able to build against the llvm head, while released mesa probably won't.) -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch
On Sun, Jan 3, 2016 at 8:14 PM, Roland Scheidegger wrote: > Am 03.01.2016 um 13:50 schrieb Oded Gabbay: >> On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger >> wrote: >>> Am 31.12.2015 um 10:30 schrieb Oded Gabbay: On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger wrote: > Am 30.12.2015 um 10:46 schrieb Oded Gabbay: >> On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger >> wrote: >>> >>> So, if I see that right, you will automatically generate binaries using >>> power8 instructions if compiled on power8 capable box, which then won't >>> run on boxes not supporting power8? Is that really what you want? >>> Maybe some runtime detection would be a good idea (though I don't know >>> if anyone cares about power7)? >> >> The problem is I don't think I can eliminate the build time check >> (although I would very much like to) because I need: >> 1. To pass a special flag to the GCC compiler: -mpower8-vector >> 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic >> >> Without those two things, I won't be able to use vec_vgbbd which I >> need to implement the _mm_movemask_epi8 efficiently, and without that, >> all this patch series can be thrown out the window. The emulation of >> _mm_movemask_epi8 using regular instructions is just horrible. >> >> You are correct that once you build a binary with this flag on power8 >> machine, that binary won't run on power7 machine. You get "cannot >> execute binary file" >> >> Unfortunately, I don't see a way around this because even if I >> condition the use of vec_vgbbd on a runtime check/define, the library >> still won't be executable because it was built with -mpower8-vector. >> >> Having said that, because I *assume* IBM right now mostly cares about >> Linux running on POWER8 with little-endian, I think it is a fair >> compromise. > > Note I don't have anything against a build time check. 
My concern here > is something along the lines of unsuspecting distros shipping binaries > which won't work, as it looks to me like this will get picked up > automatically. That is different to how for instance sse41 is handled. > That is I believe this should only get enabled if someone has specified > some -mcpu=power8 or whatever flag explicitly somewhere already. > > Roland I understand and I share your concern. Maybe we should add "--disable-pwr8-inst" to mesa's configure ? if that flag is given to configure, it would disable the optimization code (won't add _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags). What do you think ? >>> If the generated code with all automatically picked up compile options >>> really doesn't run on power7 just because of this, I think it would be >>> nicer if this were an explicit enable. >>> >>> Roland >>> >> >> So the problem is a bit worse then that and requires a harsher solution. >> Apparently, when that compiler flag (power8-vector) is given to GCC, >> GCC uses POWER8-only instructions in other places as well! What I have >> seen so far, is that it uses such instructions in the implementation >> of exp2() and/or log2() (in f.cpp) and also saw it in >> __glXInitializeVisualConfigFromTags(). The instructions used are not >> vector instructions, but floating point instructions, which were added >> only in PowerISA 2.07 >> >> Therefore, I think that for now, I will limit the entire optimization >> code to POWER8 *and* Little-Endian. Because ppc64le packages can >> *only* run on POWER8 systems, and because you can't transfer binaries >> between LE and BE machines, this workaround eliminates the danger of >> crashing on "illegal instruction". In addition, there is no more need >> for runtime checks. >> >> I hope you agree that with this change, it is better to enable the >> power8-vector by default when building on POWER8 machine installed >> with Linux LE. For all other archs it will be disabled by default. 
> > Yes, that looks reasonable. Thanks. > >> I will try to contact IBM GCC devs to see how we can overcome this >> problem (or if they even care) so I can expand these optimizations to >> BE as well. > IIRC this is problematic for things like sse41 etc. as well, it is just > how gcc works. I think the typical workaround is to move code using > intrinsics which need special compile flags to their own file (or rather > compile unit), which you then can compile with those flags. (This > implies of course separate functions, that is you can't have any > run-time check plus the assembly in the same file or function.) > This is how mesa handles _mesa_streaming_load_memcpy for the i965 driver. > > Roland > Yeah, I imagined as much, but I didn't know this technique was already in use in mesa. I hate this fragmentation of code, and I think a better solution, at the compiler level, is to have some kind of flag which te
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #4 from Icenowy Zheng --- Of course I am not benchmarking. But the image *does* misrender. It would take me days to build an llvm and mesa combination, and the build may fail (as my device has only 512MB RAM) -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
Am 03.01.2016 um 19:02 schrieb Ilia Mirkin: > On Sun, Jan 3, 2016 at 12:33 PM, Roland Scheidegger > wrote: >> Am 03.01.2016 um 01:37 schrieb Ilia Mirkin: >>> Signed-off-by: Ilia Mirkin >>> --- >>> src/gallium/docs/source/tgsi.rst | 10 -- >>> 1 file changed, 8 insertions(+), 2 deletions(-) >>> >>> diff --git a/src/gallium/docs/source/tgsi.rst >>> b/src/gallium/docs/source/tgsi.rst >>> index 955ece8..f69998f 100644 >>> --- a/src/gallium/docs/source/tgsi.rst >>> +++ b/src/gallium/docs/source/tgsi.rst >>> @@ -458,7 +458,9 @@ while DDY is allowed to be the same for the entire 2x2 >>> quad. >>> >>> .. opcode:: PK2H - Pack Two 16-bit Floats >>> >>> - TBD >>> +.. math:: >>> + >>> + dst.x = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16 >> This doesn't quite match the tgsi info description (which says that the >> result is >> replicated). If you don't want channel replication probably should make >> that CHAN >> there instead. > > I'll add the replication to the docs. Looks like NV_fragment_program > also wanted this: > > tmp0 = VectorLoad(op0); > /* result obtained by combining raw bits of tmp0.x, tmp0.y */ > result.x = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); > result.y = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); > result.z = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); > result.w = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16); > > But looks like it's just packing, not actually converting. And it's > unclear whether UP2H is converting or not... let's assume that they do > the conversions or else this is going to be useless. I don't think that's quite true it only packs (the pseudo-code is probably a bit sloppy...), given what nv30 could do this doesn't make sense. Also, UP2H clearly states "...undoes the type conversion and packing performed by the PK2H instruction". Albeit the pseudo-code doesn't really mention float anywhere there neither. 
I think though this is due to the possibility of the src (for pk2h) or dst (for up2h) being either a float or half reg, so in the latter case you wouldn't get any conversion (but don't quote me on that...). > >> >> >> >>> >>> .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars >>> @@ -615,7 +617,11 @@ This instruction replicates its result. >>> >>> .. opcode:: UP2H - Unpack Two 16-Bit Floats >>> >>> - TBD >>> +.. math:: >>> + >>> + dst.x = f16\_to\_f32(src0.x \& 0x) >>> + >>> + dst.y = f16\_to\_f32(src0.x >> 16) >>> >> I'm certainly ok with that, albeit (just like PK2H unless you do >> replication) it's not what the original source for this opcode does >> (which would have been NV_fragment_program). > > tmp = ScalarLoad(op0); > result.x = (fp16) (RawBits(tmp) & 0x); > result.y = (fp16) ((RawBits(tmp) >> 16) & 0x); > result.z = (fp16) (RawBits(tmp) & 0x); > result.w = (fp16) ((RawBits(tmp) >> 16) & 0x); > > Happy to add the .zw = .xy bit here as well. I was previously not > aware that these ops came from NV_fragment_program, and instead > assumed that they came from some incomplete attempt to do... > something. (I guess it was for implementing NV_fragment_program ;) ) Yes. I don't think any real effort was really ever made to support it, but tgsi was supposed to provide a superset of all available opcodes coming from somewhere (be it gl extensions or coming from d3d9) then. There's actually an ooold branch sitting on fdo where Michal removed support for all these opcodes, but it was never merged (http://cgit.freedesktop.org/mesa/mesa/commit/?id=5efeade4dc7ffe2d10b231b56fac60dbaa8aa0c8) So, if you want slightly different semantics that should be fine, but if the original ones aren't annoying could of course just stick to them. Roland > >> >> For the series (with the first point addressed either way,though a tgsi >> exec implementation which should be trivial wouldn't hurt neither) >> Reviewed-by: Roland Scheidegger > > Thanks! 
I'll do a patch for that shortly (tgsi_exec). Unfortunately I > won't be able to enable the cap since it will still use gallivm by > default for vertices. I have a gallivm implementation as well, but it > hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or > another, but it must have been on another box with a more recent LLVM. Ah right. f16 conversion is pretty annoying indeed, though I'd hope the helpers for that should work. In any case, I only really suggested that because I'd thought it would be trivial, so if it's not I don't consider that important... Roland ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger wrote: >>> For the series (with the first point addressed either way,though a tgsi >>> exec implementation which should be trivial wouldn't hurt neither) >>> Reviewed-by: Roland Scheidegger >> >> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I >> won't be able to enable the cap since it will still use gallivm by >> default for vertices. I have a gallivm implementation as well, but it >> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or >> another, but it must have been on another box with a more recent LLVM. > > Ah right. f16 conversion is pretty annoying indeed, though I'd hope the > helpers for that should work. In any case, I only really suggested that > because I'd thought it would be trivial, so if it's not I don't consider > that important... I'll send it out as a separate series, including my (semi?) broken gallivm impl and leave it to you to fix it if you care, or ignore if you don't. (I already have it, so might as well...) I understand neither how LLVM works, nor how gallivm uses LLVM, which isn't a great combination :) ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch
Am 03.01.2016 um 19:20 schrieb Oded Gabbay: > On Sun, Jan 3, 2016 at 8:14 PM, Roland Scheidegger wrote: >> Am 03.01.2016 um 13:50 schrieb Oded Gabbay: >>> On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger >>> wrote: Am 31.12.2015 um 10:30 schrieb Oded Gabbay: > On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger > wrote: >> Am 30.12.2015 um 10:46 schrieb Oded Gabbay: >>> On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger >>> wrote: So, if I see that right, you will automatically generate binaries using power8 instructions if compiled on power8 capable box, which then won't run on boxes not supporting power8? Is that really what you want? Maybe some runtime detection would be a good idea (though I don't know if anyone cares about power7)? >>> >>> The problem is I don't think I can eliminate the build time check >>> (although I would very much like to) because I need: >>> 1. To pass a special flag to the GCC compiler: -mpower8-vector >>> 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic >>> >>> Without those two things, I won't be able to use vec_vgbbd which I >>> need to implement the _mm_movemask_epi8 efficiently, and without that, >>> all this patch series can be thrown out the window. The emulation of >>> _mm_movemask_epi8 using regular instructions is just horrible. >>> >>> You are correct that once you build a binary with this flag on power8 >>> machine, that binary won't run on power7 machine. You get "cannot >>> execute binary file" >>> >>> Unfortunately, I don't see a way around this because even if I >>> condition the use of vec_vgbbd on a runtime check/define, the library >>> still won't be executable because it was built with -mpower8-vector. >>> >>> Having said that, because I *assume* IBM right now mostly cares about >>> Linux running on POWER8 with little-endian, I think it is a fair >>> compromise. >> >> Note I don't have anything against a build time check. 
My concern here >> is something along the lines of unsuspecting distros shipping binaries >> which won't work, as it looks to me like this will get picked up >> automatically. That is different to how for instance sse41 is handled. >> That is I believe this should only get enabled if someone has specified >> some -mcpu=power8 or whatever flag explicitly somewhere already. >> >> Roland > > I understand and I share your concern. Maybe we should add > "--disable-pwr8-inst" to mesa's configure ? if that flag is given to > configure, it would disable the optimization code (won't add > _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags). > > What do you think ? If the generated code with all automatically picked up compile options really doesn't run on power7 just because of this, I think it would be nicer if this were an explicit enable. Roland >>> >>> So the problem is a bit worse then that and requires a harsher solution. >>> Apparently, when that compiler flag (power8-vector) is given to GCC, >>> GCC uses POWER8-only instructions in other places as well! What I have >>> seen so far, is that it uses such instructions in the implementation >>> of exp2() and/or log2() (in f.cpp) and also saw it in >>> __glXInitializeVisualConfigFromTags(). The instructions used are not >>> vector instructions, but floating point instructions, which were added >>> only in PowerISA 2.07 >>> >>> Therefore, I think that for now, I will limit the entire optimization >>> code to POWER8 *and* Little-Endian. Because ppc64le packages can >>> *only* run on POWER8 systems, and because you can't transfer binaries >>> between LE and BE machines, this workaround eliminates the danger of >>> crashing on "illegal instruction". In addition, there is no more need >>> for runtime checks. >>> >>> I hope you agree that with this change, it is better to enable the >>> power8-vector by default when building on POWER8 machine installed >>> with Linux LE. 
For all other archs it will be disabled by default. >> >> Yes, that looks reasonable. > > Thanks. > >> >>> I will try to contact IBM GCC devs to see how we can overcome this >>> problem (or if they even care) so I can expand these optimizations to >>> BE as well. >> IIRC this is problematic for things like sse41 etc. as well, it is just >> how gcc works. I think the typical workaround is to move code using >> intrinsics which need special compile flags to their own file (or rather >> compile unit), which you then can compile with those flags. (This >> implies of course separate functions, that is you can't have any >> run-time check plus the assembly in the same file or function.) >> This is how mesa handles _mesa_streaming_load_memcpy for the i965 driver. >> >> Roland >> > Yeah, I imagined as much, but I didn't know this technique was al
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #5 from Roland Scheidegger --- This looks rather interesting, like a shuffle gone wrong (always affects the same 3 pixel in 4x4 pixel stamp). This chip does have NEON instructions right? I think llvm used to have quite some problems if it needed to lower all the vector code to scalars (not to mention the horrific performance). Theoretically llvmpipe should work pretty ok on arm (albeit there's no arm specific optimizations, so slower than possible) but there were spurious reports of it not working well earlier (not many people try, it seems). It could also well be a llvm bug, I'd suggest a newer version if you're not already using the latest. -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin wrote: > On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger wrote: For the series (with the first point addressed either way,though a tgsi exec implementation which should be trivial wouldn't hurt neither) Reviewed-by: Roland Scheidegger >>> >>> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I >>> won't be able to enable the cap since it will still use gallivm by >>> default for vertices. I have a gallivm implementation as well, but it >>> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or >>> another, but it must have been on another box with a more recent LLVM. >> >> Ah right. f16 conversion is pretty annoying indeed, though I'd hope the >> helpers for that should work. In any case, I only really suggested that >> because I'd thought it would be trivial, so if it's not I don't consider >> that important... > > I'll send it out as a separate series, including my (semi?) broken > gallivm impl and leave it to you to fix it if you care, or ignore if > you don't. (I already have it, so might as well...) I understand > neither how LLVM works, nor how gallivm uses LLVM, which isn't a great > combination :) And of course the piglits expect out-of-bounds numbers to be represented as infinities, instead of the clamped value, which is what util_float_to_half does :( ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 6/6] r600: add support for PK2H/UP2H
Glenn (or anyone else interested in r600): feel free to take this patch over. I've pushed the core (and nvc0) support, but due to Roland's feedback I changed the return values to be replicated(ish), not 100% sure how to do that and it seems best to leave it to you. On Sat, Jan 2, 2016 at 7:38 PM, Ilia Mirkin wrote: > Signed-off-by: Ilia Mirkin > --- > src/gallium/drivers/r600/r600_pipe.c | 2 +- > src/gallium/drivers/r600/r600_shader.c | 102 > +++-- > 2 files changed, 99 insertions(+), 5 deletions(-) > > diff --git a/src/gallium/drivers/r600/r600_pipe.c > b/src/gallium/drivers/r600/r600_pipe.c > index 70c1ec1..359fe41 100644 > --- a/src/gallium/drivers/r600/r600_pipe.c > +++ b/src/gallium/drivers/r600/r600_pipe.c > @@ -328,6 +328,7 @@ static int r600_get_param(struct pipe_screen* pscreen, > enum pipe_cap param) > case PIPE_CAP_TEXTURE_QUERY_LOD: > case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: > case PIPE_CAP_SAMPLER_VIEW_TARGET: > + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: > return family >= CHIP_CEDAR ? 1 : 0; > case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: > return family >= CHIP_CEDAR ? 
4 : 0; > @@ -351,7 +352,6 @@ static int r600_get_param(struct pipe_screen* pscreen, > enum pipe_cap param) > case PIPE_CAP_DRAW_PARAMETERS: > case PIPE_CAP_MULTI_DRAW_INDIRECT: > case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: > - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: > return 0; > > case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: > diff --git a/src/gallium/drivers/r600/r600_shader.c > b/src/gallium/drivers/r600/r600_shader.c > index d411b0b..23ea34e 100644 > --- a/src/gallium/drivers/r600/r600_shader.c > +++ b/src/gallium/drivers/r600/r600_shader.c > @@ -8959,6 +8959,100 @@ static int tgsi_umad(struct r600_shader_ctx *ctx) > return 0; > } > > +static int tgsi_pk2h(struct r600_shader_ctx *ctx) > +{ > + struct tgsi_full_instruction *inst = > &ctx->parse.FullToken.FullInstruction; > + struct r600_bytecode_alu alu; > + int r; > + > + /* temp.xy = f32_to_f16(src) */ > + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); > + alu.op = ALU_OP1_FLT32_TO_FLT16; > + alu.dst.chan = 0; > + alu.dst.sel = ctx->temp_reg; > + alu.dst.write = 1; > + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); > + r = r600_bytecode_add_alu(ctx->bc, &alu); > + if (r) > + return r; > + alu.dst.chan = 1; > + r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); > + alu.last = 1; > + r = r600_bytecode_add_alu(ctx->bc, &alu); > + if (r) > + return r; > + > + /* dst.x = temp.y * 0x10000 + temp.x */ > + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); > + alu.op = ALU_OP3_MULADD_UINT24; > + alu.is_op3 = 1; > + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); > + alu.last = 1; > + alu.src[0].sel = ctx->temp_reg; > + alu.src[0].chan = 1; > + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; > + alu.src[1].value = 0x10000; > + alu.src[2].sel = ctx->temp_reg; > + alu.src[2].chan = 0; > + r = r600_bytecode_add_alu(ctx->bc, &alu); > + if (r) > + return r; > + > + return 0; > +} > + > +static int tgsi_up2h(struct r600_shader_ctx *ctx) > +{ > + struct tgsi_full_instruction *inst = > &ctx->parse.FullToken.FullInstruction; > + struct r600_bytecode_alu 
alu; > + int r; > + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); > + > + /* temp.x = src.x */ > + /* note: no need to mask out the high bits */ > + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); > + alu.op = ALU_OP1_MOV; > + alu.dst.chan = 0; > + alu.dst.sel = ctx->temp_reg; > + alu.dst.write = 1; > + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); > + r = r600_bytecode_add_alu(ctx->bc, &alu); > + if (r) > + return r; > + > + /* temp.y = src.x >> 16 */ > + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); > + alu.op = ALU_OP2_LSHR_INT; > + alu.dst.chan = 1; > + alu.dst.sel = ctx->temp_reg; > + alu.dst.write = 1; > + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); > + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; > + alu.src[1].value = 16; > + alu.last = 1; > + r = r600_bytecode_add_alu(ctx->bc, &alu); > + if (r) > + return r; > + > + /* dst.xy = f16_to_f32(temp.xy) */ > + for (int i = 0; i < lasti + 1; i++) { > + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) > + continue; > + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); > + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); > + alu.op = ALU_OP1_FLT16_TO_FLT32; > + alu.src[0].sel = ctx->temp_reg; > +
[Mesa-dev] [PATCH 2/2] WIP tgsi: add PK2H/UP2H support
It seems like there's something horribly wrong with the util_float_to_half function. In a standalone compilation it works fine for -6.10203e-05, generating 0x8400, but inside mesa it ends up with 0x8000. The result of the magic.f multiplication is 0. No idea why. Signed-off-by: Ilia Mirkin --- See above comments for why I didn't include this at all. Should you figure out what was going wrong, note that this can only be enabled in softpipe if the whole pipeline is using tgsi_exec (or if the gallivm patch is fixed/upstreamed). Feel free to take this over and/or modify as necessary. src/gallium/auxiliary/tgsi/tgsi_exec.c | 44 ++-- src/gallium/drivers/softpipe/sp_screen.c | 2 +- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index f67c162..12a477b 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -58,6 +58,7 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" +#include "util/u_half.h" #include "util/u_memory.h" #include "util/u_math.h" @@ -3058,6 +3059,45 @@ exec_dp2(struct tgsi_exec_machine *mach, } static void +exec_pk2h(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + unsigned int chan; + union tgsi_exec_channel arg[2], dst; + + fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT); + fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT); + for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) { + dst.u[chan] = util_float_to_half(arg[0].f[chan]) | + (util_float_to_half(arg[1].f[chan]) << 16); + } + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT); + } + } +} + +static void +exec_up2h(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + unsigned int chan; + union 
tgsi_exec_channel arg, dst[2]; + + fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT); + for (chan = 0; chan < 4; chan++) { + dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff); + dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16); + } + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT); + } + } +} + +static void exec_scs(struct tgsi_exec_machine *mach, const struct tgsi_full_instruction *inst) { @@ -4339,7 +4379,7 @@ exec_instruction( break; case TGSI_OPCODE_PK2H: - assert (0); + exec_pk2h(mach, inst); break; case TGSI_OPCODE_PK2US: @@ -4425,7 +4465,7 @@ exec_instruction( break; case TGSI_OPCODE_UP2H: - assert (0); + exec_up2h(mach, inst); break; case TGSI_OPCODE_UP2US: diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index e74044b..d4526ef 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -236,6 +236,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_TEXTURE_FLOAT_LINEAR: case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: return 1; case PIPE_CAP_VERTEXID_NOBASE: return 0; @@ -252,7 +253,6 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: case PIPE_CAP_MULTI_DRAW_INDIRECT: case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: return 0; -- 2.4.10 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] WIP gallivm: add support for PK2H/UP2H
This hits assertion failures on LLVM 3.5 Signed-off-by: Ilia Mirkin --- It definitely worked at one point or another, but it might have been with a later LLVM version and/or on a different CPU. On my i7-920 with LLVM 3.5 I definitely get assertion errors from inside LLVM. Any interested party can take this patch over and fix it as they see fit. Or ignore it. src/gallium/auxiliary/gallivm/lp_bld_tgsi.c| 1 - src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 87 ++ src/gallium/drivers/llvmpipe/lp_screen.c | 2 +- 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c index c88dfbf..1cbe47c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c @@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm( /* Ignore deprecated instructions */ switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_UP2H: case TGSI_OPCODE_UP2US: case TGSI_OPCODE_UP4B: case TGSI_OPCODE_UP4UB: diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c index 3d5e2cb..ac3298d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c @@ -1020,6 +1020,88 @@ static void dfrac_emit( emit_data->args[0], tmp, ""); } +static void +pk2h_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + /* src0.x */ + emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, +0, TGSI_CHAN_X); + /* src0.y */ + emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, +0, TGSI_CHAN_Y); +} + +static void +emit_pk2h(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMContextRef context = bld_base->base.gallivm->context; + struct lp_build_context *uint_bld = 
&bld_base->uint_bld; + LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context), + bld_base->base.type.length); + LLVMTypeRef i16 = LLVMVectorType(LLVMInt16TypeInContext(context), +bld_base->base.type.length); + LLVMValueRef const16 = lp_build_const_vec(uint_bld->gallivm, uint_bld->type, + 16); + + LLVMValueRef low = LLVMBuildFPTrunc( + builder, emit_data->args[0], fp16, ""); + LLVMValueRef high = LLVMBuildFPTrunc( + builder, emit_data->args[1], fp16, ""); + + low = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, low, i16, ""), + uint_bld->vec_type, ""); + high = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, high, i16, ""), +uint_bld->vec_type, ""); + + emit_data->output[emit_data->chan] = + LLVMBuildOr(builder, low, LLVMBuildShl(builder, high, const16, ""), ""); +} + +static void +up2h_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + /* src0.x */ + emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, +0, TGSI_CHAN_X); +} + +static void +emit_up2h(const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + LLVMContextRef context = bld_base->base.gallivm->context; + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context), + bld_base->base.type.length); + LLVMTypeRef i16 = LLVMVectorType(LLVMInt16TypeInContext(context), +bld_base->base.type.length); + LLVMValueRef const16 = lp_build_const_vec(uint_bld->gallivm, uint_bld->type, + 16); + + LLVMValueRef input = LLVMBuildBitCast( + builder, emit_data->args[0], bld_base->base.int_vec_type, ""); + int i; + + for (i = 0; i < 2; i++) { + LLVMValueRef val = input; + if (i == 1) + val = LLVMBuildLShr(builder, val, const16, ""); + val = LLVMBuildTrunc(builder, val, i16, ""); + val = LLVMBuildBitCast(builder, val, fp16, ""); + emit_data->output[i] = + 
LLVMBuildFPExt(builder, val, bld_base->base.vec_type, ""); + } +} + void lp_set_default_actions(struct lp_build_tgsi_context * bld_base) { @@ -1093,6 +1175,11 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) bld_base->op_actions[TGSI_OPCODE_DRCP].emit = drcp_emit; bld_base->op_
Re: [Mesa-dev] [PATCH 2/6] i965/gen8+: Invalidate color calc state when switching to the GPGPU pipeline.
On Saturday, January 2, 2016 10:48:01 PM PST Francisco Jerez wrote: > This hardware bug can cause a hang on context restore while the > current pipeline is set to GPGPU (BDWGFX HSD 1909593). In addition to > clearing the valid bit, mark the CC state as dirty to make sure that > the CC indirect state pointer is re-emitted when we switch back to the > 3D pipeline. > --- > src/mesa/drivers/dri/i965/brw_misc_state.c | 20 > 1 file changed, 20 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/ dri/i965/brw_misc_state.c > index cf6ba5b..7d53d18 100644 > --- a/src/mesa/drivers/dri/i965/brw_misc_state.c > +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c > @@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) > const uint32_t _3DSTATE_PIPELINE_SELECT = >is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45; > > + if (brw->gen >= 8) { > + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] > + * PIPELINE_SELECT [DevBWR+]": How about: /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: (We try to cite the public docs where possible.) Patches 1-2 are: Reviewed-by: Kenneth Graunke Thanks for fixing this - good catch! signature.asc Description: This is a digitally signed message part. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 3/6] i965/gen6-7: Implement stall and flushes required prior to switching pipelines.
On Saturday, January 2, 2016 10:48:02 PM PST Francisco Jerez wrote: > Switching the current pipeline while it's not completely idle or the > read and write caches aren't flushed can lead to corruption. Fixes > misrendering of at least the following Khronos CTS test: > > ES31-CTS.shader_image_load_store.basic-allTargets-store-fs > > The stall and flushes are no longer required on Gen8+. > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93323 > --- > src/mesa/drivers/dri/i965/brw_misc_state.c | 28 +++ + > 1 file changed, 28 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/ dri/i965/brw_misc_state.c > index 7d53d18..75540c1 100644 > --- a/src/mesa/drivers/dri/i965/brw_misc_state.c > +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c > @@ -886,6 +886,34 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) > > brw->ctx.NewDriverState |= BRW_NEW_CC_STATE; >} > + > + } else if (brw->gen >= 6) { > + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] > + * PIPELINE_SELECT [DevBWR+]": Can we cite the public docs? > + * > + * Project: DEVSNB+ > + * > + * Software must ensure all the write caches are flushed through a > + * stalling PIPE_CONTROL command followed by another PIPE_CONTROL > + * command to invalidate read only caches prior to programming > + * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. > + */ > + const unsigned dc_flush = > + brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0; I was going to suggest doing a brw_emit_post_sync_nonzero_flush first on Sandybridge, but I forgot that we now just emit that at the start of every state upload. Fairly moot anyway since we don't do GPGPU on Sandybridge anyway. 
> + > + brw_emit_pipe_control_flush(brw, > + PIPE_CONTROL_RENDER_TARGET_FLUSH | > + PIPE_CONTROL_DEPTH_CACHE_FLUSH | > + dc_flush | > + PIPE_CONTROL_NO_WRITE | > + PIPE_CONTROL_CS_STALL); Why RENDER_TARGET_FLUSH, DEPTH_CACHE_FLUSH, DATA_CACHE_INVALIDATE, and NO_WRITE? The cited workaround explains a CS Stall and the RO invalidations below, but I'm not seeing why the others are needed. > + > + brw_emit_pipe_control_flush(brw, > + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | > + PIPE_CONTROL_CONST_CACHE_INVALIDATE | > + PIPE_CONTROL_STATE_CACHE_INVALIDATE | > + PIPE_CONTROL_INSTRUCTION_INVALIDATE | > + PIPE_CONTROL_NO_WRITE); > } > > /* Select the pipeline */ > signature.asc Description: This is a digitally signed message part. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 4/6] i965/gen4-5: Emit MI_FLUSH as required prior to switching pipelines.
On Saturday, January 2, 2016 10:48:03 PM PST Francisco Jerez wrote: > AFAIK brw_emit_select_pipeline() is only called once during context > init on Gen4-5, at which point the pipeline is likely to be already > idle so it may just happen to work by luck regardless of the MI_FLUSH. > --- > src/mesa/drivers/dri/i965/brw_misc_state.c | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/ dri/i965/brw_misc_state.c > index 75540c1..e5af1da 100644 > --- a/src/mesa/drivers/dri/i965/brw_misc_state.c > +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c > @@ -914,6 +914,19 @@ brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) >PIPE_CONTROL_STATE_CACHE_INVALIDATE | >PIPE_CONTROL_INSTRUCTION_INVALIDATE | >PIPE_CONTROL_NO_WRITE); > + > + } else { > + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] > + * PIPELINE_SELECT [DevBWR+]": > + * > + * Project: PRE-DEVSNB > + * > + * Software must ensure the current pipeline is flushed via an > + * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT. > + */ > + BEGIN_BATCH(1); > + OUT_BATCH(MI_FLUSH); > + ADVANCE_BATCH(); > } > > /* Select the pipeline */ > Patches 4-5 are: Reviewed-by: Kenneth Graunke Patch 6 already has Matt's review, so I'm going to leave it be. signature.asc Description: This is a digitally signed message part. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600g: Add support for PK2H/UP2H
Based off of Ilia's original patch, but with output values replicated so that it matches the TGSI semantics. Signed-off-by: Glenn Kennard --- src/gallium/drivers/r600/r600_pipe.c | 2 +- src/gallium/drivers/r600/r600_shader.c | 107 +++-- 2 files changed, 104 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index d71082f..3b5d26c 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -328,6 +328,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_QUERY_LOD: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: case PIPE_CAP_SAMPLER_VIEW_TARGET: + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: return family >= CHIP_CEDAR ? 1 : 0; case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return family >= CHIP_CEDAR ? 4 : 0; @@ -349,7 +350,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_CLEAR_TEXTURE: case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 9c040ae..7b1eade 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -8960,6 +8960,105 @@ static int tgsi_umad(struct r600_shader_ctx *ctx) return 0; } +static int tgsi_pk2h(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int r, i; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + + /* temp.xy = f32_to_f16(src) */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_FLT32_TO_FLT16; + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + 
alu.dst.chan = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* dst.x = temp.y * 0x10000 + temp.x */ + for (i = 0; i < lasti + 1; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP3_MULADD_UINT24; + alu.is_op3 = 1; + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.last = i == lasti; + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 1; + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 0x10000; + alu.src[2].sel = ctx->temp_reg; + alu.src[2].chan = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + return 0; +} + +static int tgsi_up2h(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int r, i; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + + /* temp.x = src.x */ + /* note: no need to mask out the high bits */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* temp.y = src.x >> 16 */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_LSHR_INT; + alu.dst.chan = 1; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 16; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* dst.wz = dst.xy = f16_to_f32(temp.xy) */ + for (i = 0; i < lasti + 1; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.op = ALU_OP1_FLT16_TO_FLT32; + 
alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = i % 2; + alu.last = i == lasti; + r = r600_bytecod
Re: [Mesa-dev] [PATCH 3/6] i965/gen6-7: Implement stall and flushes required prior to switching pipelines.
Kenneth Graunke writes: > On Saturday, January 2, 2016 10:48:02 PM PST Francisco Jerez wrote: >> Switching the current pipeline while it's not completely idle or the >> read and write caches aren't flushed can lead to corruption. Fixes >> misrendering of at least the following Khronos CTS test: >> >> ES31-CTS.shader_image_load_store.basic-allTargets-store-fs >> >> The stall and flushes are no longer required on Gen8+. >> >> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93323 >> --- >> src/mesa/drivers/dri/i965/brw_misc_state.c | 28 +++ > + >> 1 file changed, 28 insertions(+) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/ > dri/i965/brw_misc_state.c >> index 7d53d18..75540c1 100644 >> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c >> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c >> @@ -886,6 +886,34 @@ brw_emit_select_pipeline(struct brw_context *brw, enum > brw_pipeline pipeline) >> >> brw->ctx.NewDriverState |= BRW_NEW_CC_STATE; >>} >> + >> + } else if (brw->gen >= 6) { >> + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] >> + * PIPELINE_SELECT [DevBWR+]": > > Can we cite the public docs? > The public docs for PIPELINE_SELECT seemed rather inaccurate. The IVB version I have in front of me right now is missing this one workaround, and the BDW version mentions it incorrectly. Sigh... >> + * >> + * Project: DEVSNB+ >> + * >> + * Software must ensure all the write caches are flushed through a >> + * stalling PIPE_CONTROL command followed by another PIPE_CONTROL >> + * command to invalidate read only caches prior to programming >> + * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. >> + */ >> + const unsigned dc_flush = >> + brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0; > > I was going to suggest doing a brw_emit_post_sync_nonzero_flush first > on Sandybridge, but I forgot that we now just emit that at the start > of every state upload. 
Fairly moot anyway since we don't do GPGPU on > Sandybridge anyway. > Hmm, that sounds very sensible to me, it would be rather fragile for this function to rely on a flush with post-sync op having been done previously, even if at this point this will only be called once at context creation on SNB -- Although for the same reason it seems rather fragile for brw_emit_pipe_control_flush() to assume that the workaround has been applied already. I'd be inclined to change brw_emit_pipe_control_flush() to emit the post-sync op when needed on SNB just like we do for other PIPE_CONTROL workarounds on Gen7 and Gen8. >> + >> + brw_emit_pipe_control_flush(brw, >> + PIPE_CONTROL_RENDER_TARGET_FLUSH | >> + PIPE_CONTROL_DEPTH_CACHE_FLUSH | >> + dc_flush | >> + PIPE_CONTROL_NO_WRITE | >> + PIPE_CONTROL_CS_STALL); > > Why RENDER_TARGET_FLUSH, DEPTH_CACHE_FLUSH, DATA_CACHE_INVALIDATE, > and NO_WRITE? The cited workaround explains a CS Stall and the RO > invalidations below, but I'm not seeing why the others are needed. > It also says that "software must ensure all the write caches are flushed". >> + >> + brw_emit_pipe_control_flush(brw, >> + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | >> + PIPE_CONTROL_CONST_CACHE_INVALIDATE | >> + PIPE_CONTROL_STATE_CACHE_INVALIDATE | >> + PIPE_CONTROL_INSTRUCTION_INVALIDATE | >> + PIPE_CONTROL_NO_WRITE); >> } >> >> /* Select the pipeline */ >> signature.asc Description: PGP signature ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
Am 03.01.2016 um 21:32 schrieb Ilia Mirkin: > On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin wrote: >> On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger >> wrote: > For the series (with the first point addressed either way,though a tgsi > exec implementation which should be trivial wouldn't hurt neither) > Reviewed-by: Roland Scheidegger Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I won't be able to enable the cap since it will still use gallivm by default for vertices. I have a gallivm implementation as well, but it hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or another, but it must have been on another box with a more recent LLVM. >>> >>> Ah right. f16 conversion is pretty annoying indeed, though I'd hope the >>> helpers for that should work. In any case, I only really suggested that >>> because I'd thought it would be trivial, so if it's not I don't consider >>> that important... >> >> I'll send it out as a separate series, including my (semi?) broken >> gallivm impl and leave it to you to fix it if you care, or ignore if >> you don't. (I already have it, so might as well...) I understand >> neither how LLVM works, nor how gallivm uses LLVM, which isn't a great >> combination :) > > And of course the piglits expect out-of-bounds numbers to be > represented as infinities, instead of the clamped value This is, imho, a bug, they should allow both. Because round-towards-zero when converting is allowed by GL when converting floats to half, albeit round-to-nearest-even is preferred. And the former gets you the clamped values. > which is what util_float_to_half does :( Yep. The reason both the util and gallivm code do round-towards zero is that for such conversions GL allows both, but d3d10 is deeply unhappy if you do round-toward-nearest-even (for float to float conversions), at least for the clamp vs. infinite issue. 
As per the data conversion rules: https://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx Although there are no specific half-float conversion instructions in d3d10 (only in d3d11), render target conversions etc. must honor these rules too. I suspect most hw can do both without too much fuss (x86 f16c certainly can). Roland ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
On Sun, Jan 3, 2016 at 7:51 PM, Roland Scheidegger wrote: > Am 03.01.2016 um 21:32 schrieb Ilia Mirkin: >> On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin wrote: >>> On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger >>> wrote: >> For the series (with the first point addressed either way,though a tgsi >> exec implementation which should be trivial wouldn't hurt neither) >> Reviewed-by: Roland Scheidegger > > Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I > won't be able to enable the cap since it will still use gallivm by > default for vertices. I have a gallivm implementation as well, but it > hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or > another, but it must have been on another box with a more recent LLVM. Ah right. f16 conversion is pretty annoying indeed, though I'd hope the helpers for that should work. In any case, I only really suggested that because I'd thought it would be trivial, so if it's not I don't consider that important... >>> >>> I'll send it out as a separate series, including my (semi?) broken >>> gallivm impl and leave it to you to fix it if you care, or ignore if >>> you don't. (I already have it, so might as well...) I understand >>> neither how LLVM works, nor how gallivm uses LLVM, which isn't a great >>> combination :) >> >> And of course the piglits expect out-of-bounds numbers to be >> represented as infinities, instead of the clamped value > > This is, imho, a bug, they should allow both. Because round-towards-zero > when converting is allowed by GL when converting floats to half, albeit > round-to-nearest-even is preferred. And the former gets you the clamped > values. > >> which is what util_float_to_half does :( > Yep. The reason both the util and gallivm code do round-towards zero is > that for such conversions GL allows both, but d3d10 is deeply unhappy if > you do round-toward-nearest-even (for float to float conversions), at > least for the clamp vs. infinite issue. 
As per the data conversion > rules: > https://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx > Albeit there's no specific half float conversion instructions in d3d10 > (but in d3d11), render target conversions etc. must honor these rules too. > I suspect most hw can do both without too much fuzz (x86 f16c certainly > can). Take it up with people who aren't me :) http://cgit.freedesktop.org/mesa/mesa/tree/src/glsl/lower_packing_builtins.cpp#n990 FWIW the f32 -> f16 opcode this maps to on nvc0 has the same behaviour. Now it also has rounding mode flags which I don't set and perhaps one of them would yield the behaviour that you're talking about, but I don't know offhand how to get it. Curiously from the PTX ISA docs: "Conversions to floating-point that are beyond the range of floating-point numbers are represented with the maximum floating-point value (IEEE 754 Inf for f32 and f64, and ~131,000 for f16)." If you get the piglit tests changed, I guess I'll poke around. -ilia ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 2/2] WIP tgsi: add PK2H/UP2H support
Am 03.01.2016 um 22:29 schrieb Ilia Mirkin: > It seems like there's something horribly wrong with the > util_float_to_half function. In a standalone compilation it works fine > for -6.10203e-05, generating 0x8400, but inside mesa it ends up with > 0x8000. The result of the magic.f multiplication is 0. No idea why. Ahh that's easy. Because we switch off denorms (on x86...). (Look at util_fpstate_set_denorms_to_zero) And this number would be _just_ below the smallest normal number - the float mul would produce a denorm, which gets flushed to zero. That said, this is wrong (the gallivm code will not hit this issue, as it uses all int math, pretty much because of that; it should be noted that if you actually hit those denorms it's going to be dead slow on a lot of intel cpus). Should probably switch the util code to some other method for conversion as well. d3d10 says you must flush denorms to zero, and I can't see any reason why you'd want them for gl graphics either; however, conversion to/from f16 is an exception - denorm numbers must be correctly represented (at least for d3d10 - wouldn't be surprised if GL doesn't care). So it's a pretty minor issue (albeit really should be addressed at some point). The magic mul method is somewhat elegant, but that's a limitation it has. Roland > > Signed-off-by: Ilia Mirkin > --- > > See above comments for why I didn't include this at all. Should you figure out > what was going wrong, note that this can only be enabled in softpipe if the > whole pipeline is using tgsi_exec (or if the gallivm patch is > fixed/upstreamed). > > Feel free to take this over and/or modify as necessary. 
> > src/gallium/auxiliary/tgsi/tgsi_exec.c | 44 > ++-- > src/gallium/drivers/softpipe/sp_screen.c | 2 +- > 2 files changed, 43 insertions(+), 3 deletions(-) > > diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c > b/src/gallium/auxiliary/tgsi/tgsi_exec.c > index f67c162..12a477b 100644 > --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c > +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c > @@ -58,6 +58,7 @@ > #include "tgsi/tgsi_parse.h" > #include "tgsi/tgsi_util.h" > #include "tgsi_exec.h" > +#include "util/u_half.h" > #include "util/u_memory.h" > #include "util/u_math.h" > > @@ -3058,6 +3059,45 @@ exec_dp2(struct tgsi_exec_machine *mach, > } > > static void > +exec_pk2h(struct tgsi_exec_machine *mach, > + const struct tgsi_full_instruction *inst) > +{ > + unsigned int chan; > + union tgsi_exec_channel arg[2], dst; > + > + fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, > TGSI_EXEC_DATA_FLOAT); > + fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, > TGSI_EXEC_DATA_FLOAT); > + for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) { > + dst.u[chan] = util_float_to_half(arg[0].f[chan]) | > + (util_float_to_half(arg[1].f[chan]) << 16); > + } > + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { > + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { > + store_dest(mach, &dst, &inst->Dst[0], inst, chan, > TGSI_EXEC_DATA_UINT); > + } > + } > +} > + > +static void > +exec_up2h(struct tgsi_exec_machine *mach, > + const struct tgsi_full_instruction *inst) > +{ > + unsigned int chan; > + union tgsi_exec_channel arg, dst[2]; > + > + fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT); > + for (chan = 0; chan < 4; chan++) { > + dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff); > + dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16); > + } > + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { > + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { > + store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, > TGSI_EXEC_DATA_FLOAT); > + } > + 
} > +} > + > +static void > exec_scs(struct tgsi_exec_machine *mach, > const struct tgsi_full_instruction *inst) > { > @@ -4339,7 +4379,7 @@ exec_instruction( >break; > > case TGSI_OPCODE_PK2H: > - assert (0); > + exec_pk2h(mach, inst); >break; > > case TGSI_OPCODE_PK2US: > @@ -4425,7 +4465,7 @@ exec_instruction( >break; > > case TGSI_OPCODE_UP2H: > - assert (0); > + exec_up2h(mach, inst); >break; > > case TGSI_OPCODE_UP2US: > diff --git a/src/gallium/drivers/softpipe/sp_screen.c > b/src/gallium/drivers/softpipe/sp_screen.c > index e74044b..d4526ef 100644 > --- a/src/gallium/drivers/softpipe/sp_screen.c > +++ b/src/gallium/drivers/softpipe/sp_screen.c > @@ -236,6 +236,7 @@ softpipe_get_param(struct pipe_screen *screen, enum > pipe_cap param) > case PIPE_CAP_CLIP_HALFZ: > case PIPE_CAP_TEXTURE_FLOAT_LINEAR: > case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: > + case PIPE_CAP_TGSI_PACK_HALF_FLOAT: >return 1; > case PIPE_CAP_VERTEXID_NOBASE: >return 0; > @@ -252,7 +253,6 @@ softpipe_get_param(struct pipe_screen *screen, enum > pipe_cap param) >
Re: [Mesa-dev] [PATCH 1/2] WIP gallivm: add support for PK2H/UP2H
Am 03.01.2016 um 22:29 schrieb Ilia Mirkin: > This hits assertion failures on LLVM 3.5 > > Signed-off-by: Ilia Mirkin > --- > > It definitely worked at one point or another, but it might have been with > a later LLVM version and/or on a different CPU. On my i7-920 with LLVM 3.5 > I definitely get assertion errors from inside LLVM. Any interested party > can take this patch over and fix it as they see fit. Or ignore it. Interesting. I wasn't even aware using fptrunc could work at all with f16 type. And from a quick look this was indeed introduced later, I think llvm 3.6 (some backends might still not do it today). There are also llvm.convert.to.fp16 (and f32) operations (probably the same backends won't do them either...). I'm not really sure what rounding mode semantics they'll end up with. Seems like fptrunc actually might do round-to-nearest-even (I suppose llvm.convert.to.fp16 too), but depending on how llvm ends up doing it it might well be subject to the same no-denorm issue as the util code. (And unfortunately, it looks like we don't have any direct control over rounding mode for them either, so we can't ditch lp_build_float_to_smallfloat and lp_build_smallfloat_to_float.) 
Roland > > src/gallium/auxiliary/gallivm/lp_bld_tgsi.c| 1 - > src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 87 > ++ > src/gallium/drivers/llvmpipe/lp_screen.c | 2 +- > 3 files changed, 88 insertions(+), 2 deletions(-) > > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c > b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c > index c88dfbf..1cbe47c 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c > +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c > @@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm( > /* Ignore deprecated instructions */ > switch (inst->Instruction.Opcode) { > > - case TGSI_OPCODE_UP2H: > case TGSI_OPCODE_UP2US: > case TGSI_OPCODE_UP4B: > case TGSI_OPCODE_UP4UB: > diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c > b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c > index 3d5e2cb..ac3298d 100644 > --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c > +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c > @@ -1020,6 +1020,88 @@ static void dfrac_emit( > emit_data->args[0], > tmp, ""); > } > > +static void > +pk2h_fetch_args( > + struct lp_build_tgsi_context * bld_base, > + struct lp_build_emit_data * emit_data) > +{ > + /* src0.x */ > + emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, > +0, TGSI_CHAN_X); > + /* src0.y */ > + emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, > +0, TGSI_CHAN_Y); > +} > + > +static void > +emit_pk2h(const struct lp_build_tgsi_action *action, > + struct lp_build_tgsi_context *bld_base, > + struct lp_build_emit_data *emit_data) > +{ > + LLVMBuilderRef builder = bld_base->base.gallivm->builder; > + LLVMContextRef context = bld_base->base.gallivm->context; > + struct lp_build_context *uint_bld = &bld_base->uint_bld; > + LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context), > + bld_base->base.type.length); > + LLVMTypeRef i16 = LLVMVectorType(LLVMInt16TypeInContext(context), > +bld_base->base.type.length); > + LLVMValueRef const16 = 
lp_build_const_vec(uint_bld->gallivm, > uint_bld->type, > + 16); > + > + LLVMValueRef low = LLVMBuildFPTrunc( > + builder, emit_data->args[0], fp16, ""); > + LLVMValueRef high = LLVMBuildFPTrunc( > + builder, emit_data->args[1], fp16, ""); > + > + low = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, low, i16, ""), > + uint_bld->vec_type, ""); > + high = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, high, i16, ""), > +uint_bld->vec_type, ""); > + > + emit_data->output[emit_data->chan] = > + LLVMBuildOr(builder, low, LLVMBuildShl(builder, high, const16, ""), > ""); > +} > + > +static void > +up2h_fetch_args( > + struct lp_build_tgsi_context * bld_base, > + struct lp_build_emit_data * emit_data) > +{ > + /* src0.x */ > + emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, > +0, TGSI_CHAN_X); > +} > + > +static void > +emit_up2h(const struct lp_build_tgsi_action *action, > + struct lp_build_tgsi_context *bld_base, > + struct lp_build_emit_data *emit_data) > +{ > + LLVMBuilderRef builder = bld_base->base.gallivm->builder; > + LLVMContextRef context = bld_base->base.gallivm->context; > + struct lp_build_context *uint_bld = &bld_base->uint_bld; > + LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context), > +
Re: [Mesa-dev] [PATCH 1/2] WIP gallivm: add support for PK2H/UP2H
On Sun, Jan 3, 2016 at 8:37 PM, Roland Scheidegger wrote: > Am 03.01.2016 um 22:29 schrieb Ilia Mirkin: >> This hits assertion failures on LLVM 3.5 >> >> Signed-off-by: Ilia Mirkin >> --- >> >> It definitely worked at one point or another, but it might have been with >> a later LLVM version and/or on a different CPU. On my i7-920 with LLVM 3.5 >> I definitely get assertion errors from inside LLVM. Any interested party >> can take this patch over and fix it as they see fit. Or ignore it. > > Interesting. I wasn't even aware using fptrunc could work at all with > f16 type. And on some quick look this was indeed introduced later, I > think llvm 3.6 (some backends might still not do it today). There are > also llvm.convert.to.fp16 (and f32) operations (probably the same > backends won't do them neither...). I'm not really sure what rounding > mode semantics they'll end up with. Seems like fptrunc actually might do > round-to-nearest-even (I suppose llvm.convert.to.fp16 too), but > depending on how llvm ends up doing it it might well be subject to the > same no-denorm issue as the util code. > (And unfortunately, it looks like we don't have any direct control over > rounding mode neither for them so we can't ditch > lp_build_float_to_smallfloat and lp_build_smallfloat_to_float.) My (admittedly faint) recollection is that this passed the existing piglit tests on a Haswell CPU and I guess at least LLVM 3.6 or maybe even 3.7 (not at the machine right now). But depending on CPU different code might be emitted of course. I wasn't aware of the lp_build_float_to_smallfloat stuff. I don't plan on pursuing this patch further, if you're interested, feel free to redo it. This whole series was mostly about me _really_ hating the code that mesa lowered the half-float pack/unpack into, not any actual performance thing. I don't think that we're aware of a single usage of these builtins outside of piglit. 
Curiously I saw that GRID Autosport makes use of f32tof16 (and back) functions, but they're locally-defined as uint packfp32(in float fp32) { uint result; uint temp = floatBitsToUint(fp32); result = ((temp & 0x80000000u) >> 16) | (((temp & 0x7fffffffu) >> 13) - (0x38000000u >> 13)); return result; } Which later is used as: r9.y = uintBitsToFloat(uint(0x3f800000)); r0.x = uintBitsToFloat(uint(r9.y)); r0.y = uintBitsToFloat(f32tof16(r0.y)); r2.w = intBitsToFloat(bfi(floatBitsToInt(r0.x), floatBitsToInt(r0.y), int(16), int(16))); I guess it was too much trouble to use packHalf2x16(r0.xy) [and it *appears* that they forgot to f32tof16 r0.x... this all gets stored off into a ssbo and presumably reused somewhere, so can't tell if it was intended]. -ilia ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H
Am 04.01.2016 um 02:05 schrieb Ilia Mirkin: > On Sun, Jan 3, 2016 at 7:51 PM, Roland Scheidegger wrote: >> Am 03.01.2016 um 21:32 schrieb Ilia Mirkin: >>> On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin wrote: On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger wrote: >>> For the series (with the first point addressed either way,though a tgsi >>> exec implementation which should be trivial wouldn't hurt neither) >>> Reviewed-by: Roland Scheidegger >> >> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I >> won't be able to enable the cap since it will still use gallivm by >> default for vertices. I have a gallivm implementation as well, but it >> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or >> another, but it must have been on another box with a more recent LLVM. > > Ah right. f16 conversion is pretty annoying indeed, though I'd hope the > helpers for that should work. In any case, I only really suggested that > because I'd thought it would be trivial, so if it's not I don't consider > that important... I'll send it out as a separate series, including my (semi?) broken gallivm impl and leave it to you to fix it if you care, or ignore if you don't. (I already have it, so might as well...) I understand neither how LLVM works, nor how gallivm uses LLVM, which isn't a great combination :) >>> >>> And of course the piglits expect out-of-bounds numbers to be >>> represented as infinities, instead of the clamped value >> >> This is, imho, a bug, they should allow both. Because round-towards-zero >> when converting is allowed by GL when converting floats to half, albeit >> round-to-nearest-even is preferred. And the former gets you the clamped >> values. >> >>> which is what util_float_to_half does :( >> Yep. 
The reason both the util and gallivm code do round-towards zero is >> that for such conversions GL allows both, but d3d10 is deeply unhappy if >> you do round-toward-nearest-even (for float to float conversions), at >> least for the clamp vs. infinite issue. As per the data conversion >> rules: >> https://urldefense.proofpoint.com/v2/url?u=https-3A__msdn.microsoft.com_en-2Dus_library_windows_desktop_dd607323-2528v-3Dvs.85-2529.aspx&d=BQIBaQ&c=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-YihVMNtXt-uEs&r=Vjtt0vs_iqoI31UfJxBl7yv9I2FeiaeAYgMTLKRBc_I&m=Gd7OrjAeguJzGQHAmmnWwz-_ok3_P7HVdfP1UqlD06w&s=c9EvJslgDjJWgBgsKb_VdSLRtbWWq30XqYi0689ilkQ&e= >> >> Albeit there's no specific half float conversion instructions in d3d10 >> (but in d3d11), render target conversions etc. must honor these rules too. >> I suspect most hw can do both without too much fuzz (x86 f16c certainly >> can). > > Take it up with people who aren't me :) > http://cgit.freedesktop.org/mesa/mesa/tree/src/glsl/lower_packing_builtins.cpp#n990 Yes, it is actually imho somewhat surprising intel gpus can't even do the round-toward-zero behavior natively (meaning they'd most likely have to emulate that one way or the other for the d3d10 driver). > FWIW the f32 -> f16 opcode this maps to on nvc0 has the same > behaviour. Now it also has rounding mode flags which I don't set and > perhaps one of them would yield the behaviour that you're talking > about, but I don't know offhand how to get it. Curiously from the PTX > ISA docs: "Conversions to floating-point that are beyond the range of > floating-point numbers are represented with the maximum floating-point > value (IEEE 754 Inf for f32 and f64, and ~131,000 for f16)." Yes that's somewhat odd. imho if you set round-towards-zero you should get the maxf value. With round-to-nearest(-even) you should get the infinities. This is per standard ieee754 rules. 
Getting something like round-to-nearest but the overflowing values still clamped to maxf (or vice versa) doesn't really make all that much sense, if that is what's somehow implied by this paragraph. > > If you get the piglit tests changed, I guess I'll poke around. Hmm quite some python code, so I probably don't have time to dig into that. Albeit what I can tell is the rounding mode functions inside gen_builtin_packing_tests.py are (to me) somewhat confusingly named, "round to nearest" and "round to even" - both are round to nearest, one is just "round to nearest_even" the other is really "round to nearest_away_from_zero". But really, glsl just says "The rounding mode cannot be set and is undefined." And that is true for ALL operations. The section also says though for "implicit and explicit conversions between types - Correctly rounded". It is possible to interpret that as meaning that while the rounding mode is undefined, it must be consistent for all operations (in which case it would indeed not be legal to do ordinary arithmetic with round-to-nearest but packHalf2x16 with round-toward-zero). If that's true, we need some way to distinguish between the two possible float->half conversions in gallium, which sounds like quite
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #6 from Icenowy Zheng --- My CPU (Cortex-A7) *does* have NEON. All A7 cores have NEON. I used llvm-3.7, and I'm now building llvm-svn (using https://github.com/llvm-mirror/llvm). Note: you can run glmark2 and compare the image to the one that I uploaded. -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 6/6] i965/gen7.5+: Disable resource streamer during GPGPU workloads.
On 01/03/2016 08:48 AM, Francisco Jerez wrote: > The RS and hardware binding tables are only supported on the 3D > pipeline and can lead to corruption if left enabled during a GPGPU > workload. Disable it when switching to the GPGPU (or media) pipeline > and re-enable it when switching back to the 3D pipeline. Yep, this is the way it is. Reviewed-by: Abdiel Janulgue > --- > src/mesa/drivers/dri/i965/brw_binding_tables.c | 2 +- > src/mesa/drivers/dri/i965/brw_misc_state.c | 38 > ++ > src/mesa/drivers/dri/i965/brw_state.h | 1 + > 3 files changed, 40 insertions(+), 1 deletion(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c > b/src/mesa/drivers/dri/i965/brw_binding_tables.c > index 80935cf..5c5aa0e 100644 > --- a/src/mesa/drivers/dri/i965/brw_binding_tables.c > +++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c > @@ -364,7 +364,7 @@ gen7_disable_hw_binding_tables(struct brw_context *brw) > /** > * Enable hardware binding tables and set up the binding table pool. > */ > -static void > +void > gen7_enable_hw_binding_tables(struct brw_context *brw) > { > if (!brw->use_resource_streamer) > diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c > b/src/mesa/drivers/dri/i965/brw_misc_state.c > index 2263604..7e68838 100644 > --- a/src/mesa/drivers/dri/i965/brw_misc_state.c > +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c > @@ -868,6 +868,25 @@ brw_emit_select_pipeline(struct brw_context *brw, enum > brw_pipeline pipeline) > const uint32_t _3DSTATE_PIPELINE_SELECT = >is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45; > > + if (brw->use_resource_streamer && pipeline != BRW_RENDER_PIPELINE) { > + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] > + * PIPELINE_SELECT [DevBWR+]": > + * > + * Project: HSW, BDW, CHV, SKL, BXT > + * > + * Hardware Binding Tables are only supported for 3D workloads. > Resource > + * streamer must be enabled only for 3D workloads. Resource streamer > + * must be disabled for Media and GPGPU workloads. 
> + */ > + BEGIN_BATCH(1); > + OUT_BATCH(MI_RS_CONTROL | 0); > + ADVANCE_BATCH(); > + > + gen7_disable_hw_binding_tables(brw); > + > + /* XXX - Disable gather constant pool too when we start using it. */ > + } > + > if (brw->gen >= 8) { >/* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] > * PIPELINE_SELECT [DevBWR+]": > @@ -959,6 +978,25 @@ brw_emit_select_pipeline(struct brw_context *brw, enum > brw_pipeline pipeline) >OUT_BATCH(0); >ADVANCE_BATCH(); > } > + > + if (brw->use_resource_streamer && pipeline == BRW_RENDER_PIPELINE) { > + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] > + * PIPELINE_SELECT [DevBWR+]": > + * > + * Project: HSW, BDW, CHV, SKL, BXT > + * > + * Hardware Binding Tables are only supported for 3D workloads. > Resource > + * streamer must be enabled only for 3D workloads. Resource streamer > + * must be disabled for Media and GPGPU workloads. > + */ > + BEGIN_BATCH(1); > + OUT_BATCH(MI_RS_CONTROL | 1); > + ADVANCE_BATCH(); > + > + gen7_enable_hw_binding_tables(brw); > + > + /* XXX - Re-enable gather constant pool here. */ > + } > } > > /** > diff --git a/src/mesa/drivers/dri/i965/brw_state.h > b/src/mesa/drivers/dri/i965/brw_state.h > index d29b997..7d61b7c 100644 > --- a/src/mesa/drivers/dri/i965/brw_state.h > +++ b/src/mesa/drivers/dri/i965/brw_state.h > @@ -396,6 +396,7 @@ void gen7_update_binding_table_from_array(struct > brw_context *brw, >gl_shader_stage stage, >const uint32_t* binding_table, >int num_surfaces); > +void gen7_enable_hw_binding_tables(struct brw_context *brw); > void gen7_disable_hw_binding_tables(struct brw_context *brw); > void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw); > > ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] mesa: use gl_shader_variable in program resource list
On 12/30/2015 09:53 PM, Marek Olšák wrote: On Mon, Nov 2, 2015 at 10:12 AM, Tapani Pälli wrote: On 11/02/2015 09:16 AM, Ilia Mirkin wrote: On Mon, Nov 2, 2015 at 1:58 AM, Tapani Pälli wrote: Patch changes linker to allocate gl_shader_variable instead of using ir_variable. This makes it possible to get rid of ir_variables and ir in memory after linking. v2: check that we do not create duplicate entries with packed varyings Signed-off-by: Tapani Pälli --- src/glsl/linker.cpp| 58 +++--- src/mesa/main/mtypes.h | 56 src/mesa/main/shader_query.cpp | 36 +- 3 files changed, 123 insertions(+), 27 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index 48dd2d3..d0353b4 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -3341,6 +3341,27 @@ build_stageref(struct gl_shader_program *shProg, const char *name, return stages; } +/** + * Create gl_shader_variable from ir_variable class. + */ +static gl_shader_variable * +create_shader_variable(struct gl_shader_program *shProg, const ir_variable *in) +{ + gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable); + if (!out) + return NULL; + + out->type = in->type; + out->name = ralloc_strdup(shProg, in->name); This can fail too, right? Might be nice to error-check. Thanks, static analysis might complain about this, will fix. 
+ + out->location = in->data.location; + out->index = in->data.index; + out->patch = in->data.patch; + out->mode = in->data.mode; + + return out; +} + static bool add_interface_variables(struct gl_shader_program *shProg, exec_list *ir, GLenum programInterface) @@ -3392,9 +3413,13 @@ add_interface_variables(struct gl_shader_program *shProg, if (strncmp(var->name, "gl_out_FragData", 15) == 0) continue; - if (!add_program_resource(shProg, programInterface, var, -build_stageref(shProg, var->name, - var->data.mode) | mask)) + gl_shader_variable *sha_v = create_shader_variable(shProg, var); + if (!sha_v) + return false; + + if (!add_program_resource(shProg, programInterface, sha_v, +build_stageref(shProg, sha_v->name, + sha_v->mode) | mask)) return false; } return true; @@ -3422,9 +3447,14 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage) default: unreachable("unexpected type"); } - if (!add_program_resource(shProg, iface, var, - build_stageref(shProg, var->name, - var->data.mode))) + + gl_shader_variable *sha_v = create_shader_variable(shProg, var); + if (!sha_v) +return false; + + if (!add_program_resource(shProg, iface, sha_v, + build_stageref(shProg, sha_v->name, + sha_v->mode))) return false; } } @@ -3443,7 +3473,12 @@ add_fragdata_arrays(struct gl_shader_program *shProg) ir_variable *var = node->as_variable(); if (var) { assert(var->data.mode == ir_var_shader_out); - if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, var, + + gl_shader_variable *sha_v = create_shader_variable(shProg, var); + if (!sha_v) +return false; + + if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, sha_v, 1 << MESA_SHADER_FRAGMENT)) return false; } @@ -3723,8 +3758,13 @@ build_program_resource_list(struct gl_shader_program *shProg) if (shProg->SeparateShader) { if (!add_packed_varyings(shProg, input_stage)) return; - if (!add_packed_varyings(shProg, output_stage)) - return; + /* Only when dealing with multiple stages, otherwise we would have + * duplicate 
gl_shader_variable entries. + */ + if (input_stage != output_stage) { + if (!add_packed_varyings(shProg, output_stage)) +return; + } } if (!add_fragdata_arrays(shProg)) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index d6c1eb8..0316769 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2519,6 +2519,62 @@ struct gl_active_atomic_buffer }; /** + * Data container for shader queries. This holds only the minimal + * amount of required information for resource queries to work. + */ +struct gl_shader_variable +{ + /** +* Declared type of the variable +*/ + const struct glsl_type *type; + + /** +* Declared name of the variable +*/ + char *name; + + /**
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #7 from Icenowy Zheng --- In addition, my DDX driver is https://github.com/ssvb/xf86-video-fbturbo, which uses NEON to accelerate operations such as bitblit. -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)
https://bugs.freedesktop.org/show_bug.cgi?id=93570 --- Comment #8 from Icenowy Zheng --- It seems that the answer to the question above is "No". I changed to the original fbdev DDX, and the problem is still there. Is there any way to dump the binary code generated by LLVM? -- You are receiving this mail because: You are the QA Contact for the bug. You are the assignee for the bug. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] mesa: do not validate io of non-compute and compute stage
Fixes regression on SSO tests that have both non-compute and compute programs in a program pipeline. Signed-off-by: Tapani Pälli Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93532 --- src/mesa/main/shader_query.cpp | 7 +++ 1 file changed, 7 insertions(+) diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index e526119..570acfa 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -1496,6 +1496,13 @@ _mesa_validate_pipeline_io(struct gl_pipeline_object *pipeline) for (idx = prev + 1; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) { if (shProg[idx]) { + /* Pipeline might include both non-compute and a compute program, do + * not attempt to validate varyings between non-compute and compute + * stage. + */ + if (shProg[idx]->_LinkedShaders[idx]->Stage == MESA_SHADER_COMPUTE) +break; + if (!validate_io(shProg[prev]->_LinkedShaders[prev], shProg[idx]->_LinkedShaders[idx], shProg[prev]->IsES || shProg[idx]->IsES)) -- 2.5.0 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3] mesa: use gl_shader_variable in program resource list
Patch changes linker to allocate gl_shader_variable instead of using ir_variable. This makes it possible to get rid of ir_variables and ir in memory after linking. v2: check that we do not create duplicate entries with packed varyings v3: document 'patch' bit (Ilia Mirkin) Signed-off-by: Tapani Pälli --- src/glsl/linker.cpp| 61 +++--- src/mesa/main/mtypes.h | 61 ++ src/mesa/main/shader_query.cpp | 38 +- 3 files changed, 132 insertions(+), 28 deletions(-) diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index a6e81b4..45daa12 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -3373,6 +3373,30 @@ build_stageref(struct gl_shader_program *shProg, const char *name, return stages; } +/** + * Create gl_shader_variable from ir_variable class. + */ +static gl_shader_variable * +create_shader_variable(struct gl_shader_program *shProg, const ir_variable *in) +{ + gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable); + if (!out) + return NULL; + + out->type = in->type; + out->name = ralloc_strdup(shProg, in->name); + + if (!out->name) + return NULL; + + out->location = in->data.location; + out->index = in->data.index; + out->patch = in->data.patch; + out->mode = in->data.mode; + + return out; +} + static bool add_interface_variables(struct gl_shader_program *shProg, exec_list *ir, GLenum programInterface) @@ -3424,9 +3448,13 @@ add_interface_variables(struct gl_shader_program *shProg, if (strncmp(var->name, "gl_out_FragData", 15) == 0) continue; - if (!add_program_resource(shProg, programInterface, var, -build_stageref(shProg, var->name, - var->data.mode) | mask)) + gl_shader_variable *sha_v = create_shader_variable(shProg, var); + if (!sha_v) + return false; + + if (!add_program_resource(shProg, programInterface, sha_v, +build_stageref(shProg, sha_v->name, + sha_v->mode) | mask)) return false; } return true; @@ -3454,9 +3482,14 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage) default: unreachable("unexpected type"); } - if 
(!add_program_resource(shProg, iface, var, - build_stageref(shProg, var->name, - var->data.mode))) + + gl_shader_variable *sha_v = create_shader_variable(shProg, var); + if (!sha_v) +return false; + + if (!add_program_resource(shProg, iface, sha_v, + build_stageref(shProg, sha_v->name, + sha_v->mode))) return false; } } @@ -3475,7 +3508,12 @@ add_fragdata_arrays(struct gl_shader_program *shProg) ir_variable *var = node->as_variable(); if (var) { assert(var->data.mode == ir_var_shader_out); - if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, var, + + gl_shader_variable *sha_v = create_shader_variable(shProg, var); + if (!sha_v) +return false; + + if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, sha_v, 1 << MESA_SHADER_FRAGMENT)) return false; } @@ -3726,8 +3764,13 @@ build_program_resource_list(struct gl_shader_program *shProg) if (shProg->SeparateShader) { if (!add_packed_varyings(shProg, input_stage)) return; - if (!add_packed_varyings(shProg, output_stage)) - return; + /* Only when dealing with multiple stages, otherwise we would have + * duplicate gl_shader_variable entries. + */ + if (input_stage != output_stage) { + if (!add_packed_varyings(shProg, output_stage)) +return; + } } if (!add_fragdata_arrays(shProg)) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 5b9fce8..c9fe728 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2525,6 +2525,67 @@ struct gl_active_atomic_buffer }; /** + * Data container for shader queries. This holds only the minimal + * amount of required information for resource queries to work. + */ +struct gl_shader_variable +{ + /** +* Declared type of the variable +*/ + const struct glsl_type *type; + + /** +* Declared name of the variable +*/ + char *name; + + /** +* Storage location of the base of this variable +* +* The precise meaning of this field depends on the nature of the variable. +* +* - Vertex shader input: one of the values from \c gl_vert_attrib. 
+* - Vertex shader output: one of the values from \c gl_varying_slot. +* -