From: Connor Abbott <cwabbo...@gmail.com> --- src/compiler/nir/nir.h | 7 +++ src/compiler/nir/nir_intrinsics.h | 124 +++++++++++++++++++++++++++++++++++++- src/compiler/nir/nir_print.c | 1 + 3 files changed, 129 insertions(+), 3 deletions(-)
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 24934f0..4b5d78e 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -1076,6 +1076,12 @@ typedef enum { */ NIR_INTRINSIC_INTERP_MODE = 9, + /* + * Constant mask, invocation, swizzle, etc. used for a subgroup operation. + * The precise meaning depends on the intrinsic. + */ + NIR_INTRINSIC_SUBGROUP_DATA = 10, + NIR_INTRINSIC_NUM_INDEX_FLAGS, } nir_intrinsic_index_flag; @@ -1144,6 +1150,7 @@ INTRINSIC_IDX_ACCESSORS(desc_set, DESC_SET, unsigned) INTRINSIC_IDX_ACCESSORS(binding, BINDING, unsigned) INTRINSIC_IDX_ACCESSORS(component, COMPONENT, unsigned) INTRINSIC_IDX_ACCESSORS(interp_mode, INTERP_MODE, unsigned) +INTRINSIC_IDX_ACCESSORS(subgroup_data, SUBGROUP_DATA, unsigned) /** * \group texture information diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h index 72c4296..32f52aa 100644 --- a/src/compiler/nir/nir_intrinsics.h +++ b/src/compiler/nir/nir_intrinsics.h @@ -93,12 +93,24 @@ INTRINSIC(get_buffer_size, 1, ARR(1), true, 1, 0, 0, xx, xx, xx, * the comment for NIR_INTRINSIC_CONVERGENT in nir.h for details. */ #define CONVERGENT(name, num_srcs, src0_components, src1_components, \ - dest_components) \ - INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \ + src2_components, dest_components) \ + INTRINSIC(name, num_srcs, ARR(src0_components, src1_components, \ + src2_components), \ true, dest_components, 0, 0, xx, xx, xx, \ NIR_INTRINSIC_CAN_REORDER | NIR_INTRINSIC_CAN_ELIMINATE | \ NIR_INTRINSIC_CONVERGENT) +/* + * Similar to CONVERGENT, except optimizations can assume that the intrinsic is + * only called in uniform control flow. + */ +#define UNIFORM_CONTROL(name, num_srcs, src0_components, src1_components, \ + dest_components) \ + INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \ + true, dest_components, 0, 0, xx, xx, xx, \ + NIR_INTRINSIC_CAN_REORDER | NIR_INTRINSIC_CAN_ELIMINATE | \ + NIR_INTRINSIC_UNIFORM_CONTROL) + BARRIER(barrier) BARRIER(discard) @@ -126,7 +138,7 @@ INTRINSIC(shader_clock, 0, ARR(0), true, 2, 0, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ * GLSL functions from ARB_shader_ballot. */ CROSS_THREAD(ballot, 1, 1, 0, 1) -CONVERGENT(read_invocation, 2, 0, 1, 0) +CONVERGENT(read_invocation, 2, 0, 1, 0, 0) CROSS_THREAD(read_first_invocation, 1, 0, 0, 0) /* @@ -148,6 +160,105 @@ CROSS_THREAD(vote_any, 1, 1, 0, 1) CROSS_THREAD(vote_all, 1, 1, 0, 1) CROSS_THREAD(vote_eq, 1, 1, 0, 1) +/* AMD_shader_ballot intrinsics */ + +/* + * This is like CROSS_THREAD for instructions that communicate across an entire + * workgroup, instead of just the subgroup. + */ + +#define CROSS_SUBGROUP(name, num_srcs, src0_components, src1_components, \ + dest_components) \ + INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \ + true, dest_components, 0, 0, xx, xx, xx, \ + NIR_INTRINSIC_CAN_ELIMINATE) + +#define CROSS_SUBGROUP_UNIFORM(name, num_srcs, src0_components, \ + src1_components, dest_components) \ + INTRINSIC(name, num_srcs, ARR(src0_components, src1_components), \ + true, dest_components, 0, 0, xx, xx, xx, \ + NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_UNIFORM_CONTROL) + +#define REDUCE_OP(name, postfix) \ + UNIFORM_CONTROL(subgroup_##name##postfix, 1, 0, 0, 0) \ + CROSS_SUBGROUP(subgroup_##name##postfix##_nonuniform, 1, 0, 0, 0) \ + CROSS_SUBGROUP_UNIFORM(group_##name##postfix, 1, 0, 0, 0) \ + CROSS_SUBGROUP(group_##name##postfix##_nonuniform, 1, 0, 0, 0) \ + +#define GROUP_SUBGROUP_REDUCE(name) \ + REDUCE_OP(name, ) \ + REDUCE_OP(name, _inclusive_scan) \ + REDUCE_OP(name, _exclusive_scan) + +GROUP_SUBGROUP_REDUCE(iadd) +GROUP_SUBGROUP_REDUCE(fadd) +GROUP_SUBGROUP_REDUCE(fmin) +GROUP_SUBGROUP_REDUCE(umin) +GROUP_SUBGROUP_REDUCE(imin) +GROUP_SUBGROUP_REDUCE(fmax) +GROUP_SUBGROUP_REDUCE(umax) +GROUP_SUBGROUP_REDUCE(imax) + +#undef GROUP_SUBGROUP_REDUCE +#undef REDUCE_OP + +/* Analogous to vote_any and vote_all, but works across an entire workgroup. + * Also, this version can only be called in uniform control flow. + */ +CROSS_SUBGROUP_UNIFORM(group_any, 1, 1, 0, 1) +CROSS_SUBGROUP_UNIFORM(group_all, 1, 1, 0, 1) + +/* Similarly, this is analogous to read_invocation but works across an entire + * workgroup and can only be called in uniform control flow. Unlike + * read_invocation, the second argument is a 3-dimensional local ID instead of + * a simple linear index. + */ +CROSS_SUBGROUP_UNIFORM(group_broadcast, 2, 1, 3, 1) + +#define CROSS_THREAD_WITH_DATA(name, src_components, dest_components) \ + INTRINSIC(name, 1, ARR(src_components), \ + true, dest_components, 0, 1, SUBGROUP_DATA, xx, xx, \ + NIR_INTRINSIC_CAN_REORDER | NIR_INTRINSIC_CAN_ELIMINATE | \ + NIR_INTRINSIC_CROSS_THREAD) + +/* The index is is interpreted as a swizzle value, with bits [2*i:2*i+1] + * giving the lane within the quad that should be stored in the i'th lane of + * the quad. Inactive lanes return 0. That is, the computation is: + * + * for (i = 0; i < SubgroupSize; i++) { + * for (j = 0; j < 4; j++) { + * out[i + j] = is_active[i + swizzle[2*j:2*j+1]] ? in[i + swizzle[2*j:2*j+1]] : 0; + * } + * } + */ +CROSS_THREAD_WITH_DATA(quad_swizzle_amd, 0, 0) + +/* + * Implements AMD's swizzle-masked intrinsic. The mask is interpreted as 3 + * 5-bit integers. Implements the following, taken from AMD_shader_ballot: + * + * for (i = 0; i < SubgroupSize; i++) { + * j = (((i & 0x1f) & data[0:4]) | data[5:9]) ^ data[10:14]; + * j |= i & 0x20; + * out[i] = is_active[j] ? in[j] : 0; + */ +CROSS_THREAD_WITH_DATA(masked_swizzle_amd, 0, 0) + +/* + * The opposite of read_invocation - return the first argument, except for the + * second argument in the invocation given by the third argument. The second + * and third arguments must be dynamically uniform within the subgroup. + */ +CONVERGENT(write_invocation, 3, 0, 0, 1, 0) + +/* + * Implements the AMD mbcnt instruction. Returns: + * + * bitCount(gl_SubgroupLtMask & src0) + */ +INTRINSIC(mbcnt_amd, 1, ARR(1), true, 1, 0, 0, xx, xx, xx, + NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) + /** * Basic Geometry Shader intrinsics. * @@ -387,6 +498,13 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_rgba8888_unorm, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_aaaa8888_unorm, 1, 0, xx, xx, xx) +#define CROSS_THREAD_UNIFORM(name, dest_components, src_components) \ + INTRINSIC(name, 1, ARR(src_components), true, dest_components, 0, 0, \ + xx, xx, xx, \ + NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER | \ + NIR_INTRINSIC_CROSS_THREAD | NIR_INTRINSIC_CONVERGENT) + + /** * Barycentric coordinate intrinsics. * diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index f4811fe..b9cb686 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -595,6 +595,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) [NIR_INTRINSIC_BINDING] = "binding", [NIR_INTRINSIC_COMPONENT] = "component", [NIR_INTRINSIC_INTERP_MODE] = "interp_mode", + [NIR_INTRINSIC_SUBGROUP_DATA] = "subgroup_data", }; for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) { if (!info->index_map[idx]) -- 2.9.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev