On Thu, Jul 6, 2017 at 4:48 PM, Matt Turner <matts...@gmail.com> wrote: > We already had a channel_num system value, which I'm renaming to > subgroup_invocation to match the rest of the new system values. > > Note that while ballotARB(true) will return zeros in the high 32-bits on > systems where gl_SubGroupSizeARB <= 32, the gl_SubGroup??MaskARB > variables do not consider whether channels are enabled. See issue (1) of > ARB_shader_ballot. > --- > src/compiler/nir/nir.c | 4 ++++ > src/compiler/nir/nir_intrinsics.h | 8 +++++++- > src/compiler/nir/nir_lower_system_values.c | 28 ++++++++++++++++++++++++++++ > src/intel/compiler/brw_fs_nir.cpp | 2 +- > src/intel/compiler/brw_nir_intrinsics.c | 4 ++-- > 5 files changed, 42 insertions(+), 4 deletions(-) > > diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c > index 491b908396..9827e129ca 100644 > --- a/src/compiler/nir/nir.c > +++ b/src/compiler/nir/nir.c > @@ -1908,6 +1908,10 @@ nir_intrinsic_from_system_value(gl_system_value val) > return nir_intrinsic_load_helper_invocation; > case SYSTEM_VALUE_VIEW_INDEX: > return nir_intrinsic_load_view_index; > + case SYSTEM_VALUE_SUBGROUP_SIZE: > + return nir_intrinsic_load_subgroup_size; > + case SYSTEM_VALUE_SUBGROUP_INVOCATION: > + return nir_intrinsic_load_subgroup_invocation; > default: > unreachable("system value does not directly correspond to intrinsic"); > } > diff --git a/src/compiler/nir/nir_intrinsics.h > b/src/compiler/nir/nir_intrinsics.h > index 6c6ba4cf59..96ecfbc338 100644 > --- a/src/compiler/nir/nir_intrinsics.h > +++ b/src/compiler/nir/nir_intrinsics.h > @@ -344,10 +344,16 @@ SYSTEM_VALUE(work_group_id, 3, 0, xx, xx, xx) > SYSTEM_VALUE(user_clip_plane, 4, 1, UCP_ID, xx, xx) > SYSTEM_VALUE(num_work_groups, 3, 0, xx, xx, xx) > SYSTEM_VALUE(helper_invocation, 1, 0, xx, xx, xx) > -SYSTEM_VALUE(channel_num, 1, 0, xx, xx, xx) > SYSTEM_VALUE(alpha_ref_float, 1, 0, xx, xx, xx) > SYSTEM_VALUE(layer_id, 1, 0, xx, xx, xx) > SYSTEM_VALUE(view_index, 1, 0, xx, xx, xx) > +SYSTEM_VALUE(subgroup_size, 1, 0, xx, xx, xx) > +SYSTEM_VALUE(subgroup_invocation, 1, 0, xx, xx, xx) > +SYSTEM_VALUE(subgroup_eq_mask, 1, 0, xx, xx, xx) > +SYSTEM_VALUE(subgroup_ge_mask, 1, 0, xx, xx, xx) > +SYSTEM_VALUE(subgroup_gt_mask, 1, 0, xx, xx, xx) > +SYSTEM_VALUE(subgroup_le_mask, 1, 0, xx, xx, xx) > +SYSTEM_VALUE(subgroup_lt_mask, 1, 0, xx, xx, xx) > > /* Blend constant color values. Float values are clamped. */ > SYSTEM_VALUE(blend_const_color_r_float, 1, 0, xx, xx, xx) > diff --git a/src/compiler/nir/nir_lower_system_values.c > b/src/compiler/nir/nir_lower_system_values.c > index 810100a081..faf0c3c9da 100644 > --- a/src/compiler/nir/nir_lower_system_values.c > +++ b/src/compiler/nir/nir_lower_system_values.c > @@ -116,6 +116,34 @@ convert_block(nir_block *block, nir_builder *b) > nir_load_base_instance(b)); > break; > > + case SYSTEM_VALUE_SUBGROUP_EQ_MASK: > + case SYSTEM_VALUE_SUBGROUP_GE_MASK: > + case SYSTEM_VALUE_SUBGROUP_GT_MASK: > + case SYSTEM_VALUE_SUBGROUP_LE_MASK: > + case SYSTEM_VALUE_SUBGROUP_LT_MASK: { > + nir_ssa_def *count = nir_load_subgroup_invocation(b); > + > + switch (var->data.location) { > + case SYSTEM_VALUE_SUBGROUP_EQ_MASK: > + sysval = nir_ishl(b, nir_imm_int64(b, 1ull), count); > + break; > + case SYSTEM_VALUE_SUBGROUP_GE_MASK: > + sysval = nir_ishl(b, nir_imm_int64(b, ~0ull), count); > + break; > + case SYSTEM_VALUE_SUBGROUP_GT_MASK: > + sysval = nir_ishl(b, nir_imm_int64(b, ~1ull), count); > + break; > + case SYSTEM_VALUE_SUBGROUP_LE_MASK: > + sysval = nir_inot(b, nir_ishl(b, nir_imm_int64(b, ~1ull), > count)); > + break; > + case SYSTEM_VALUE_SUBGROUP_LT_MASK: > + sysval = nir_inot(b, nir_ishl(b, nir_imm_int64(b, ~0ull), > count)); > + break; > + default: > + unreachable("you seriously can't tell this is unreachable?"); > + } > + } > +
While this fine to do for both Intel and AMD, Nvidia actually has special system values for these, and AMD has special instructions for bitCount(foo & gl_SubGroupLtMask), so I think we should have actual nir_load_subgroup_*_mask intrinsics for these. Also, that way you can use the same shrinking logic to turn these into 32-bit shifts on Intel. > default: > break; > } > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index 264398f38e..17f35e081d 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -4075,7 +4075,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, > nir_intrinsic_instr *instr > break; > } > > - case nir_intrinsic_load_channel_num: { > + case nir_intrinsic_load_subgroup_invocation: { > fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW); > dest = retype(dest, BRW_REGISTER_TYPE_UD); > const fs_builder allbld8 = bld.group(8, 0).exec_all(); > diff --git a/src/intel/compiler/brw_nir_intrinsics.c > b/src/intel/compiler/brw_nir_intrinsics.c > index d63570fa2a..abbbc6f93e 100644 > --- a/src/intel/compiler/brw_nir_intrinsics.c > +++ b/src/intel/compiler/brw_nir_intrinsics.c > @@ -88,10 +88,10 @@ lower_cs_intrinsics_convert_block(struct > lower_intrinsics_state *state, > /* We construct the local invocation index from: > * > * gl_LocalInvocationIndex = > - * cs_thread_local_id + channel_num; > + * cs_thread_local_id + subgroup_invocation; > */ > nir_ssa_def *thread_local_id = read_thread_local_id(state); > - nir_ssa_def *channel = nir_load_channel_num(b); > + nir_ssa_def *channel = nir_load_subgroup_invocation(b); > sysval = nir_iadd(b, channel, thread_local_id); > break; > } > -- > 2.13.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev