On 10/14/2018 07:16 PM, Jason Ekstrand wrote: > On Sun, Oct 14, 2018 at 5:12 PM Matt Turner <matts...@gmail.com > <mailto:matts...@gmail.com>> wrote: > > --- > src/compiler/nir/nir.h | 1 + > src/compiler/nir/nir_lower_int64.c | 142 > +++++++++++++++++++++++++++++++++++++ > 2 files changed, 143 insertions(+) > > diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h > index 12cbd030e21..2c477126acc 100644 > --- a/src/compiler/nir/nir.h > +++ b/src/compiler/nir/nir.h > @@ -3001,6 +3001,7 @@ typedef enum { > nir_lower_ineg64 = (1 << 7), > nir_lower_logic64 = (1 << 8), > nir_lower_minmax64 = (1 << 9), > + nir_lower_shift64 = (1 << 10), > } nir_lower_int64_options; > > bool nir_lower_int64(nir_shader *shader, nir_lower_int64_options > options); > diff --git a/src/compiler/nir/nir_lower_int64.c > b/src/compiler/nir/nir_lower_int64.c > index 9cdc8a9d592..25882d3a858 100644 > --- a/src/compiler/nir/nir_lower_int64.c > +++ b/src/compiler/nir/nir_lower_int64.c > @@ -90,6 +90,138 @@ lower_ixor64(nir_builder *b, nir_ssa_def *x, > nir_ssa_def *y) > nir_ixor(b, x_hi, y_hi)); > } > > +static nir_ssa_def * > +lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) > +{ > + /* Implemented as > + * > + * uint64_t lshift(uint64_t x, int c) > + * { > + * if (c == 0) return x; > + * > + * uint32_t lo = LO(x), hi = HI(x); > + * > + * if (c < 32) { > + * uint32_t lo_shifted = lo << (c & 0x1f); > + * uint32_t hi_shifted = hi << (c & 0x1f); > + * uint32_t lo_shifted_hi = lo >> (abs(32 - c) & 0x1f); > > > Why the abs and the &? it's already predicated on c < 32 and negative > or OOB shifts already have undefined results.
I think the & is unnecessary, and I tend towards removing them. The abs() is there so that it's the same expression as the else case. This is useful because the NIR code he generates uses a bcsel instead. Since the NIR code uses bcsel, I feel like the C pseudo-code should use ?:. > + * return pack_64(lo_shifted, hi_shifted | lo_shifted_hi); > + * } else { > + * uint32_t lo_shifted_hi = lo << (abs(32 - c) & 0x1f); > + * return pack_64(0, lo_shifted_hi); > + * } > + * } > + */ > + nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); > + nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); > + > + nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, > nir_imm_int(b, -32))); > > > This is iabs(c - 32) (which yields the same result but isn't the same > expression) and doesn't have the & 0x1f. > > > + nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y); > + nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y); > > > In general, all of the 0x1f are missing. While not having them works on > i965, there's no guarantee it works in general. Maybe we should add > them in and have an i965-specific optimization to delete them again? > Maybe it's ok to just not have them. In any case, the code down here > should match the code above or there should be a very good comment > saying why it doesn't. As long as shifting with a >32 value doesn't make the GPU crash, it should be fine. The values produced from those shifts aren't used in the final result. Right? Explaining that in the comment is a good idea. > + nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count); > + > + nir_ssa_def *res_if_lt_32 = > + nir_pack_64_2x32_split(b, lo_shifted, > + nir_ior(b, hi_shifted, lo_shifted_hi)); > + nir_ssa_def *res_if_ge_32 = > + nir_pack_64_2x32_split(b, nir_imm_int(b, 0), > + nir_ishl(b, x_lo, reverse_count)); > + > + return nir_bcsel(b, > + nir_ieq(b, y, nir_imm_int(b, 0)), x, > + nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)), > + res_if_ge_32, res_if_lt_32)); > +} > + > +static nir_ssa_def * > +lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) > +{ > + /* Implemented as > + * > + * uint64_t arshift(uint64_t x, int c) > + * { > + * if (c == 0) return x; > + * > + * uint32_t lo = LO(x); > + * int32_t hi = HI(x); > + * > + * if (c < 32) { > + * uint32_t lo_shifted = lo >> (c & 0x1f); > + * uint32_t hi_shifted = hi >> (c & 0x1f); > + * uint32_t hi_shifted_lo = hi << (abs(32 - c) & 0x1f); > + * return pack_64(hi_shifted, hi_shifted_lo | lo_shifted); > + * } else { > + * uint32_t hi_shifted = hi >> 31; > + * uint32_t hi_shifted_lo = hi >> (abs(32 - c) & 0x1f); > + * return pack_64(hi_shifted, hi_shifted_lo); > + * } > + * } > + */ > + nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); > + nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); > + > + nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, > nir_imm_int(b, -32))); > + nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y); > + nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y); > + nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count); > + > + nir_ssa_def *res_if_lt_32 = > + nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo), > + hi_shifted); > + nir_ssa_def *res_if_ge_32 = > + nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count), > + nir_ishr(b, x_hi, nir_imm_int(b, 31))); > + > + return nir_bcsel(b, > + nir_ieq(b, y, nir_imm_int(b, 0)), x, > + nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)), > + res_if_ge_32, res_if_lt_32)); > +} > + > +static nir_ssa_def * > +lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) > +{ > + /* Implemented as > + * > + * uint64_t rshift(uint64_t x, int c) > + * { > + * if (c == 0) return x; > + * > + * uint32_t lo = LO(x), hi = HI(x); > + * > + * if (c < 32) { > + * uint32_t lo_shifted = lo >> (c & 0x1f); > + * uint32_t hi_shifted = hi >> (c & 0x1f); > + * uint32_t hi_shifted_lo = hi << (abs(32 - c) & 0x1f); > + * return pack_64(hi_shifted, hi_shifted_lo | lo_shifted); > + * } else { > + * uint32_t hi_shifted_lo = hi >> (abs(32 - c) & 0x1f); > + * return pack_64(0, hi_shifted_lo); > + * } > + * } > + */ > + > + nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); > + nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); > + > + nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, > nir_imm_int(b, -32))); > + nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y); > + nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y); > + nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count); > + > + nir_ssa_def *res_if_lt_32 = > + nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo), > + hi_shifted); > + nir_ssa_def *res_if_ge_32 = > + nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count), > + nir_imm_int(b, 0)); > + > + return nir_bcsel(b, > + nir_ieq(b, y, nir_imm_int(b, 0)), x, > + nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)), > + res_if_ge_32, res_if_lt_32)); > +} > + > static nir_ssa_def * > lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) > { > @@ -430,6 +562,10 @@ opcode_to_options_mask(nir_op opcode) > case nir_op_ixor: > case nir_op_inot: > return nir_lower_logic64; > + case nir_op_ishl: > + case nir_op_ishr: > + case nir_op_ushr: > + return nir_lower_shift64; > default: > return 0; > } > @@ -492,6 +628,12 @@ lower_int64_alu_instr(nir_builder *b, > nir_alu_instr *alu) > return lower_ixor64(b, src[0], src[1]); > case nir_op_inot: > return lower_inot64(b, src[0]); > + case nir_op_ishl: > + return lower_ishl64(b, src[0], src[1]); > + case nir_op_ishr: > + return lower_ishr64(b, src[0], src[1]); > + case nir_op_ushr: > + return lower_ushr64(b, src[0], src[1]); > default: > unreachable("Invalid ALU opcode to lower"); > } > -- > 2.16.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org <mailto:mesa-dev@lists.freedesktop.org> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev