--- src/intel/compiler/brw_fs_bank_conflicts.cpp | 274 ++++++++++++++++++--------- 1 file changed, 188 insertions(+), 86 deletions(-)
diff --git a/src/intel/compiler/brw_fs_bank_conflicts.cpp b/src/intel/compiler/brw_fs_bank_conflicts.cpp index 0225c70..dc88cac 100644 --- a/src/intel/compiler/brw_fs_bank_conflicts.cpp +++ b/src/intel/compiler/brw_fs_bank_conflicts.cpp @@ -51,9 +51,6 @@ #include "brw_fs.h" #include "brw_cfg.h" -#include <vector> -#include <array> - #ifdef __SSE2__ #include <emmintrin.h> @@ -72,7 +69,9 @@ namespace { /** * SIMD integer vector data type. */ - typedef std::array<__m128i, 2> vector_type; + struct vector_type { + __m128i v[2]; + }; /** * Scalar data type matching the representation of a single component of \p @@ -88,8 +87,7 @@ namespace { /** * Number of components of a \p vector_type. */ - const unsigned vector_width = 2 * sizeof(vector_type::value_type) / - sizeof(scalar_type); + const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type); /** * Set the i-th component of vector \p v to \p x. @@ -98,7 +96,7 @@ namespace { set(vector_type &v, unsigned i, scalar_type x) { assert(i < vector_width); - memcpy((char *)v.data() + i * sizeof(x), &x, sizeof(x)); + memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x)); } /** @@ -109,7 +107,7 @@ namespace { { assert(i < vector_width); scalar_type x; - memcpy(&x, (char *)v.data() + i * sizeof(x), sizeof(x)); + memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x)); return x; } @@ -119,10 +117,10 @@ namespace { vector_type adds(const vector_type &v, const vector_type &w) { - const vector_type u = { - _mm_adds_epi16(v[0], w[0]), - _mm_adds_epi16(v[1], w[1]) - }; + const vector_type u = {{ + _mm_adds_epi16(v.v[0], w.v[0]), + _mm_adds_epi16(v.v[1], w.v[1]) + }}; return u; } @@ -132,10 +130,10 @@ namespace { vector_type subs(const vector_type &v, const vector_type &w) { - const vector_type u = { - _mm_subs_epi16(v[0], w[0]), - _mm_subs_epi16(v[1], w[1]) - }; + const vector_type u = {{ + _mm_subs_epi16(v.v[0], w.v[0]), + _mm_subs_epi16(v.v[1], w.v[1]) + }}; return u; } @@ -145,10 +143,10 @@ namespace { vector_type mask(const vector_type &v, const vector_type &w) { - const vector_type u = { - _mm_and_si128(v[0], w[0]), - _mm_and_si128(v[1], w[1]) - }; + const vector_type u = {{ + _mm_and_si128(v.v[0], w.v[0]), + _mm_and_si128(v.v[1], w.v[1]) + }}; return u; } @@ -158,7 +156,7 @@ namespace { scalar_type sums(const vector_type &v) { - const __m128i v8 = _mm_adds_epi16(v[0], v[1]); + const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]); const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e)); const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1)); const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1)); @@ -225,7 +223,7 @@ namespace { vector_type adds(vector_type v, vector_type w) { - return std::max(INT16_MIN, std::min(INT16_MAX, int(v) + w)); + return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w)); } /** @@ -234,7 +232,7 @@ namespace { vector_type subs(vector_type v, vector_type w) { - return std::max(INT16_MIN, std::min(INT16_MAX, int(v) - w)); + return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w)); } /** @@ -258,6 +256,15 @@ namespace { #endif +/** + * Swap \p x and \p y. + */ +#define SWAP(x, y) do { \ + __typeof(y) _swap_tmp = y; \ + y = x; \ + x = _swap_tmp; \ + } while (0) + namespace { /** * Variable-length vector type intended to represent cycle-count costs for @@ -267,7 +274,37 @@ namespace { * atoms are assigned the same bank b or opposite-parity banks b and b^1). * \sa shader_conflict_weight_matrix() */ - typedef std::vector<vector_type> weight_vector_type; + struct weight_vector_type { + weight_vector_type() : v(NULL), size(0) {} + + weight_vector_type(unsigned n) : + v(new vector_type[DIV_ROUND_UP(n, vector_width)]()), + size(n) {} + + weight_vector_type(const weight_vector_type &u) : + v(new vector_type[DIV_ROUND_UP(u.size, vector_width)]()), + size(u.size) + { + memcpy(v, u.v, + DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type)); + } + + ~weight_vector_type() + { + delete[] v; + } + + weight_vector_type & + operator=(weight_vector_type u) + { + SWAP(v, u.v); + SWAP(size, u.size); + return *this; + } + + vector_type *v; + unsigned size; + }; /** * Set the (i, p)-th component of weight vector \p v to \p x. @@ -275,7 +312,7 @@ namespace { void set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x) { - set(v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x); + set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x); } /** @@ -284,7 +321,7 @@ namespace { scalar_type get(const weight_vector_type &v, unsigned i, unsigned p) { - return get(v[(2 * i + p) / vector_width], (2 * i + p) % vector_width); + return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width); } /** @@ -316,13 +353,43 @@ namespace { * Create a (for the moment unrestricted) partitioning of a register * file of size \p n. The units are arbitrary. */ - partitioning(unsigned n) { + partitioning(unsigned n) : + max_reg(n), + offsets(new unsigned[n + num_terminator_atoms]), + atoms(new unsigned[n + num_terminator_atoms]) + { for (unsigned i = 0; i < n + num_terminator_atoms; i++) { - offsets.push_back(i); - atoms.push_back(i); + offsets[i] = i; + atoms[i] = i; } } + partitioning(const partitioning &p) : + max_reg(p.max_reg), + offsets(new unsigned[p.num_atoms() + num_terminator_atoms]), + atoms(new unsigned[p.max_reg + num_terminator_atoms]) + { + memcpy(offsets, p.offsets, + sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms)); + memcpy(atoms, p.atoms, + sizeof(unsigned) * (p.max_reg + num_terminator_atoms)); + } + + ~partitioning() + { + delete[] offsets; + delete[] atoms; + } + + partitioning & + operator=(partitioning p) + { + SWAP(max_reg, p.max_reg); + SWAP(offsets, p.offsets); + SWAP(atoms, p.atoms); + return *this; + } + /** * Require register range [reg, reg + n[ to be considered part of the * same atom. @@ -336,7 +403,7 @@ namespace { * case that the specified contiguity requirement leads to the fusion * (yay) of one or more existing atoms. */ - for (unsigned reg1 = reg + 1; reg1 < atoms.size(); reg1++) { + for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) { if (offsets[atoms[reg1]] < reg + n) { atoms[reg1] = r; } else { @@ -347,11 +414,6 @@ namespace { atoms[reg1] = r; } } - - /* Clean up the scraps if we ended up with less atoms than we started - * with. - */ - offsets.erase(offsets.begin() + r + 1, offsets.end()); } /** @@ -388,7 +450,7 @@ namespace { unsigned num_atoms() const { - return offsets.size() - num_terminator_atoms; + return atoms[max_reg]; } private: @@ -398,8 +460,9 @@ namespace { * size_of_atom(). */ static const unsigned num_terminator_atoms = 1; - std::vector<unsigned> offsets; - std::vector<unsigned> atoms; + unsigned max_reg; + unsigned *offsets; + unsigned *atoms; }; /** @@ -455,10 +518,10 @@ namespace { * Return the set of GRF atoms that should be left untouched at their * original location to avoid violating hardware or software assumptions. */ - std::vector<bool> + bool * shader_reg_constraints(const fs_visitor *v, const partitioning &p) { - std::vector<bool> constrained(p.num_atoms()); + bool *constrained = new bool[p.num_atoms()](); /* These are read implicitly by some send-message instructions without * any indication at the IR level. Assume they are unsafe to move @@ -520,12 +583,13 @@ namespace { * meantime optimizing based on Gen9 weights is likely to be more * helpful than not optimizing at all. */ - std::vector<weight_vector_type> + weight_vector_type * shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p) { - std::vector<weight_vector_type> conflicts(p.num_atoms(), - weight_vector_type(DIV_ROUND_UP(2 * p.num_atoms(), - vector_width))); + weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()]; + for (unsigned r = 0; r < p.num_atoms(); r++) + conflicts[r] = weight_vector_type(2 * p.num_atoms()); + /* Crude approximation of the number of times the current basic block * will be executed at run-time. */ @@ -575,8 +639,8 @@ namespace { * between atoms r and s. Note that the weight matrix is * symmetric with respect to indices r and s by construction. */ - const scalar_type w = std::min(unsigned(max_scalar), - get(conflicts[r], s, p) + cycle_scale); + const scalar_type w = MIN2(unsigned(max_scalar), + get(conflicts[r], s, p) + cycle_scale); set(conflicts[r], s, p, w); set(conflicts[s], r, p, w); } @@ -592,14 +656,16 @@ namespace { * the specified \p conflicts matrix (\sa * shader_conflict_weight_matrix()). */ - std::vector<bool> - have_any_conflicts(const std::vector<weight_vector_type> &conflicts) + bool * + have_any_conflicts(const partitioning &p, + const weight_vector_type *conflicts) { - std::vector<bool> any_conflicts(conflicts.size()); + bool *any_conflicts = new bool[p.num_atoms()](); - for (unsigned r = 0; r < conflicts.size(); r++) { - for (unsigned s = 0; s < conflicts[r].size(); s++) - any_conflicts[r] = any_conflicts[r] || sums(conflicts[r][s]); + for (unsigned r = 0; r < p.num_atoms(); r++) { + const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width); + for (unsigned s = 0; s < m; s++) + any_conflicts[r] |= sums(conflicts[r].v[s]); } return any_conflicts; @@ -627,27 +693,60 @@ namespace { const weight_vector_type &bank_mask_n, const weight_vector_type &conflicts) { + const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width); vector_type s_p = {}, s_n = {}; - for (unsigned r = 0; r < conflicts.size(); r++) { - s_p = adds(s_p, mask(bank_mask_p[r], conflicts[r])); - s_n = adds(s_n, mask(bank_mask_n[r], conflicts[r])); + for (unsigned r = 0; r < m; r++) { + s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r])); + s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r])); } return sums(subs(s_p, s_n)); } /** - * Return an identity permutation of GRF atoms, represented as the start GRF - * offset each atom is mapped into. + * Register atom permutation, represented as the start GRF offset each atom + * is mapped into. + */ + struct permutation { + permutation() : v(NULL), size(0) {} + + permutation(unsigned n) : + v(new unsigned[n]()), size(n) {} + + permutation(const permutation &p) : + v(new unsigned[p.size]), size(p.size) + { + memcpy(v, p.v, p.size * sizeof(unsigned)); + } + + ~permutation() + { + delete[] v; + } + + permutation & + operator=(permutation p) + { + SWAP(v, p.v); + SWAP(size, p.size); + return *this; + } + + unsigned *v; + unsigned size; + }; + + /** + * Return an identity permutation of GRF atoms. */ - std::vector<unsigned> + permutation identity_reg_permutation(const partitioning &p) { - std::vector<unsigned> map(p.num_atoms()); + permutation map(p.num_atoms()); - for (unsigned r = 0; r < map.size(); r++) - map[r] = p.reg_of_atom(r); + for (unsigned r = 0; r < map.size; r++) + map.v[r] = p.reg_of_atom(r); return map; } @@ -671,18 +770,18 @@ namespace { * characteristic function of each bank, if you regard it as a set * containing all atoms assigned to it according to the \p map array. */ - std::array<weight_vector_type, 4> - bank_characteristics(const std::vector<unsigned> &map) + weight_vector_type * + bank_characteristics(const permutation &map) { - std::array<weight_vector_type, 4> banks; + weight_vector_type *banks = new weight_vector_type[4]; - for (unsigned b = 0; b < banks.size(); b++) { - banks[b].resize(DIV_ROUND_UP(2 * map.size(), vector_width)); + for (unsigned b = 0; b < 4; b++) { + banks[b] = weight_vector_type(2 * map.size); - for (unsigned j = 0; j < map.size(); j++) { + for (unsigned j = 0; j < map.size; j++) { for (unsigned p = 0; p < 2; p++) set(banks[b], j, p, - (b ^ p) == bank_of(map[j]) ? -1 : 0); + (b ^ p) == bank_of(map.v[j]) ? -1 : 0); } } @@ -697,24 +796,24 @@ namespace { * may allow it to do a better job in some cases -- It simply reorders * existing atoms in the GRF space without affecting their identity. */ - std::vector<unsigned> + permutation optimize_reg_permutation(const partitioning &p, - const std::vector<bool> &constrained, - const std::vector<weight_vector_type> &conflicts, - std::vector<unsigned> map) + const bool *constrained, + const weight_vector_type *conflicts, + permutation map) { - const std::vector<bool> any_conflicts = have_any_conflicts(conflicts); - std::array<weight_vector_type, 4> banks = bank_characteristics(map); + const bool *any_conflicts = have_any_conflicts(p, conflicts); + weight_vector_type *banks = bank_characteristics(map); - for (unsigned r = 0; r < map.size(); r++) { - const unsigned bank_r = bank_of(map[r]); + for (unsigned r = 0; r < map.size; r++) { + const unsigned bank_r = bank_of(map.v[r]); if (!constrained[r]) { unsigned best_s = r; int best_benefit = 0; - for (unsigned s = 0; s < map.size(); s++) { - const unsigned bank_s = bank_of(map[s]); + for (unsigned s = 0; s < map.size; s++) { + const unsigned bank_s = bank_of(map.v[s]); if (bank_r != bank_s && !constrained[s] && p.size_of_atom(r) == p.size_of_atom(s) && @@ -731,16 +830,18 @@ namespace { } if (best_s != r) { - for (unsigned b = 0; b < banks.size(); b++) { + for (unsigned b = 0; b < 4; b++) { for (unsigned p = 0; p < 2; p++) swap(banks[b], r, p, best_s, p); } - std::swap(map[r], map[best_s]); + SWAP(map.v[r], map.v[best_s]); } } } + delete[] banks; + delete[] any_conflicts; return map; } @@ -749,13 +850,12 @@ namespace { * return the result. */ fs_reg - transform(const partitioning &p, const std::vector<unsigned> &map, - fs_reg r) + transform(const partitioning &p, const permutation &map, fs_reg r) { if (r.file == VGRF) { const unsigned reg = reg_of(r); const unsigned s = p.atom_of_reg(reg); - r.nr = map[s] + reg - p.reg_of_atom(s); + r.nr = map.v[s] + reg - p.reg_of_atom(s); r.offset = r.offset % REG_SIZE; } @@ -773,10 +873,10 @@ fs_visitor::opt_bank_conflicts() return false; const partitioning p = shader_reg_partitioning(this); - const std::vector<bool> constrained = shader_reg_constraints(this, p); - const std::vector<weight_vector_type> conflicts = + const bool *constrained = shader_reg_constraints(this, p); + const weight_vector_type *conflicts = shader_conflict_weight_matrix(this, p); - const std::vector<unsigned> map = + const permutation map = optimize_reg_permutation(p, constrained, conflicts, identity_reg_permutation(p)); @@ -787,5 +887,7 @@ fs_visitor::opt_bank_conflicts() inst->src[i] = transform(p, map, inst->src[i]); } + delete[] conflicts; + delete[] constrained; return true; } -- 2.10.2 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev