https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109811
JuzheZhong <juzhe.zhong at rivai dot ai> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |juzhe.zhong at rivai dot ai
--- Comment #7 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
(In reply to Jan Hubicka from comment #6)
> hottest loop in clang's profile is:
> for (size_t y = 0; y < opsin.ysize(); y++) {
> for (size_t x = 0; x < opsin.xsize(); x++) {
> if (is_background_row[y * is_background_stride + x]) continue;
> cc.clear();
> stack.clear();
> stack.emplace_back(x, y);
> size_t min_x = x;
> size_t max_x = x;
> size_t min_y = y;
> size_t max_y = y;
> std::pair<uint32_t, uint32_t> reference;
> bool found_border = false;
> bool all_similar = true;
> while (!stack.empty()) {
> std::pair<uint32_t, uint32_t> cur = stack.back();
> stack.pop_back();
> if (visited_row[cur.second * visited_stride + cur.first]) continue;
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> closed by this continue.
> visited_row[cur.second * visited_stride + cur.first] = 1;
> if (cur.first < min_x) min_x = cur.first;
> if (cur.first > max_x) max_x = cur.first;
> if (cur.second < min_y) min_y = cur.second;
> if (cur.second > max_y) max_y = cur.second;
> if (paint_ccs) {
> cc.push_back(cur);
> }
> for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) {
> for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) {
> if (dx == 0 && dy == 0) continue;
> int next_first = static_cast<int32_t>(cur.first) + dx;
> int next_second = static_cast<int32_t>(cur.second) + dy;
> if (next_first < 0 || next_second < 0 ||
> static_cast<uint32_t>(next_first) >= opsin.xsize() ||
> static_cast<uint32_t>(next_second) >= opsin.ysize()) {
> continue;
> }
> std::pair<uint32_t, uint32_t> next{next_first, next_second};
> if (!is_background_row[next.second * is_background_stride +
> next.first]) {
> stack.push_back(next);
> } else {
> if (!found_border) {
> reference = next;
> found_border = true;
> } else {
> if (!is_similar_b(next, reference)) all_similar = false;
> }
> }
> }
> }
> }
> if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize ||
> max_y - min_y >= kMaxPatchSize) {
> continue;
> }
> size_t bpos = background_stride * reference.second + reference.first;
> float ref[3] = {background_rows[0][bpos], background_rows[1][bpos],
> background_rows[2][bpos]};
> bool has_similar = false;
> for (size_t iy = std::max<int>(
> static_cast<int32_t>(min_y) - kHasSimilarRadius, 0);
> iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize());
> iy++) {
> for (size_t ix = std::max<int>(
> static_cast<int32_t>(min_x) - kHasSimilarRadius, 0);
> ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize());
> ix++) {
> size_t opos = opsin_stride * iy + ix;
> float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos],
> opsin_rows[2][opos]};
> if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) {
> has_similar = true;
> }
> }
> }
> if (!has_similar) continue;
> info.emplace_back();
> info.back().second.emplace_back(min_x, min_y);
> QuantizedPatch& patch = info.back().first;
> patch.xsize = max_x - min_x + 1;
> patch.ysize = max_y - min_y + 1;
> int max_value = 0;
> for (size_t c : {1, 0, 2}) {
> for (size_t iy = min_y; iy <= max_y; iy++) {
> for (size_t ix = min_x; ix <= max_x; ix++) {
> size_t offset = (iy - min_y) * patch.xsize + ix - min_x;
> patch.fpixels[c][offset] =
> opsin_rows[c][iy * opsin_stride + ix] - ref[c];
> int val = pci.Quantize(patch.fpixels[c][offset], c);
> patch.pixels[c][offset] = val;
> if (std::abs(val) > max_value) max_value = std::abs(val);
> }
> }
> }
> if (max_value < kMinPeak) {
> info.pop_back();
> continue;
> }
> if (paint_ccs) {
> float cc_color = rng.UniformF(0.5, 1.0);
> for (std::pair<uint32_t, uint32_t> p : cc) {
> ccs.Row(p.second)[p.first] = cc_color;
> }
> }
> }
> }
>
> I guess such a large loop nest with hottest loop not being the innermost is
> bad for register pressure.
> Clangs code is :
> 0.02 │1196:┌─→cmp %r10,-0xb8(%rbp)
> ▒
> │ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&,
> jxl::PassesEncoderState*, JxlCmsInterface co▒
> │ │while (!stack.empty()) {
> ◆
> 1.39 │ │↓ je 1690
> ▒
> │ │std::pair<uint32_t, uint32_t> cur = stack.back();
> ▒
> │11a3:│ mov -0x8(%r10),%rbx
> ▒
> 9.80 │ │ mov -0x230(%rbp),%r14
> ▒
> │ │__gnu_cxx::__normal_iterator<std::pair<unsigned int, unsigned
> int>*, std::vector<std::pair<unsigned ▒
> │ │{ return __normal_iterator(_M_current - __n); }
> ▒
> 0.86 │ │ add $0xfffffffffffffff8,%r10
> ▒
> │ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&,
> jxl::PassesEncoderState*, JxlCmsInterface co▒
> 0.36 │ │ mov %rbx,%r13
> ▒
> 0.01 │ │ shr $0x20,%r13
> ▒
> │ │if (visited_row[cur.second * visited_stride + cur.first])
> continue; ▒
> 2.69 │ │ mov %ebx,%eax
> ▒
> 0.00 │ │ mov %r13,%rcx
> ▒
> 1.12 │ │ imul -0x180(%rbp),%rcx
> ▒
> 0.78 │ │ add %rax,%rcx
> ▒
> 2.73 │ ├──cmpb $0x0,(%r14,%rcx,1)
> ▒
> 19.91 │ └──jne 1196
> ▒
>
> While GCC is longer:
> Percent│ Percent│ while (!stack.empty()) {
> ▒
> │1241:┌─→cmp %rax,-0x370(%rbp)
> ▒
> 0.70 │ │↓ je 1481
> ▒
> │ │std::pair<uint32_t, uint32_t> cur = stack.back();
> ▒
> 0.02 │124e:│ mov -0x8(%rax),%rdx
> ▒
> │ │std::vector<std::pair<unsigned int, unsigned int>,
> std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒
> │ │--this->_M_impl._M_finish;
> ▒
> 0.63 │ │ sub $0x8,%rax
> ▒
> 0.74 │ │ mov %rax,-0x368(%rbp)
> ▒
> │ │FindTextLikePatches():
> ▒
> 0.01 │ │ mov %edx,%esi
> ◆
> 0.35 │ │ mov %rdx,%rdi
> ▒
> 0.82 │ │ mov %rdx,-0x490(%rbp)
> ▒
> │ │if (visited_row[cur.second * visited_stride + cur.first])
> continue; ▒
> 0.00 │ │ mov -0x5f8(%rbp),%rdx
> ▒
> │ │std::pair<uint32_t, uint32_t> cur = stack.back();
> ▒
> 1.32 │ │ shr $0x20,%rdi
> ▒
> 0.11 │ │ mov %rsi,%r15
> ▒
> 0.09 │ │ mov %edi,%ecx
> ▒
> │ │if (visited_row[cur.second * visited_stride + cur.first])
> continue; ▒
> 0.72 │ │ mov -0x5f0(%rbp),%rdi
> ▒
> │ │std::pair<uint32_t, uint32_t> cur = stack.back();
> ▒
> 0.03 │ │ mov %rcx,%r12
> ▒
> │ │if (visited_row[cur.second * visited_stride + cur.first])
> continue; ▒
> 0.39 │ │ imul %rcx,%rdx
> ▒
> 1.00 │ │ add %rsi,%rdx
> ▒
> 1.35 │ │ add %rdi,%rdx
> ▒
> 1.37 │ ├──cmpb $0x0,(%rdx)
> ▒
> 9.56 │ └──jne 1241
> ▒ while (!stack.empty()) {
> ▒
> │1241:┌─→cmp %rax,-0x370(%rbp)
> ▒
> 0.70 │ │↓ je 1481
> ▒
> │ │std::pair<uint32_t, uint32_t> cur = stack.back();
> ▒
> 0.02 │124e:│ mov -0x8(%rax),%rdx
> ▒
> │ │std::vector<std::pair<unsigned int, unsigned int>,
> std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒
> │ │--this->_M_impl._M_finish;
> ▒
> 0.63 │ │ sub $0x8,%rax
> ▒
> 0.74 │ │ mov %rax,-0x368(%rbp)
> ▒
> │ │FindTextLikePatches():
> ▒
> 0.01 │ │ mov %edx,%esi
> ◆
> 0.35 │ │ mov %rdx,%rdi
> ▒
> 0.82 │ │ mov %rdx,-0x490(%rbp)
> ▒
> │ │if (visited_row[cur.second * visited_stride + cur.first])
> continue; ▒
> 0.00 │ │ mov -0x5f8(%rbp),%rdx
> ▒
> │ │std::pair<uint32_t, uint32_t> cur = stack.back();
> ▒
> 1.32 │ │ shr $0x20,%rdi
> ▒
> 0.11 │ │ mov %rsi,%r15
> ▒
> 0.09 │ │ mov %edi,%ecx
> ▒
> │ │if (visited_row[cur.second * visited_stride + cur.first])
> continue; ▒
> 0.72 │ │ mov -0x5f0(%rbp),%rdi
> ▒
> │ │std::pair<uint32_t, uint32_t> cur = stack.back();
> ▒
> 0.03 │ │ mov %rcx,%r12
> ▒
> │ │if (visited_row[cur.second * visited_stride + cur.first])
> continue; ▒
> 0.39 │ │ imul %rcx,%rdx
> ▒
> 1.00 │ │ add %rsi,%rdx
> ▒
> 1.35 │ │ add %rdi,%rdx
> ▒
> 1.37 │ ├──cmpb $0x0,(%rdx)
> ▒
> 9.56 │ └──jne 1241
> ▒
>
>
> Moreover we have heuristics saying that continue usually closes loops with
> low trip count.
> -fno-ivopts improves performance with --num_threads=0 however makes number
> of instructions worse. Curiously enought with --num_threads=1 and more
> ivopts is benefical.
Hi, thanks for the analysis.
It seems that Clang has better performance than GCC in case of no vectorizer?
What about with vectorizer ?
Well, we have a lot testing benchmark between our downstream RISCV Clang vs
GCC.
>From my observation, Clang do much better than GCC in case of scalar code.
For example, we found in mlperf, Clang 40% better than GCC when specifying
no-vectorizer. However, when enabling vectorization, gcc does better than clang
overall from my observation.
Thanks