https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104275

            Bug ID: 104275
           Summary: Os does not apply return value optimization while O2
                    and O3 does
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: andij.cr at gmail dot com
  Target Milestone: ---

Tested with GCC 8 through GCC 11.

An identity function (mark) interposed in a call stack that ends in a complex
type is reasonably elided at -O2 and -O3, but at -Os it produces somewhat
strange assembly.
Tested on arm32 and x86_64.
For a less artificial example where the problem still appears, see:
https://gcc.godbolt.org/z/GbKrGKa6f

code:

https://godbolt.org/z/v95jEvvzc

// Condensed result of a constexpr transformation.
// In this form, it would be nice if it were transparent to the value.
//
// mark: identity function that hands its argument straight back.
// `head` is a forwarding reference, and the decltype(auto) return type keeps
// the reference type of static_cast<Ts&&>(head) (the same cast std::forward<Ts>
// performs). For an rvalue argument the result is therefore an rvalue
// reference to that same object, not a new object.
template <typename Ts>
auto mark(Ts&& head) noexcept -> decltype(auto) {
    return static_cast<Ts&&>(head);
}

#include <vector>
// Generic producer of a complex type.
// Only declared here, with no definition in this snippet, so the compiler
// cannot see its body when compiling the callers below.
auto generate() -> std::vector<double>;


// Here is a stack of functions using mark.
// Each layer wraps the result of the previous layer in one more mark() call.
// The return type is plain `auto`, so every function returns a
// std::vector<double> by value, initialized from the rvalue reference that
// mark() hands back.
namespace {
// In an anonymous namespace to nudge the compiler to inline them.
auto user_base() { return mark(generate()); }
auto user_mark() { return mark(user_base()); }
auto user_mark2() { return mark(user_mark()); }
auto user_mark3() { return mark(user_mark2()); }
}  // namespace

// Top of the stack, defined outside the anonymous namespace; this is the
// function whose assembly is shown in the listings that follow.
// This function has normal assembly at -O2 and -O3 (one call to generate()
// plus three 8-byte copies into the result), but a silly one at -Os (the same
// call followed by five calls to the _Vector_base destructor).
auto user_mark4() { return mark(user_mark3()); }


compiled with 
-std=c++17 -O2

user_mark4():
        push    r12
        mov     r12, rdi
        sub     rsp, 32
        mov     rdi, rsp
        call    generate()
        mov     rax, QWORD PTR [rsp]
        mov     QWORD PTR [r12], rax
        mov     rax, QWORD PTR [rsp+8]
        mov     QWORD PTR [r12+8], rax
        mov     rax, QWORD PTR [rsp+16]
        mov     QWORD PTR [r12+16], rax
        add     rsp, 32
        mov     rax, r12
        pop     r12
        ret

compiled with
-std=c++17 -Os 
user_mark4():
        push    r13
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 40
        lea     rdi, [rsp+8]
        call    generate()
        lea     rdi, [rsp+8]
        mov     r13, QWORD PTR [rsp+8]
        mov     rbp, QWORD PTR [rsp+16]
        mov     QWORD PTR [rsp+8], 0
        mov     rbx, QWORD PTR [rsp+24]
        mov     QWORD PTR [rsp+16], 0
        mov     QWORD PTR [rsp+24], 0
        call    std::_Vector_base<double, std::allocator<double>
>::~_Vector_base() [base object destructor]
        lea     rdi, [rsp+8]
        mov     QWORD PTR [rsp+24], 0
        mov     QWORD PTR [rsp+16], 0
        mov     QWORD PTR [rsp+8], 0
        call    std::_Vector_base<double, std::allocator<double>
>::~_Vector_base() [base object destructor]
        lea     rdi, [rsp+8]
        mov     QWORD PTR [rsp+24], 0
        mov     QWORD PTR [rsp+16], 0
        mov     QWORD PTR [rsp+8], 0
        call    std::_Vector_base<double, std::allocator<double>
>::~_Vector_base() [base object destructor]
        lea     rdi, [rsp+8]
        mov     QWORD PTR [rsp+24], 0
        mov     QWORD PTR [rsp+16], 0
        mov     QWORD PTR [rsp+8], 0
        call    std::_Vector_base<double, std::allocator<double>
>::~_Vector_base() [base object destructor]
        mov     QWORD PTR [r12], r13
        lea     rdi, [rsp+8]
        mov     QWORD PTR [r12+8], rbp
        mov     QWORD PTR [r12+16], rbx
        mov     QWORD PTR [rsp+24], 0
        mov     QWORD PTR [rsp+16], 0
        mov     QWORD PTR [rsp+8], 0
        call    std::_Vector_base<double, std::allocator<double>
>::~_Vector_base() [base object destructor]
        add     rsp, 40
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        ret

Reply via email to