Hi, compile the following 128-bit GCD routine for the AMD64 processor with full optimization:
--- gcdti3.c --- // Stein's algorithm: greatest common divisor __uint128_t __gcdti3(__uint128_t p, __uint128_t q) { unsigned r, s = 0; __uint128_t t; if (p == 0) return q; if (q == 0) return p; if (p == q) return p; if (((unsigned long long) p | (unsigned long long) q) == 0) p >>= 64, q >>= 64, s = 64; r = __builtin_ctzll((unsigned long long) p | (unsigned long long) q), p >>= r, q >>= r, s += r; if ((unsigned long long) p == 0) p >>= 64; r = __builtin_ctzll(p), p >>= r; do { if ((unsigned long long) q == 0) q >>= 64; r = __builtin_ctzll(q), q >>= r; if (p < q) t = q, q = p, p = t; } while (q -= p); return p << s; } --- EOF --- GCC 12.2: gcc -O3 gcdti3.c # https://godbolt.org/z/d1Ma9qnsf __gcdti3(unsigned __int128, unsigned __int128): mov rax, rsi # OOPS: GCCs plays six rounds of shell game! mov r8, rdi # mov rsi, rdi # mov rdi, rax # mov rax, rdx # mov rdx, rcx # mov rcx, rdi or rcx, r8 je .L1 mov rcx, r8 mov r8, rdi xor rcx, rax xor r8, rdx or rcx, r8 je .L9 mov rcx, rax or rcx, rdx je .L9 mov rcx, rsi xor r10d, r10d or rcx, rax jne .L3 mov rsi, rdi mov rax, rdx xor edi, edi xor edx, edx mov rcx, rsi mov r10d, 64 or rcx, rax .L3: rep bsf rcx, rcx xor r8d, r8d # OUCH: BSF and TZCNT return at most 63, shrd rsi, rdi, cl shr rdi, cl test cl, 64 # so this is dead code! cmovne rsi, rdi # cmovne rdi, r8 # shrd rax, rdx, cl xor r11d, r11d # OUCH: BSF and TZCNT return at most 63, shr rdx, cl test cl, 64 # so this is dead code! cmovne rax, rdx # cmovne rdx, r11 # mov r8, rsi mov r9, rdi add r10d, ecx mov rcx, r8 mov rsi, rax mov rdi, rdx test r8, r8 je .L14 .L4: rep bsf rcx, rcx mov rax, r8 mov rdx, r9 xor r11d, r11d # OUCH: BSF and TZCNT return at most 63, shr rdx, cl shrd rax, r9, cl and ecx, 64 # (there's also no need to modify ECX) cmovne rax, rdx # so this is dead code! cmovne rdx, r11 # .L7: mov rcx, rsi test rsi, rsi jne .L5 mov rsi, rdi xor edi, edi mov rcx, rsi .L5: rep bsf rcx, rcx xor r8d, r8d # OUCH: BSF and TZCNT return at most 63, shrd rsi, rdi, cl shr rdi, cl test cl, 64 # so this is dead code, mov rcx, rdx cmovne rsi, rdi # and that too! cmovne rdi, r8 # cmp rax, rsi sbb rcx, rdi jnc .L6 mov r8, rax mov r9, rdx mov rax, rsi mov rdx, rdi mov rsi, r8 mov rdi, r9 .L6: sub rsi, rax sbb rdi, rdx mov rcx, rdi or rcx, rsi jne .L7 mov ecx, r10d xor esi, esi shld rdx, rax, cl sal rax, cl and ecx, 64 # Oops: there's no need to modify ECX! cmovne rdx, rax cmovne rax, rsi ret .L9: mov rax, rsi mov rdx, rdi .L1: ret .L14: mov r8, r9 xor r9d, r9d mov rcx, r8 jmp .L4 20 superfluous instructions of the total 102 instructions! NOT AMUSED Stefan Kanthak PS: <https://skanthak.homepage.t-online.de/gcc.html#case27-amd64-s> shows properly written assembly code.