That was with gcc 4.9.1 -O2 -march=native, by the way. Here's clang 3.5.0
-O2 -march=native:
0000000000400570 <squareroot>:
400570: c5 fb 59 c8 vmulsd %xmm0,%xmm0,%xmm1
400574: c5 f3 5c d0 vsubsd %xmm0,%xmm1,%xmm2
400578: c5 e9 54 0d 80 04 00 vandpd 0x480(%rip),%xmm2,%xmm1 #
400a00 <_IO_stdin_used+0x60>
40057f: 00
400580: c5 f9 2e 0d 28 04 00 vucomisd 0x428(%rip),%xmm1 # 4009b0
<_IO_stdin_used+0x10>
400587: 00
400588: 76 3d jbe 4005c7 <squareroot+0x57>
40058a: c5 fb 10 1d 26 04 00 vmovsd 0x426(%rip),%xmm3 # 4009b8
<_IO_stdin_used+0x18>
400591: 00
400592: c5 fb 10 25 66 04 00 vmovsd 0x466(%rip),%xmm4 # 400a00
<_IO_stdin_used+0x60>
400599: 00
40059a: c5 f8 28 c8 vmovaps %xmm0,%xmm1
40059e: 66 90 xchg %ax,%ax
4005a0: c5 f3 59 eb vmulsd %xmm3,%xmm1,%xmm5
4005a4: c5 eb 5e d5 vdivsd %xmm5,%xmm2,%xmm2
4005a8: c5 f3 58 ca vaddsd %xmm2,%xmm1,%xmm1
4005ac: c5 f3 59 d1 vmulsd %xmm1,%xmm1,%xmm2
4005b0: c5 eb 5c d0 vsubsd %xmm0,%xmm2,%xmm2
4005b4: c5 e9 54 ec vandpd %xmm4,%xmm2,%xmm5
4005b8: c5 f9 2e 2d f0 03 00 vucomisd 0x3f0(%rip),%xmm5 # 4009b0
<_IO_stdin_used+0x10>
4005bf: 00
4005c0: 77 de ja 4005a0 <squareroot+0x30>
4005c2: c5 f8 28 c1 vmovaps %xmm1,%xmm0
4005c6: c3 retq
4005c7: c3 retq
4005c8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
4005cf: 00
On Tuesday, January 27, 2015 at 7:17:58 PM UTC-5, Miles Lubin wrote:
>
> Here's the Julia assembly:
>
> function squareroot(x)
> it = x
> err = it*it-x # line 5
> while abs(err) > 1e-13
> it = it - (err)/(2it) # line 7
> err = it*it-x
> end
> return it
> end
> Source line: 5
> push RBP
> mov RBP, RSP
> Source line: 5
> vmulsd XMM1, XMM0, XMM0
> vsubsd XMM2, XMM1, XMM0
> Source line: 6
> vmovq RCX, XMM2
> movabs RAX, 9223372036854775807
> and RCX, RAX
> vmovq XMM1, RCX
> movabs RCX, 140067103257920
> vucomisd XMM1, QWORD PTR [RCX]
> ja 9
> vmovaps XMM1, XMM0
> jmpq 61
> movabs RDX, 140067103257928
> vmovsd XMM3, QWORD PTR [RDX]
> vmovaps XMM1, XMM0
> Source line: 7
> vmulsd XMM4, XMM1, XMM3
> vdivsd XMM2, XMM2, XMM4
> vaddsd XMM1, XMM1, XMM2
> Source line: 8
> vmulsd XMM2, XMM1, XMM1
> vsubsd XMM2, XMM2, XMM0
> vmovq RDX, XMM2
> and RDX, RAX
> vmovq XMM4, RDX
> vucomisd XMM4, QWORD PTR [RCX]
> ja -43
> Source line: 10
> vmovaps XMM0, XMM1
> pop RBP
> ret
>
> And for C:
> double squareroot(double x)
> {
> double it = x;
> double err = it*it-x;
> while (fabs(err) > 1e-13) {
> it = it - (err)/(2*it);
> err = it*it-x;
> }
> return it;
> }
> 00000000004007d0 <squareroot>:
> 4007d0: c5 fb 59 c8 vmulsd %xmm0,%xmm0,%xmm1
> 4007d4: c5 f9 28 d0 vmovapd %xmm0,%xmm2
> 4007d8: c5 fb 10 2d e0 01 00 vmovsd 0x1e0(%rip),%xmm5 # 4009c0
> <_IO_stdin_used+0x70>
> 4007df: 00
> 4007e0: c5 fb 10 25 a8 01 00 vmovsd 0x1a8(%rip),%xmm4 # 400990
> <_IO_stdin_used+0x40>
> 4007e7: 00
> 4007e8: c5 f3 5c c8 vsubsd %xmm0,%xmm1,%xmm1
> 4007ec: c5 f9 28 d9 vmovapd %xmm1,%xmm3
> 4007f0: c5 e1 54 dd vandpd %xmm5,%xmm3,%xmm3
> 4007f4: c5 f9 2e dc vucomisd %xmm4,%xmm3
> 4007f8: 76 28 jbe 400822 <squareroot+0x52>
> 4007fa: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
> 400800: c5 eb 58 da vaddsd %xmm2,%xmm2,%xmm3
> 400804: c5 f3 5e cb vdivsd %xmm3,%xmm1,%xmm1
> 400808: c5 eb 5c d1 vsubsd %xmm1,%xmm2,%xmm2
> 40080c: c5 eb 59 ca vmulsd %xmm2,%xmm2,%xmm1
> 400810: c5 f3 5c c8 vsubsd %xmm0,%xmm1,%xmm1
> 400814: c5 f9 28 d9 vmovapd %xmm1,%xmm3
> 400818: c5 e1 54 dd vandpd %xmm5,%xmm3,%xmm3
> 40081c: c5 f9 2e dc vucomisd %xmm4,%xmm3
> 400820: 77 de ja 400800 <squareroot+0x30>
> 400822: c5 f9 28 c2 vmovapd %xmm2,%xmm0
> 400826: c3 retq
> 400827: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
> 40082e: 00 00
>
> I'm definitely not trying to making the fastest possible sqrt
> implementation. This is just for benchmarking purposes.
>
> Thanks,
> Miles
>
> On Tuesday, January 27, 2015 at 7:01:29 PM UTC-5, Erik Schnetter wrote:
>>
>> I would begin by looking at the generated assembler code. You would use
>> "objdump -d" for C, and the "code_native" function for Julia. Can you post
>> the results?
>>
>> The discussion below is beside your points, but nevertheless:
>>
>> Your sqrt implementation is very slow for two reasons:
>>
>> (1) You are using a while loop to check for termination. If you use
>> exponent manipulation, then your starting guess can be within sqrt(2) of
>> the result, and then you can determine the necessary number of iterations
>> ahead of time. You probably need 5 or 6 iterations for the accuracy you
>> chose.
>>
>> (2) You are using a division in your algorithm, where divisions are
>> expensive. If you calculated 1/sqrt(x) instead of sqrt(x), then you don't
>> need a division. Later you can calculate sqrt(x) = x * (1/sqrt(x)).
>>
>> A good sqrt routine should take less than 10^-8 seconds.
>>
>> -erik
>>
>