> I would say so. It saves code size and also uop space unless the two
> can magically fuse to a immediate to %xmm move (I doubt that).

I made a simple benchmark:
double a=10; int main() { long int i; double sum,val1,val2,val3,val4; for (i=0;i<1000000000;i++) { #if 1 #if 1 asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq %%r8, %0": "=x"(val1): :"r8","xmm11"); asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq %%r8, %0": "=x"(val2): :"r8","xmm11"); asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq %%r8, %0": "=x"(val3): :"r8","xmm11"); asm __volatile__("movabsq $0x3ff03db8fde2ef4e, %%r8;vmovq %%r8, %0": "=x"(val4): :"r8","xmm11"); #else asm __volatile__("movq %1, %%r8;vmovq %%r8, %0": "=x"(val1):"m"(a) :"r8","xmm11"); asm __volatile__("movq %1, %%r8;vmovq %%r8, %0": "=x"(val2):"m"(a) :"r8","xmm11"); asm __volatile__("movq %1, %%r8;vmovq %%r8, %0": "=x"(val3):"m"(a) :"r8","xmm11"); asm __volatile__("movq %1, %%r8;vmovq %%r8, %0": "=x"(val4):"m"(a) :"r8","xmm11"); #endif #else asm __volatile__("vmovq %1, %0": "=x"(val1):"m"(a) :"r8","xmm11"); asm __volatile__("vmovq %1, %0": "=x"(val2):"m"(a) :"r8","xmm11"); asm __volatile__("vmovq %1, %0": "=x"(val3):"m"(a) :"r8","xmm11"); asm __volatile__("vmovq %1, %0": "=x"(val4):"m"(a) :"r8","xmm11"); #endif sum+=val1+val2+val3+val4; } return sum; and indeed the third variant runs 1.2s while the first two takes equal time 2.4s on my zen2 laptop.