Good morning. I have some code that looks like
typedef unsigned long long uint64;
typedef unsigned int uint32;

typedef struct { uint64 x[8]; } __attribute__((aligned(64))) v_t;

inline v_t xor(v_t a, v_t b) {
    v_t Q;
    for (int i=0; i<8; i++)
        Q.x[i] = a.x[i] ^ b.x[i];
    return Q;
}

void xor_matrix_precomp(v_t* __restrict__ a, v_t* __restrict__ c, v_t* __restrict__ d, int n) {
    uint32 i,j;
    for (i=0; i<n; i++) {
        v_t vi = a[i];
        v_t acc = c[0*256 + (vi.x[0] & 0xff)];
        for (j=1; j<64; j++) {
            uint32 w = j>>3, b=j&7;
            acc = xor(acc, c[j*256 + ((vi.x[w] >> (8*b))&0xff)]);
        }
        d[i] = xor(d[i], acc);
    }
}

built with

/home/nfsworld/tooling/gcc-9.1-isl16/bin/gcc -O3 -fomit-frame-pointer -march=skylake-avx512 -mprefer-vector-width=512 -S badeg.c

and the inner xor_matrix_precomp loop is not vectorised at all: it carries ‘acc’ in eight x86-64 registers rather than one ZMM. What am I missing? -fopt-info-all-vec is not helping me; it points out that it can’t vectorise the i or j loop but says nothing about the loop in the inlined xor() call. I’m sure I’m missing something obvious; many thanks for your help. Tom