https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110015
/*
 * jun zhang <zhangjungcc at gmail dot com> changed:
 *
 *            What    |Removed                     |Added
 * ----------------------------------------------------------------------------
 *                  CC|                            |zhangjungcc at gmail dot com
 *
 * --- Comment #2 from jun zhang <zhangjungcc at gmail dot com> ---
 * GCC cannot vectorize the following loop, but LLVM can; the vectorized
 * version shows a 3% improvement. For more information, see:
 * https://godbolt.org/z/zMbjq41h5
 */

#include <string.h>

typedef signed int OPJ_INT32;
typedef unsigned int OPJ_UINT32;
typedef int OPJ_BOOL;
#define OPJ_TRUE 1
#define OPJ_FALSE 0
typedef char OPJ_CHAR;
typedef float OPJ_FLOAT32;
typedef double OPJ_FLOAT64;
typedef unsigned char OPJ_BYTE;

#define T1_NMSEDEC_FRACBITS 6
#define OPJ_RESTRICT restrict
#define OPJ_TLS_KEY_T1 0

#include <stdio.h>

typedef size_t OPJ_SIZE_T;

/* Encoder-side code-block description (reduced for this test case). */
typedef struct opj_tcd_cblk_enc {
    OPJ_BYTE* data;               /* Data */
    // opj_tcd_layer_t* layers;   /* layer information */
    // opj_tcd_pass_t* passes;    /* information about the passes */
    OPJ_INT32 x0, y0, x1, y1;     /* dimension of the code-blocks : left upper corner (x0, y0) right low corner (x1,y1) */
    OPJ_UINT32 numbps;
    OPJ_UINT32 numlenbits;
    OPJ_UINT32 data_size;         /* Size of allocated data buffer */
    OPJ_UINT32 numpasses;         /* number of pass already done for the code-blocks */
    OPJ_UINT32 numpassesinlayers; /* number of passes in the layer */
    OPJ_UINT32 totalpasses;       /* total number of passes */
} opj_tcd_cblk_enc_t;

/* Tier-1 coder state (reduced for this test case). */
typedef struct opj_t1 {
    /** MQC component */
    // opj_mqc_t mqc;

    OPJ_INT32 *data;
    /** Flags used by decoder and encoder.
     * Such that flags[1+0] is for state of col=0,row=0..3,
     * flags[1+1] for col=1, row=0..3, flags[1+flags_stride] for col=0,row=4..7, ...
     * This array avoids too much cache thrashing when processing by 4 vertical samples
     * as done in the various decoding steps. */
    // opj_flag_t *flags;

    OPJ_UINT32 w;
    OPJ_UINT32 h;
    OPJ_UINT32 datasize;
    OPJ_UINT32 flagssize;
    OPJ_BOOL encoder;

    /* The 3 variables below are only used by the decoder */
    /* set to TRUE in multithreaded context */
    OPJ_BOOL mustuse_cblkdatabuffer;
    /* Temporary buffer to concatenate all chunks of a codeblock */
    OPJ_BYTE *cblkdatabuffer;
    /* Maximum size available in cblkdatabuffer */
    OPJ_UINT32 cblkdatabuffersize;
} opj_t1_t;

#define INLINE __inline__

/* Return the larger of a and b. */
static INLINE OPJ_INT32 opj_int_max(OPJ_INT32 a, OPJ_INT32 b)
{
    return (a > b) ? a : b;
}

/*
 * Map a signed value to "magnitude with bit 31 as sign flag":
 * non-negative values are returned unchanged, negative values become
 * (-(x)) | 0x80000000.  The argument is evaluated more than once, so it
 * must be free of side effects.  It is fully parenthesized so that compound
 * expressions such as opj_to_smr(a - b) negate the whole argument.
 */
#define opj_to_smr(x) \
    ((x) >= 0 ? (OPJ_UINT32)(x) : ((OPJ_UINT32)(-(x)) | 0x80000000U))

/*
 * Reduced test case: scan the t1->h rows of t1->w samples stored
 * contiguously at t1->data, track the largest absolute sample value, and
 * rewrite every negative sample in place in the opj_to_smr() representation.
 *
 * On return cblk->numbps is 6 if any sample was non-zero and 0 otherwise.
 *
 * All parameters other than t1 and cblk are unused in this reduced version
 * and are kept only so the signature stays unchanged.
 *
 * Returns 0.0.  The original reproducer fell off the end of this non-void
 * function, which is undefined behaviour as soon as a caller uses the result.
 *
 * NOTE: a sample equal to the most negative OPJ_INT32 value would overflow
 * when negated; the loop is deliberately left in its reported shape and does
 * not guard against that input.
 */
OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
                               opj_tcd_cblk_enc_t* cblk,
                               OPJ_UINT32 orient,
                               OPJ_UINT32 compno,
                               OPJ_UINT32 level,
                               OPJ_UINT32 qmfbid,
                               OPJ_FLOAT64 stepsize,
                               OPJ_UINT32 cblksty,
                               OPJ_UINT32 numcomps,
                               const OPJ_FLOAT64 * mct_norms,
                               OPJ_UINT32 mct_numcomps)
{
    OPJ_INT32 max;
    OPJ_UINT32 i, j;
    OPJ_INT32* datap;

    /* Intentionally unused in the reduced test case. */
    (void)orient;
    (void)compno;
    (void)level;
    (void)qmfbid;
    (void)stepsize;
    (void)cblksty;
    (void)numcomps;
    (void)mct_norms;
    (void)mct_numcomps;

    max = 0;
    datap = t1->data;

    /* This is the loop the report is about; keep its shape as reported. */
    for (j = 0; j < t1->h; ++j) {
        const OPJ_UINT32 w = t1->w;
        for (i = 0; i < w; ++i, ++datap) {
            OPJ_INT32 tmp = *datap;
            if (tmp < 0) {
                OPJ_UINT32 tmp_unsigned;
                max = opj_int_max(max, -tmp);
                tmp_unsigned = opj_to_smr(tmp);
                /* memcpy stores the unsigned bit pattern into the signed
                 * slot without relying on an implementation-defined
                 * unsigned-to-signed conversion. */
                memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32));
            } else {
                max = opj_int_max(max, tmp);
            }
        }
    }

    cblk->numbps = max ? 6 : 0;

    return 0.0;
}