jun zhang <zhangjungcc at gmail dot com> changed:

           What    |Removed                     |Added
                 CC|                            |zhangjungcc at gmail dot com

--- Comment #2 from jun zhang <zhangjungcc at gmail dot com> ---
  The following loop cannot be vectorized by GCC, but can be by LLVM. It has 3%
more info, please refer:

/* Basic scalar type aliases used by the structures and code below. */
typedef signed int  OPJ_INT32;
typedef unsigned int OPJ_UINT32;
/* Boolean type (a plain int) together with its two values. */
typedef int OPJ_BOOL;
#define OPJ_TRUE 1
#define OPJ_FALSE 0
typedef char          OPJ_CHAR;
typedef float         OPJ_FLOAT32;
typedef double        OPJ_FLOAT64;
typedef unsigned char OPJ_BYTE;
/* Expands to the C99 `restrict` qualifier. */
#define OPJ_RESTRICT restrict
/* NOTE(review): not referenced anywhere in this excerpt; presumably a
   thread-local-storage key index -- confirm against the full project. */
#define OPJ_TLS_KEY_T1  0
#include <stdio.h>
#include <string.h>
typedef size_t   OPJ_SIZE_T;

/**
 * Code-block descriptor (the `_enc` suffix suggests the encoder-side
 * variant -- confirm against the full project).
 *
 * The layer and pass pointers are commented out in this excerpt; their
 * types are not declared here.
 */
typedef struct opj_tcd_cblk_enc {
    OPJ_BYTE* data;               /* Data */
//    opj_tcd_layer_t* layers;      /* layer information */
//    opj_tcd_pass_t* passes;       /* information about the passes */
    /* Dimension of the code-block: upper-left corner (x0, y0),
       lower-right corner (x1, y1) */
    OPJ_INT32 x0, y0, x1, y1;
    OPJ_UINT32 numbps;
    OPJ_UINT32 numlenbits;
    OPJ_UINT32 data_size;         /* Size of allocated data buffer */
    /* The type specifier was missing on this member, which does not
       compile; it is an unsigned counter like the two that follow. */
    OPJ_UINT32 numpasses;         /* number of passes already done for the code-block */
    OPJ_UINT32 numpassesinlayers; /* number of passes in the layer */
    OPJ_UINT32 totalpasses;       /* total number of passes */
} opj_tcd_cblk_enc_t;
/**
 * T1 working state: a buffer of 32-bit values with its dimensions, plus
 * scratch storage that only the decoder uses.
 */
typedef struct opj_t1 {
    /* MQC component (member commented out in this excerpt): */
    /* opj_mqc_t mqc; */

    /* Value buffer; opj_t1_encode_cblk walks it as h rows of w entries. */
    OPJ_INT32 *data;

    /*
     * Flags used by decoder and encoder (member commented out in this
     * excerpt).  Layout: flags[1+0] holds the state of col=0, row=0..3;
     * flags[1+1] holds col=1, row=0..3; flags[1+flags_stride] holds
     * col=0, row=4..7; and so on.  This arrangement avoids too much cache
     * thrashing when processing 4 rows vertically, as the various
     * decoding steps do.
     */
    /* opj_flag_t *flags; */

    OPJ_UINT32 w;
    OPJ_UINT32 h;
    OPJ_UINT32 datasize;
    OPJ_UINT32 flagssize;
    OPJ_BOOL encoder;

    /* The three members below are only used by the decoder. */

    /* Set to TRUE in a multithreaded context. */
    OPJ_BOOL mustuse_cblkdatabuffer;
    /* Temporary buffer used to concatenate all chunks of a code-block. */
    OPJ_BYTE *cblkdatabuffer;
    /* Maximum size available in cblkdatabuffer. */
    OPJ_UINT32 cblkdatabuffersize;
} opj_t1_t;

#define INLINE __inline__
/**
 * Return the larger of two signed integers.
 *
 * @param a first value
 * @param b second value
 * @return a if a > b, otherwise b
 */
static INLINE OPJ_INT32 opj_int_max(OPJ_INT32 a, OPJ_INT32 b)
{
    /* The braces around this body were missing, so the definition did
       not compile. */
    return (a > b) ? a : b;
}
/*
 * Convert a signed value to sign-magnitude form held in an OPJ_UINT32:
 * a non-negative x is returned unchanged; a negative x yields its
 * magnitude with bit 31 set as the sign flag.
 *
 * The macro text was cut off after the `|`; the sign-bit operand and the
 * closing parentheses are restored here.  The argument is evaluated more
 * than once, so do not pass expressions with side effects.
 * NOTE(review): negating the most negative int overflows -- callers are
 * presumably expected never to pass it; confirm.
 */
#define opj_to_smr(x)   ((x) >= 0 ? (OPJ_UINT32)(x) : ((OPJ_UINT32)(-(x)) | 0x80000000U))
/**
 * Scan the T1 data buffer, convert negative samples to sign-magnitude
 * form in place, and record whether any non-zero sample was seen.
 *
 * The buffer t1->data is walked as t1->h rows of t1->w values.  For each
 * value the running maximum of the absolute value is updated; negative
 * values are overwritten with opj_to_smr() of themselves.  Afterwards
 * cblk->numbps is set to 6 when the maximum is non-zero and to 0
 * otherwise.
 *
 * The loop is kept in its original form on purpose: it is the loop the
 * surrounding report says is not vectorized.
 *
 * @param t1    T1 state; data, w and h are read, data is rewritten
 * @param cblk  code-block whose numbps member is written
 *
 * The remaining parameters (orient, compno, level, qmfbid, stepsize,
 * cblksty, numcomps, mct_norms, mct_numcomps) are not used by the part
 * of the function visible in this excerpt.
 *
 * @return NOTE(review): the excerpt ends right after the numbps
 *         assignment, so the original return expression is unknown;
 *         0.0 is returned as a placeholder to keep the function
 *         well-formed.
 */
OPJ_FLOAT64 opj_t1_encode_cblk(opj_t1_t *t1,
                                      opj_tcd_cblk_enc_t* cblk,
                                      OPJ_UINT32 orient,
                                      OPJ_UINT32 compno,
                                      OPJ_UINT32 level,
                                      OPJ_UINT32 qmfbid,
                                      OPJ_FLOAT64 stepsize,
                                      OPJ_UINT32 cblksty,
                                      OPJ_UINT32 numcomps,
                                      const OPJ_FLOAT64 * mct_norms,
                                      OPJ_UINT32 mct_numcomps)
{
    OPJ_INT32 max;
    OPJ_UINT32 i, j;
    OPJ_INT32* datap;

    max = 0;
    datap = t1->data;
    for (j = 0; j < t1->h; ++j) {
        const OPJ_UINT32 w = t1->w;
        for (i = 0; i < w; ++i, ++datap) {
            OPJ_INT32 tmp = *datap;
            if (tmp < 0) {
                OPJ_UINT32 tmp_unsigned;
                max = opj_int_max(max, -tmp);
                tmp_unsigned = opj_to_smr(tmp);
                /* Store the unsigned bit pattern through memcpy rather
                   than a pointer cast. */
                memcpy(datap, &tmp_unsigned, sizeof(OPJ_INT32));
            } else {
                max = opj_int_max(max, tmp);
            }
        }
    }

    cblk->numbps = max ? 6 : 0;

    /* Placeholder: the original tail of the function is not shown. */
    return 0.0;
}

Reply via email to