[Bug middle-end/55266] vector expansion: 24 movs for 4 adds

vincenzo.innocente at cern dot ch Sun, 03 Mar 2013 03:58:45 -0800


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55266




--- Comment #4 from vincenzo Innocente <vincenzo.innocente at cern dot ch> 
2013-03-03 11:58:24 UTC ---

I still see problems when calling inline functions.

It seems that the code to satisfy the "calling ABI" is generated anyhow.



Take the example below and compare the code generated for "dotd1" with that generated for "dotd2".

dotd2 has a "storm" of moves before the reduction.



c++ -std=c++11 -Ofast -march=corei7 -S conversions.cc -fabi-version=0 

the avx version is better but for dotd4 (actually dotd1 is lelf see like)



// GCC vector-extension types used by the test case.
// float32x4_t: a 16-byte vector of float (indexed as lanes 0..3 below).
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

// float64x4_t: a 32-byte vector of double (indexed as lanes 0..3 below).
typedef double  __attribute__( ( vector_size( 32 ) ) ) float64x4_t;





// Widen a float vector to a double vector, lane by lane.
// Takes and returns the vectors by value; declared inline so that callers
// (see dotd2) exercise the inline-function path discussed in the report.
inline 

float64x4_t convert(float32x4_t f) {

  // Brace-initialise the result from the four float lanes; each element is
  // implicitly converted from float to double.
  return float64x4_t{f[0],f[1],f[2],f[3]};

}



// Dot product of two float vectors, accumulated in float.
// No widening to double is involved in this variant.
float dotf(float32x4_t x, float32x4_t y) {

  float ret=0;

  // Sum the lane-wise products over lanes 0..3.
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];

  return ret;

}



// Dot product of two double vectors, accumulated in double.
// Declared inline and taking its vectors by value; it is the reduction helper
// called by dotd2 and dotd4.
inline

double dotd(float64x4_t x, float64x4_t y) {

  double ret=0;

  // Sum the lane-wise products over lanes 0..3.
  for (int i=0;i!=4;++i) ret+=x[i]*y[i];

  return ret;

}







// Variant 1: widen both inputs to double with an element-wise loop, then do
// the reduction directly in this function (no helper calls).
// The report compares the code generated for this function with that for dotd2.
float dotd1(float32x4_t x, float32x4_t y) {

  float64x4_t dx,dy;

  // Every lane of dx and dy is assigned here before being read below.
  for (int i=0;i!=4;++i) {

    dx[i]=x[i]; dy[i]=y[i];

  }

  double ret=0;

  // Reduction written out in place, accumulating in double.
  for (int i=0;i!=4;++i) ret+=dx[i]*dy[i];

  // The double accumulator is implicitly converted to the float return type.
  return ret;

}



// Variant 2: same computation as dotd1, but both the widening and the
// reduction go through the inline helpers convert() and dotd().
// This is the function the report says shows a "storm" of moves before the
// reduction.
float dotd2(float32x4_t x, float32x4_t y) {

  float64x4_t dx=convert(x);

  float64x4_t dy=convert(y);

  // dotd() returns double; the result is implicitly converted to float.
  return dotd(dx,dy);

}





// Variant 3: widen with brace initialisation written directly in this
// function (the same expression convert() uses), then do the reduction in
// place without calling dotd().
float dotd3(float32x4_t x, float32x4_t y) {

  float64x4_t dx{x[0],x[1],x[2],x[3]};

  float64x4_t dy{y[0],y[1],y[2],y[3]};

  double ret=0;

  // Reduction written out in place, accumulating in double.
  for (int i=0;i!=4;++i) ret+=dx[i]*dy[i];

  // The double accumulator is implicitly converted to the float return type.
  return ret;

}



// Variant 4: widen with the element-wise loop (as in dotd1), but perform the
// reduction through the inline helper dotd() (as in dotd2).
// The report singles this function out when discussing the AVX build.
float dotd4(float32x4_t x, float32x4_t y) {

  float64x4_t dx,dy;

  // Every lane of dx and dy is assigned here before being passed to dotd().
  for (int i=0;i!=4;++i) {

    dx[i]=x[i]; dy[i]=y[i];

  }

  // dotd() returns double; the result is implicitly converted to float.
  return dotd(dx,dy);

}

[Bug middle-end/55266] vector expansion: 24 movs for 4 adds

Reply via email to