I have a very simple program that performs a complex-valued matrix convolution. It runs about 3 times slower when compiled with GCC 4.3.2 than when compiled with GCC 4.0.2. I am compiling with -O3 and no additional optimization flags. One more interesting thing to note is that this behavior is seen only with the complex data type; if I use the plain float data type, the timings are better with 4.3.2. Please help me.
#include <complex>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <time.h>

// Single-precision complex sample type used by every buffer in the benchmark.
typedef std::complex<float> cfloat;

// Returns the processor time used by the program so far, in seconds,
// computed as clock() divided by CLOCKS_PER_SEC.
float procTimeInSeconds()
{
    return clock() / static_cast<float>(CLOCKS_PER_SEC);
}

// Benchmark driver: accumulates, for every (is, ic) pair, the element-wise
// product of a weight row and an input row into an output row, then prints
// the number of multiply-accumulate steps and the elapsed processor time.
// Command-line arguments are accepted but not used.
int main(int argc, char** arg)
{
    (void)argc;
    (void)arg;

    const int Nc = 32;   // total matrix
    const int Nx = 512;  // columns
    const int Nn = 16;   // typical value
    const int Ns = 10;
    const int Nw = Nc * Nn;

    // std::vector owns the storage and releases it on return; the original
    // used new[] without a matching delete[]. Elements are value-initialised
    // to (0, 0), the same state new complex<float>[n] produces.
    std::vector<cfloat> weights_buf(Nx * Nw * Nc);
    std::vector<cfloat> input_buf(Nx * Nw * Ns);
    std::vector<cfloat> output_buf(Nx * Nc * Ns);

    // Raw pointers into the buffers so the timed loop nest below keeps the
    // plain pointer arithmetic that the report is about.
    cfloat* all_weights = &weights_buf[0];
    cfloat* input = &input_buf[0];
    cfloat* output = &output_buf[0];

    int weights_stride_c = Nx * Nw;
    int weights_stride_w = Nx;
    int weights_stride_x = 1;

    int input_stride_s = Nx * Nw;
    int input_stride_w = Nx;
    int input_stride_x = 1;

    int output_stride_s = Nx * Nc;
    int output_stride_c = Nx;
    int output_stride_x = 1;

    // ================================================================
    // Round 1
    // Do array reductions as we descend into the loop nesting,
    // keeping temporary pointers for each result.
    // Results: Faster for unoptimized compilation, but slower for
    // compiler optimization on.
    // ================================================================
    int count = 0;
    float startTime = procTimeInSeconds();

    cfloat* input_s;
    cfloat* output_s;
    cfloat* curr_weight_c;
    cfloat* output_sc;
    cfloat* curr_weight_cw;
    cfloat* input_sw;

    for (int is = 0; is < Ns; ++is)
    {
        input_s = &input[is * input_stride_s];
        output_s = &output[is * output_stride_s];

        for (int ic = 0; ic < Nc; ++ic)
        {
            curr_weight_c = &all_weights[ic * weights_stride_c];
            output_sc = &output_s[ic * output_stride_c];

            // for that matrix, loop through w
            for (int iw = 0; iw < Nw; ++iw)
            {
                curr_weight_cw = &curr_weight_c[weights_stride_w * iw];
                input_sw = &input_s[iw * input_stride_w];

                for (int ix = 0; ix < Nx; ++ix)
                {
                    output_sc[ix * output_stride_x] +=
                        curr_weight_cw[ix * weights_stride_x] * input_sw[ix * input_stride_x];
                    ++count;
                }
            }
        }
    }

    // NOTE(review): output is never read after the loop nest; confirm the
    // optimizer is not discarding the accumulation when comparing timings.
    float netTime = procTimeInSeconds() - startTime;
    std::cout << count << " in " << netTime << " seconds, round 1" << std::endl;

    return 0;
}

// --
// Summary: performance degradation with STL complex convolution operation
// Product: gcc
// Version: 4.3.3
// Status: UNCONFIRMED
// Severity: major
// Priority: P3
// Component: c++
// AssignedTo: unassigned at gcc dot gnu dot org
// ReportedBy: jagjeet dot nain at gmail dot com
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42194