https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118189
Bug ID: 118189
Summary: Weird vec_construct of elements that come from contiguous memory
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: liuhongt at gcc dot gnu.org
Blocks: 53947
Target Milestone: ---

double foo (double* a, double* b, double c)
{
  c += (a[0] - b[0]) * (a[0] - b[0]);
  c += (a[1] - b[1]) * (a[1] - b[1]);
  c += (a[2] - b[2]) * (a[2] - b[2]);
  return c;
}

gcc -O2 -march=x86-64-v3

/app/example.cpp:6:7: note: node (external) 0x25c51bd0 (max_nunits=2, refcnt=1) vector(2) double
/app/example.cpp:6:7: note: stmt 0 _9 = MEM[(double *)a_14(D) + 16B];
/app/example.cpp:6:7: note: stmt 1 _5 = MEM[(double *)a_14(D) + 8B];
/app/example.cpp:6:7: note: node (external) 0x25c51c60 (max_nunits=2, refcnt=1) vector(2) double
/app/example.cpp:6:7: note: stmt 0 _10 = MEM[(double *)b_15(D) + 16B];
/app/example.cpp:6:7: note: stmt 1 _6 = MEM[(double *)b_15(D) + 8B];
/app/example.cpp:6:7: note: Cost model analysis:
powmult_4 + c_18 3 times scalar_stmt costs 12 in body
_11 * _11 1 times scalar_stmt costs 20 in body
_7 * _7 1 times scalar_stmt costs 20 in body
_9 - _10 1 times scalar_stmt costs 12 in body
_5 - _6 1 times scalar_stmt costs 12 in body
_9 - _10 1 times vector_stmt costs 12 in body
node 0x25c51bd0 1 times vec_construct costs 4 in prologue
node 0x25c51c60 1 times vec_construct costs 4 in prologue
_11 * _11 1 times vector_stmt costs 20 in body
powmult_4 + c_18 1 times vector_stmt costs 12 in body
powmult_4 + c_18 1 times vec_perm costs 4 in body
powmult_4 + c_18 1 times vec_to_scalar costs 4 in body
powmult_4 + c_18 2 times scalar_stmt costs 8 in body
/app/example.cpp:6:7: note: Cost model analysis for part in loop 0:
  Vector cost: 68
  Scalar cost: 76
...
<bb 2> [local count: 1073741824]:
  # DEBUG BEGIN_STMT
  _1 = *a_14(D);
  _2 = *b_15(D);
  _3 = _1 - _2;
  powmult_12 = _3 * _3;
  # DEBUG c => powmult_12 + c_16(D)
  # DEBUG BEGIN_STMT
  _5 = MEM[(double *)a_14(D) + 8B];
  _6 = MEM[(double *)b_15(D) + 8B];
  _7 = _5 - _6;
  powmult_8 = _7 * _7;
  _20 = powmult_8 + powmult_12;
  c_18 = c_16(D) + _20;
  # DEBUG c => c_18
  # DEBUG BEGIN_STMT
  _9 = MEM[(double *)a_14(D) + 16B];
  _23 = {_9, _5};    <------------------------- elements come from a + 16 and a + 8, i.e. adjacent memory
  _10 = MEM[(double *)b_15(D) + 16B];
  _22 = {_10, _6};
  vect__11.3_21 = _23 - _22;
  vect_powmult_4.4_17 = vect__11.3_21 * vect__11.3_21;
  _11 = _9 - _10;
  powmult_4 = _11 * _11;
  _24 = .REDUC_PLUS (vect_powmult_4.4_17);
  _25 = c_16(D) + powmult_12;
  _26 = _24 + _25;
  c_19 = _26;
  # DEBUG c => c_19
  # DEBUG BEGIN_STMT
  return c_19;
}

Referenced Bugs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations