http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50256
Bug #: 50256
Summary: AVR GCC - several unnecessary register moves
Classification: Unclassified
Product: gcc
Version: 4.3.3
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
AssignedTo: [email protected]
ReportedBy: [email protected]
Hi,
AVR GCC seems to generate inefficient code. The function below multiplies two
unsigned values of at most 24 bits each, then effectively shifts the product right by 24 bits.
/*
 * MulU3U3S3 - multiply two unsigned operands of up to 24 bits each and return
 * the product shifted right by 24 bits, using hand-written AVR inline assembly
 * built from 8x8-bit `mul` partial products.
 *
 * a_u3, b_u3: operands; only their three low bytes (%A, %B, %C) are read.
 * Returns:    product bytes 3, 4 and 5 in the three low bytes of the result;
 *             the top byte of the result is cleared.
 *
 * The product is accumulated one byte per register in r2..r7
 * (r2 = byte 0 ... r7 = byte 5). Each `mul` leaves its 16-bit result in r1:r0,
 * so r20 is cleared up front and used as the zero operand when propagating
 * carries with `adc`.
 *
 * NOTE(review): r4..r7 are only ever updated with add/adc; no instruction in
 * this asm block writes them first, so the accumulation appears to start from
 * whatever those registers held on entry - confirm, and clear them if so.
 * NOTE(review): r0 and r1 are both saved/restored by hand (push/pop) and named
 * in the clobber list; one of the two is presumably redundant - confirm.
 * NOTE(review): naming fixed scratch registers (r2..r7, r20) in the clobber
 * list constrains the compiler's register allocation around this statement
 * and may contribute to extra register moves - confirm.
 */
uint32_t MulU3U3S3(uint32_t a_u3, uint32_t b_u3)
{
uint32_t answer;
asm volatile
(
// Save r0/r1: every `mul` below overwrites them.
"push r0" "\n\t"
"push r1" "\n\t"
"clr r20" "\n\t" // r20 = 0, used as the zero operand for carry propagation
// 0 byte shifts: partial product that lands at product byte 0
"mul %A1,%A2" "\n\t" // a.byte0 * b.byte0
"mov r2,r0" "\n\t"
"mov r3,r1" "\n\t"
// 1 byte shifts: partial products added in starting at product byte 1
"mul %A1,%B2" "\n\t"
"add r3,r0" "\n\t"
"adc r4,r1" "\n\t" // first use of r4 (not initialised above, see header note)
"adc r5,r20" "\n\t" // first use of r5 (not initialised above, see header note)
"mul %A2,%B1" "\n\t"
"add r3,r0" "\n\t"
"adc r4,r1" "\n\t"
"adc r5,r20" "\n\t"
// 2 byte shifts: partial products added in starting at product byte 2
"mul %A1,%C2" "\n\t"
"add r4,r0" "\n\t"
"adc r5,r1" "\n\t"
"adc r6,r20" "\n\t" // first use of r6 (not initialised above, see header note)
"mul %A2,%C1" "\n\t"
"add r4,r0" "\n\t"
"adc r5,r1" "\n\t"
"adc r6,r20" "\n\t"
"mul %B2,%B1" "\n\t"
"add r4,r0" "\n\t"
"adc r5,r1" "\n\t"
"adc r6,r20" "\n\t"
// 3 byte shifts: partial products added in starting at product byte 3
"mul %B1,%C2" "\n\t"
"add r5,r0" "\n\t"
"adc r6,r1" "\n\t"
"adc r7,r20" "\n\t" // first use of r7 (not initialised above, see header note)
"mul %B2,%C1" "\n\t"
"add r5,r0" "\n\t"
"adc r6,r1" "\n\t"
"adc r7,r20" "\n\t"
// 4 byte shifts: partial product added in starting at product byte 4
"mul %C2,%C1" "\n\t"
"add r6,r0" "\n\t"
"adc r7,r1" "\n\t"
// Result = product bytes 3..5 (the product shifted right by 24 bits);
// the top byte of the 32-bit result is zeroed.
"mov %A0,r5" "\n\t"
"mov %B0,r6" "\n\t"
"mov %C0,r7" "\n\t"
"clr %D0" "\n\t"
// Restore r1/r0 in the reverse order of the pushes above.
"pop r1" "\n\t"
"pop r0" "\n\t"
// Output: %0 = answer, early-clobber ("&") so it is not allocated on top of an input.
: "=&r" (answer)
// Inputs: %1 = a_u3, %2 = b_u3, each in general-purpose registers.
: "r" (a_u3), "r" (b_u3)
// Clobbers: mul result pair, the six accumulator bytes, and the zero helper.
: "r0","r1","r2","r3","r4","r5","r6","r7","r20"
);
return (answer);
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Calling code
(note the moves after the function call... why can't the function leave the answer in place?)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
878 040c 6CE5 ldi r22,lo8(167772)
879 040e 7FE8 ldi r23,hi8(167772)
880 0410 82E0 ldi r24,hlo8(167772)
881 0412 90E0 ldi r25,hhi8(167772)
882 0414 20EA ldi r18,lo8(100000)
883 0416 36E8 ldi r19,hi8(100000)
884 0418 41E0 ldi r20,hlo8(100000)
885 041a 50E0 ldi r21,hhi8(100000)
886 041c 0E94 0000 call MulU3U3S3
887 0420 7B01 movw r14,r22
888 0422 8C01 movw r16,r24
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Called code is below. Note that
- one argument is unnecessarily moved to a new location
- at end, result is unnecessarily moved to a new location
Also, this code is unnecessary:
283 010e 8901 movw r16,r18
284 0110 9A01 movw r18,r20
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
263 .global MulU3U3S3
265 MulU3U3S3:
266 .LFB8:
267 .LM19:
268 .LVL22:
269 00f6 2F92 push r2
270 00f8 3F92 push r3
271 00fa 4F92 push r4
272 00fc 5F92 push r5
273 00fe 6F92 push r6
274 0100 7F92 push r7
275 0102 CF92 push r12
276 0104 DF92 push r13
277 0106 EF92 push r14
278 0108 FF92 push r15
279 010a 0F93 push r16
280 010c 1F93 push r17
281 /* prologue: function */
282 /* frame size = 0 */
283 010e 8901 movw r16,r18
284 0110 9A01 movw r18,r20
285 .LM20:
286 0112 6801 movw r12,r16
287 0114 7901 movw r14,r18
288 /* #APP */
289 ; 326 "maths_mul.c" 1
290 0116 0F92 push r0
291 0118 1F92 push r1
292 011a 4427 clr r20
293 011c 6C9D mul r22,r12
294 011e 202C mov r2,r0
295 0120 312C mov r3,r1
296 0122 6D9D mul r22,r13
297 0124 300C add r3,r0
298 0126 411C adc r4,r1
299 0128 541E adc r5,r20
300 012a C79E mul r12,r23
301 012c 300C add r3,r0
302 012e 411C adc r4,r1
303 0130 541E adc r5,r20
304 0132 6E9D mul r22,r14
305 0134 400C add r4,r0
306 0136 511C adc r5,r1
307 0138 641E adc r6,r20
308 013a C89E mul r12,r24
309 013c 400C add r4,r0
310 013e 511C adc r5,r1
311 0140 641E adc r6,r20
312 0142 D79E mul r13,r23
313 0144 400C add r4,r0
314 0146 511C adc r5,r1
315 0148 641E adc r6,r20
316 014a 7E9D mul r23,r14
317 014c 500C add r5,r0
318 014e 611C adc r6,r1
319 0150 741E adc r7,r20
320 0152 D89E mul r13,r24
321 0154 500C add r5,r0
322 0156 611C adc r6,r1
323 0158 741E adc r7,r20
324 015a E89E mul r14,r24
325 015c 600C add r6,r0
326 015e 711C adc r7,r1
327 0160 052D mov r16,r5
328 0162 162D mov r17,r6
329 0164 272D mov r18,r7
330 0166 3327 clr r19
331 0168 1F90 pop r1
332 016a 0F90 pop r0
333
334 ; 0 "" 2
335 .LVL23:
336 .LM21:
337 /* #NOAPP */
338 016c B801 movw r22,r16
339 .LVL24:
340 016e C901 movw r24,r18
341 .LVL25:
342 /* epilogue start */
343 0170 1F91 pop r17
344 0172 0F91 pop r16
345 0174 FF90 pop r15
346 0176 EF90 pop r14
347 0178 DF90 pop r13
348 017a CF90 pop r12
349 017c 7F90 pop r7
350 017e 6F90 pop r6
351 0180 5F90 pop r5
352 0182 4F90 pop r4
353 0184 3F90 pop r3
354 0186 2F90 pop r2
355 0188 0895 ret