------- Additional Comments From pinskia at gcc dot gnu dot org 2005-05-05 17:00 -------
Note with the following code, I get back to what it is without -mmx:

  union b
  {
    int i[2];
    __m64 j;
  }a;
  __m64 sum = _mm_set_pi32(0, 0);
  for (int j=0 ; j < yl ; j++)
  {
    short *p = &pic_data[j][0];
    short *r = &ref_data[j][0];
    for (int i=0 ; i < xl ; i+=4, p +=4, r+=4 )
    {
      __m64 pic = *(__m64 *)p;
      __m64 ref = *(__m64 *)r;
      // pic - ref
      pic = _mm_sub_pi16 (pic, ref);
      // abs (pic - ref)
      ref = _mm_srai_pi16(pic, 15);
      pic = _mm_xor_si64(pic, ref);
      pic = _mm_sub_pi16 (pic, ref);
      // sum += abs(pic -ref)
      ref = _mm_xor_si64(ref, ref);
      ref = _mm_unpackhi_pi16(pic, ref);
      pic = _mm_unpacklo_pi16(pic, pic);
      pic = _mm_srai_pi32 (pic, 16);
      //ref = _mm_srai_pi32 (ref, 16);
      pic = _mm_add_pi32 (pic, ref);
      sum = _mm_add_pi32 (sum, pic);
    }
  }
  a.j = sum;
  // int *result = (int *) &sum;
  _mm_empty();
  // return result[0] + result[1];
  return a.i[0] + a.i[1];

--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21395