On Tue, Apr 14, 2015 at 11:32 PM, Pawel Wodkowski <pawelx.wodkowski at intel.com> wrote:
> On 2015-04-14 23:31, Ravi Kerur wrote:
>
>> +
>> +       for (i = 0; i < 8; i++) {
>> +               ymm = _mm256_loadu_si256((const __m256i *)(src +
>> i * 32));
>> +               _mm256_storeu_si256((__m256i *)(dst + i * 32),
>> ymm);
>> +       }
>> +
>>         n -= 256;
>> -       ymm1 = _mm256_loadu_si256((const __m256i *)((const
>> uint8_t *)src + 1 * 32));
>> -       ymm2 = _mm256_loadu_si256((const __m256i *)((const
>> uint8_t *)src + 2 * 32));
>> -       ymm3 = _mm256_loadu_si256((const __m256i *)((const
>> uint8_t *)src + 3 * 32));
>> -       ymm4 = _mm256_loadu_si256((const __m256i *)((const
>> uint8_t *)src + 4 * 32));
>> -       ymm5 = _mm256_loadu_si256((const __m256i *)((const
>> uint8_t *)src + 5 * 32));
>> -       ymm6 = _mm256_loadu_si256((const __m256i *)((const
>> uint8_t *)src + 6 * 32));
>> -       ymm7 = _mm256_loadu_si256((const __m256i *)((const
>> uint8_t *)src + 7 * 32));
>> -       src = (const uint8_t *)src + 256;
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32),
>> ymm0);
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32),
>> ymm1);
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32),
>> ymm2);
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32),
>> ymm3);
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32),
>> ymm4);
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32),
>> ymm5);
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32),
>> ymm6);
>> -       _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32),
>> ymm7);
>> -       dst = (uint8_t *)dst + 256;
>> +       src = src + 256;
>> +       dst = dst + 256;
>>         }
>>
>
> Did you perform a performance test on that part?
>
>
I ran "make test", which runs the "memcpy perf" test; the results were given in the cover letter. I am pasting them here again.
/**********************With changes*************************************/
Start memcpy_perf: Success [00m 00s]
Memcpy performance autotest: Success [09m 36s] [17m 45s]

/**********************Without changes**********************************/
Start memcpy_perf: Success [00m 00s]
Memcpy performance autotest: Success [09m 35s] [13m 57s]

--
> Pawel
>