Hi guys,
Since we are all quite curious about which is the best implementation
for the performance, I just did some test on my server.
There will be 3 implementations.
1. Clear all the array1 and array2 bits first, then set the bits we
needed.(The current implementation in the patch).
2. Set all the bits in array1 and array2 first, then clear the not
needed bits.
3. Set the needed bits in array1 and array2, and clear the left not need
bits.
(As we are allocate more memory as the alignment, clear not needed bits
should be done anyway.)
So it's call the 3 implementation Cs, Sc, sc:
Capital 'C' means clear all bits.
Lowercase 'c' means clear not needed bits.
Capital 'S' means set all bits.
Lowercase 's' means set needed bits.
I add some test code in the bitmap_test code, here is the cycle for
different bits with different implementations.
RTE>>bitmap_test
Set bits:63
Cs Sc sc
1018 1089 1078
Set bits:126
Cs Sc sc
972 1082 1048
Set bits:252
Cs Sc sc
918 1039 1029
Set bits:504
Cs Sc sc
861 986 957
Set bits:1008
Cs Sc sc
802 882 851
Set bits:2016
Cs Sc sc
618 646 625
Set bits:4032
Cs Sc sc
272 215 209
Set bits:8064
Cs Sc sc
537 392 391
Set bits:16128
Cs Sc sc
1083 786 798
As we can see, after 4K bits, the Cs case comes disadvantage, before 4K
bits, it works much better.
And since the cycles before 4K bits does not show more significant
differences, it should be OK to use the Sc or sc cases.
Maybe better to choose the sc code.
===================================
Testing code as below:
static void
test_tsc(uint32_t n_bits)
{
void *mem;
uint32_t i;
uint64_t start, cost, cost2, cost3;
uint32_t bmp_size;
struct rte_bitmap *bmp;
bmp_size =
rte_bitmap_get_memory_footprint(n_bits);
mem = rte_zmalloc("test_bmap", bmp_size, RTE_CACHE_LINE_SIZE);
if (mem == NULL) {
printf("Failed to allocate memory for bitmap\n");
return;
}
/* Make the memory hot.*/
bmp = rte_bitmap_init_with_all_set(n_bits, mem, bmp_size);
if (bmp == NULL) {
printf("Failed to init bitmap\n");
return;
}
/* Clear all bits first, set needed. */
start = rte_rdtsc();
for (i = 0; i < 1000; i++)
rte_bitmap_init_with_all_set(n_bits, mem, bmp_size);
cost = (rte_rdtsc() - start) / 1000;
/* Set all bits first, clear not needed. */
start = rte_rdtsc();
for (i = 0; i < 1000; i++)
rte_bitmap_init_with_all_set2(n_bits, mem, bmp_size);
cost2 = (rte_rdtsc() - start) / 1000;
/* Set needed bits, clear left. */
start = rte_rdtsc();
for (i = 0; i < 1000; i++)
rte_bitmap_init_with_all_set3(n_bits, mem, bmp_size);
cost3 = (rte_rdtsc() - start) / 1000;
printf("Set bits:%d\nCs Sc sc\n", n_bits);
printf("%-4ld %-4ld %-4ld\n\n", cost, cost2, cost3);
rte_free(mem);
}
uint32_t i;
for (i = 63; i < (63 << 9); i<<=1)
test_tsc(i);
===================================
Sc code as below:
static inline struct rte_bitmap *
rte_bitmap_init_with_all_set2(uint32_t n_bits, uint8_t *mem, uint32_t
mem_size)
{
uint32_t i;
uint32_t slabs;
struct rte_bitmap *bmp;
bmp = __rte_bitmap_init(n_bits, mem, mem_size);
if (!bmp)
return NULL;
memset(bmp->array1, 0xff, bmp->array1_size * sizeof(uint64_t));
memset(bmp->array2, 0xff, bmp->array2_size * sizeof(uint64_t));
/* Fill the arry1 slab aligned bits. */
slabs = n_bits >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 +
RTE_BITMAP_CL_BIT_SIZE_LOG2);
/* Clear the array1 left slabs. */
memset(&bmp->array1[slabs], 0, (bmp->array1_size - slabs) *
sizeof(bmp->array1[0]));
/* Fill the array1 middle not full set slab. */
i = slabs << (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 +
RTE_BITMAP_CL_BIT_SIZE_LOG2);
for (;i < n_bits; i += RTE_BITMAP_CL_BIT_SIZE)
rte_bitmap_set(bmp, i);
/* Clear the array2 left slabs. */
slabs = n_bits >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2;
memset(&bmp->array2[slabs], 0, (bmp->array2_size - slabs) *
sizeof(bmp->array2[0]));
/* Fill the array2 middle not full set slab. */
for (i = slabs * RTE_BITMAP_SLAB_BIT_SIZE; i < n_bits; i++)
rte_bitmap_set(bmp, i);
return bmp;
}
===================================
sc code as below:
static inline struct rte_bitmap *
rte_bitmap_init_with_all_set3(uint32_t n_bits, uint8_t *mem, uint32_t
mem_size)
{
uint32_t i;
uint32_t slabs;
struct rte_bitmap *bmp;
bmp = __rte_bitmap_init(n_bits, mem, mem_size);
if (!bmp)
return NULL;
/* Fill the arry1 slab aligned bits. */
slabs = n_bits >> (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 +
RTE_BITMAP_CL_BIT_SIZE_LOG2);
memset(bmp->array1, 0xff, slabs * sizeof(bmp->array1[0]));
/* Clear the array1 left slabs. */
memset(&bmp->array1[slabs], 0, (bmp->array1_size - slabs) *
sizeof(bmp->array1[0]));
/* Fill the array1 middle not full set slab. */
i = slabs << (RTE_BITMAP_SLAB_BIT_SIZE_LOG2 +
RTE_BITMAP_CL_BIT_SIZE_LOG2);
for (;i < n_bits; i += RTE_BITMAP_CL_BIT_SIZE)
rte_bitmap_set(bmp, i);
/* Fill the arry2 slab aligned bits. */
slabs = n_bits >> RTE_BITMAP_SLAB_BIT_SIZE_LOG2;
memset(bmp->array2, 0xff, slabs * sizeof(bmp->array2[0]));
/* Clear the array2 left slabs. */
memset(&bmp->array2[slabs], 0, (bmp->array2_size - slabs) *
sizeof(bmp->array2[0]));
/* Fill the array2 middle not full set slab. */
for (i = slabs * RTE_BITMAP_SLAB_BIT_SIZE; i < n_bits; i++)
rte_bitmap_set(bmp, i);
return bmp;
}
Any comments or suggestions?
Thanks,
SuanmingMou