On Tue, 10 Jun 2025 12:49:55 GMT, Johan Sjölen <jsjo...@openjdk.org> wrote:
>> @coleenp Can't find the comment to reply... I've replaced all >> `_r.next_uint()` with just `next_uint()`, it's a bikeshed argument. > > Hi @rvansa , > > What about this type of API for dealing with the compressed table? We do the > 8 byte accesses as unsigned chars (important so that they're 0-extended and > not sign extended) and write the compressed KV down in little-endian. We use > bitwise OR to not squash whatever was there before. Comparing GCC x64 and PPC > (power), it looks good. > > ```c++ > #include <cstdint> > #include <cstdio> > #include <cstdlib> > #include <cstring> > > > // For this hard-coded example we use 2 byte keys > // and 3 byte values -- for an element_bytes of 5 > uint32_t key_mask = (1 << 16) - 1; > uint32_t value_mask = (1 << 24) - 1; > uint32_t value_shift = 16; > > uint32_t element_bytes = 5; > > uint64_t* kv_array; > > uint32_t unpack_key(uint64_t kv) { > return kv & key_mask; > } > uint32_t unpack_value(uint64_t kv) { return (kv >> value_shift) & value_mask; > } > > uint64_t pack_kv(uint64_t k, uint64_t v) { > return k | (v << value_shift); > } > > > uint32_t align_down(uint32_t x, uint32_t align) { return x & -align; } > uint32_t align_up(uint32_t x, uint32_t align) { > return (x + (align - 1)) & -align; > } > > uint64_t u64s_required(int n) { > uint32_t bytes_required = element_bytes * n; > return align_up(bytes_required, 8) / 8; > } > > uint64_t read_u8(uint8_t* p) { > uint64_t result = 0; > result |= ((uint64_t)*(p + 0)) << (0 * 8); > result |= ((uint64_t)*(p + 1)) << (1 * 8); > result |= ((uint64_t)*(p + 2)) << (2 * 8); > result |= ((uint64_t)*(p + 3)) << (3 * 8); > result |= ((uint64_t)*(p + 4)) << (4 * 8); > result |= ((uint64_t)*(p + 5)) << (5 * 8); > result |= ((uint64_t)*(p + 6)) << (6 * 8); > result |= ((uint64_t)*(p + 7)) << (7 * 8); > return result; > } > > void write_u8(uint8_t* p, uint64_t u8) { > p[0] |= u8 & 0xFF; > p[1] |= (u8 >> 8*1) & 0xFF; > p[2] |= (u8 >> 8*2) & 0xFF; > p[3] |= (u8 >> 8*3) & 0xFF; > p[4] |= (u8 >> 8*4) & 0xFF; > p[5] |= (u8 >> 8*5) & 0xFF; > p[6] |= (u8 >> 8*6) & 0xFF; > p[7] |= (u8 >> 8*7) & 0xFF; > } > > uint64_t read(int n) { > uint32_t byte_index = element_bytes * n; > return read_u8(&reinterpret_cast<uint8_t*>(kv_array)[byte_index]); > } > > void fill(int n, uint64_t kv) { > uint32_t byte_index = element_bytes * n; > write_u8(&reinterpret_cast<uint8_t*>(kv_array)[byte_index], kv); > return; > } > > int main() { > int num_elts = 65536; > uint64_t sz = u64s_required(num_elts); > kv_array = (uint64_t*)malloc(sz * 8); > for (int i = 0; i < sz; i++) { > kv_array[i] = 0; > } > > for (int i = 0; i < num_elts; i++) { > uint64_t kv = pack_kv(i, 2 * i); > fill(i, kv); > } > for (int i = 0; i < num_... Shouldn't the Copy package or memcpy do all of this? @jdksjolen I don't think this should be in the packedTable code. ------------- PR Comment: https://git.openjdk.org/jdk/pull/24847#issuecomment-2959112151