Thanks for your suggestion, this is the modified patch and two test files. -----Original Message----- From: Michael Paquier <mich...@paquier.xyz> Sent: Friday, October 20, 2023 4:19 PM To: Xiang Gao <xiang....@arm.com> Cc: pgsql-hackers@lists.postgresql.org Subject: Re: CRC32C Parallel Computation Optimization on ARM
On Fri, Oct 20, 2023 at 07:08:58AM +0000, Xiang Gao wrote: > This patch uses a parallel computing optimization algorithm to improve > crc32c computing performance on ARM. The algorithm comes from Intel > whitepaper: > crc-iscsi-polynomial-crc32-instruction-paper. Input data is divided > into three equal-sized blocks. Three parallel blocks (crc0, crc1, > crc2) for 1024 Bytes. One Block: 42(BLK_LENGTH) * 8(step length: > crc32c_u64) bytes > > Crc32c unit test: > https://gist.github.com/gaoxyt/138fd53ca1eead8102eeb9204067f7e4 > Crc32c benchmark: > https://gist.github.com/gaoxyt/4506c10fc06b3501445e32c4257113e9 > It gets ~2x speedup compared to linear Arm crc32c instructions. Interesting. Could you attach to this thread the test files you used and the results obtained please? If this data gets deleted from github, then it would not be possible to refer back to what you did at the related benchmark results. Note that your patch is forgetting about meson; it just patches ./configure. -- Michael IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
0002-crc32c-parallel-computation-optimization-on-arm.patch
Description: 0002-crc32c-parallel-computation-optimization-on-arm.patch
/********************************************************************* * compile postgres first with different crc32c implementation(use arm vmull_p64 or not) * we should comment out some codes about elog in pg_crc32c_armv8_choose.c to compile correctly and simply. * $ gcc -I ../postgres/_install/include -I ../postgres/_install/include/server main.c \ * -L ../postgres/build/src/port -l pgport_srv -O2 -o main * this test was run on Neoverse-N1 * $ ./main.no_vmull * data size is 512 bytes, and compute crc cost 139 us totally, 0.135742 us per loop * data size is 4096 bytes, and compute crc cost 1061 us totally, 1.036133 us per loop * $ ./main.use_vmull * data size is 512 bytes, and compute crc cost 101 us totally, 0.098633 us per loop * data size is 4096 bytes, and compute crc cost 540 us totally, 0.527344 us per loop * We can see that the cost of computing crc32c without vmull_p64 is about two times than * the cost that using vmull_p64 when data size is large. and the cost is almost same when * data size is small. 
*********************************************************************/ #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <sys/time.h> #include <memory.h> #include "c.h" #include "port/pg_crc32c.h" uint64_t GetTickCount() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec * 1000000 + tv.tv_usec; } int main() { #define CASE_CNT 2 uint32_t test_size[CASE_CNT] = {512, 1024 * 4}; for (int case_cnt = 0; case_cnt < CASE_CNT; case_cnt++) { uint8_t *buf = (uint8_t *) malloc(test_size[case_cnt] * sizeof(uint8_t)); srand(0); for (int i = 0; i < test_size[case_cnt]; i++) { *(buf + i) = (uint8_t) (rand() % 256u); } static const uint32_t kLoop = 1024; uint32_t crc = 0; uint64_t start = GetTickCount(); INIT_CRC32C(crc); for (int i = 0; i < kLoop; i++) { COMP_CRC32C(crc, buf, test_size[case_cnt]); } FIN_CRC32C(crc); uint64_t stop = GetTickCount(); printf("data size is %d bytes, and compute crc cost %ld us totally, %f us per loop\n", test_size[case_cnt], stop - start, (double) (stop - start) / kLoop); free(buf); } #undef CASE_CNT return 0; }
/******************************************************************************* * We use libcheck(https://github.com/libcheck/check) as unit testing framework. * compile postgres first with different crc32c implementation(use arm crc32c * and vmull intrisics or not). we should comment out some codes about elog in * pg_crc32c_armv8_choose.c to compile correctly and simply. * $ gcc -I ../postgres/_install/include -I ../postgres/_install/include/server \ crc32c_unittest.c -L ../postgres/build/src/port -l pgport_srv -L /usr/local/lib \ -lcheck -o crc32c_unittest * this test was run on Neoverse-N1 * $ ./crc32c_unittest * Running suite(s): CRC32C * 100%: Checks: 3, Failures: 0, Errors: 0 *******************************************************************************/ #include <stdlib.h> #include <check.h> #include "c.h" #include "port/pg_crc32c.h" START_TEST (test_crc32c_0) { int crc = 0; int data = 0; INIT_CRC32C(crc); COMP_CRC32C(crc, &data, sizeof(int)); FIN_CRC32C(crc); ck_assert_int_eq(crc, 0x48674bc7); } END_TEST START_TEST (test_crc32c_small_size) { int crc = 0; int size = 512; uint8_t *buf = (uint8_t*)malloc(size * sizeof(uint8_t)); memset(buf, 0, size * sizeof(uint8_t)); INIT_CRC32C(crc); COMP_CRC32C(crc, buf, size * sizeof(uint8_t)); FIN_CRC32C(crc); ck_assert_int_eq(crc, 0x30fcedc0); free(buf); } END_TEST START_TEST (test_crc32c_large_size) { int crc = 0; int size = 4096; uint8_t *buf = (uint8_t*)malloc(size * sizeof(uint8_t)); for (int i = 0; i < size; i++) { *(buf + i) |= 0xFF; } INIT_CRC32C(crc); COMP_CRC32C(crc, buf, size * sizeof(uint8_t)); FIN_CRC32C(crc); ck_assert_int_eq(crc, 0x25c1fe13); free(buf); } END_TEST Suite * crc32c_suite(void) { Suite *s; TCase *tc_core; s = suite_create("CRC32C"); /* Core test case */ tc_core = tcase_create("Core"); tcase_add_test(tc_core, test_crc32c_0); tcase_add_test(tc_core, test_crc32c_small_size); tcase_add_test(tc_core, test_crc32c_large_size); suite_add_tcase(s, tc_core); return s; } int main() { int number_failed; 
Suite *s; SRunner *sr; s = crc32c_suite(); sr = srunner_create(s); srunner_run_all(sr, CK_NORMAL); number_failed = srunner_ntests_failed(sr); srunner_free(sr); return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; }