This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch vectorized in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit 0a68fc3138057d536cae16e0c4052469cdd2c76e Author: Zeno Yang <cookie...@qq.com> AuthorDate: Mon Jan 17 16:54:40 2022 +0800 [Vectorized][Improvement] Speed up column filtering via SIMD (#7775) --- be/src/vec/columns/column_decimal.cpp | 25 +++++++++++++++++++++++++ be/src/vec/columns/column_vector.cpp | 28 ++++++++++------------------ be/src/vec/columns/columns_common.cpp | 20 +++++++++----------- be/src/vec/columns/columns_common.h | 30 ++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 29 deletions(-) diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp index 8e5ae12..5cc5853 100644 --- a/be/src/vec/columns/column_decimal.cpp +++ b/be/src/vec/columns/column_decimal.cpp @@ -162,6 +162,31 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter& filt, ssize_t result_s const UInt8* filt_end = filt_pos + size; const T* data_pos = data.data(); + /** A slightly more optimized version. + * Based on the assumption that often pieces of consecutive values + * completely pass or do not pass the filter. + * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. + */ + static constexpr size_t SIMD_BYTES = 32; + const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; + + while (filt_pos < filt_end_sse) { + uint32_t mask = bytes32_mask_to_bits32_mask(filt_pos); + + if (0xFFFFFFFF == mask) { + res_data.insert(data_pos, data_pos + SIMD_BYTES); + } else { + while (mask) { + const size_t idx = __builtin_ctzll(mask); + res_data.push_back(data_pos[idx]); + mask = mask & (mask - 1); + } + } + + filt_pos += SIMD_BYTES; + data_pos += SIMD_BYTES; + } + while (filt_pos < filt_end) { if (*filt_pos) res_data.push_back(*data_pos); diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index f6627d0..017ae29 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -26,6 +26,8 @@ #include <cmath> #include <cstring> +#include "runtime/datetime_value.h" +#include "vec/columns/columns_common.h" #include "vec/common/arena.h" #include "vec/common/bit_cast.h" #include "vec/common/exception.h" @@ -33,12 +35,6 @@ #include "vec/common/sip_hash.h" #include "vec/common/unaligned.h" -#include "runtime/datetime_value.h" - -#ifdef __SSE2__ -#include <emmintrin.h> -#endif - namespace doris::vectorized { template <typename T> @@ -237,34 +233,30 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter& filt, ssize_t result_si const UInt8* filt_end = filt_pos + size; const T* data_pos = data.data(); -#ifdef __SSE2__ /** A slightly more optimized version. * Based on the assumption that often pieces of consecutive values * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - - static constexpr size_t SIMD_BYTES = 16; - const __m128i zero16 = _mm_setzero_si128(); + static constexpr size_t SIMD_BYTES = 32; const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { - int mask = _mm_movemask_epi8(_mm_cmpgt_epi8( - _mm_loadu_si128(reinterpret_cast<const __m128i*>(filt_pos)), zero16)); + uint32_t mask = bytes32_mask_to_bits32_mask(filt_pos); - if (0 == mask) { - /// Nothing is inserted. - } else if (0xFFFF == mask) { + if (0xFFFFFFFF == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - for (size_t i = 0; i < SIMD_BYTES; ++i) - if (filt_pos[i]) res_data.push_back(data_pos[i]); + while (mask) { + const size_t idx = __builtin_ctzll(mask); + res_data.push_back(data_pos[idx]); + mask = mask & (mask - 1); + } } filt_pos += SIMD_BYTES; data_pos += SIMD_BYTES; } -#endif while (filt_pos < filt_end) { if (*filt_pos) res_data.push_back(*data_pos); diff --git a/be/src/vec/columns/columns_common.cpp b/be/src/vec/columns/columns_common.cpp index 02d650a..3045c5b 100644 --- a/be/src/vec/columns/columns_common.cpp +++ b/be/src/vec/columns/columns_common.cpp @@ -24,6 +24,7 @@ #include "vec/columns/column.h" #include "vec/columns/column_vector.h" +#include "vec/columns/columns_common.h" #include "vec/common/typeid_cast.h" namespace doris::vectorized { @@ -173,18 +174,13 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems, memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T)); }; -#ifdef __SSE2__ - const __m128i zero_vec = _mm_setzero_si128(); - static constexpr size_t SIMD_BYTES = 16; + static constexpr size_t SIMD_BYTES = 32; const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_aligned) { - const auto mask = _mm_movemask_epi8(_mm_cmpgt_epi8( - _mm_loadu_si128(reinterpret_cast<const __m128i*>(filt_pos)), zero_vec)); + auto mask = bytes32_mask_to_bits32_mask(filt_pos); - if (mask == 0) { - /// SIMD_BYTES consecutive rows do not pass the filter - } else if (mask == 0xffff) { + if (mask == 0xffffffff) { /// SIMD_BYTES consecutive rows pass the filter const auto first = offsets_pos == offsets_begin; @@ -199,14 +195,16 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems, res_elems.resize(elems_size_old + chunk_size); memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); } else { - for (size_t i = 0; i < SIMD_BYTES; ++i) - if (filt_pos[i]) copy_array(offsets_pos + i); + while (mask) { + const size_t bit_pos = __builtin_ctzll(mask); + copy_array(offsets_pos + bit_pos); + mask = mask & (mask - 1); + } } filt_pos += SIMD_BYTES; offsets_pos += SIMD_BYTES; } -#endif while (filt_pos < filt_end) { if (*filt_pos) copy_array(offsets_pos); diff --git a/be/src/vec/columns/columns_common.h b/be/src/vec/columns/columns_common.h index ef9c00c..83c38c4 100644 --- a/be/src/vec/columns/columns_common.h +++ b/be/src/vec/columns/columns_common.h @@ -22,10 +22,40 @@ #include "vec/columns/column.h" +#ifdef __AVX2__ +#include <immintrin.h> +#elif __SSE2__ +#include <emmintrin.h> +#endif + /// Common helper methods for implementation of different columns. namespace doris::vectorized { +/// Transform 32-byte mask to 32-bit mask +inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* filt_pos) { +#ifdef __AVX2__ + auto zero32 = _mm256_setzero_si256(); + uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpgt_epi8( + _mm256_loadu_si256(reinterpret_cast<const __m256i*>(filt_pos)), zero32))); +#elif __SSE2__ + auto zero16 = _mm_setzero_si128(); + uint32_t mask = + (static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8( + _mm_loadu_si128(reinterpret_cast<const __m128i*>(filt_pos)), zero16)))) | + ((static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8( + _mm_loadu_si128(reinterpret_cast<const __m128i*>(filt_pos + 16)), zero16))) + << 16) & + 0xffff0000); +#else + uint32_t mask = 0; + for (size_t i = 0; i < 32; ++i) { + mask |= static_cast<uint32_t>(1 == *(filt_pos + i)) << i; + } +#endif + return mask; +} + /// Counts how many bytes of `filt` are greater than zero. size_t count_bytes_in_filter(const IColumn::Filter& filt); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org