This is the same optimization as was recently applied to std::shuffle.
It reduces the runtime of the following program by 20% on my machine: int main() { std::vector<uint64_t> a(10000), b(1000); std::mt19937 gen; uint64_t c = 0; for (int i = 0; i != 1000; ++i) { std::sample(a.begin(), a.end(), b.begin(), b.size(), gen); c += uint64_t(b[32]); } std::cout << c; }
Index: bits/stl_algo.h =================================================================== --- bits/stl_algo.h (revision 241299) +++ bits/stl_algo.h (working copy) @@ -3741,6 +3741,37 @@ #ifdef _GLIBCXX_USE_C99_STDINT_TR1 /** + * @brief Generate two uniformly distributed integers using a + * single distribution invocation. + * @param __b0 The upper bound for the first integer. + * @param __b1 The upper bound for the second integer. + * @param __g A UniformRandomBitGenerator. + * @return A pair (i, j) with i and j uniformly distributed + * over [0, __b0) and [0, __b1), respectively. + * + * Requires: __b0 * __b1 <= __g.max() - __g.min(). + * + * Using uniform_int_distribution with a range that is very + * small relative to the range of the generator ends up wasting + * potentially expensively generated randomness, since + * uniform_int_distribution does not store leftover randomness + * between invocations. + * + * If we know we want two integers in ranges that are sufficiently + * small, we can compose the ranges, use a single distribution + * invocation, and significantly reduce the waste. + */ + template<typename _IntType, + typename _UniformRandomBitGenerator> + std::pair<_IntType, _IntType> + __gen_two_uniform_ints(_IntType __b0, _IntType __b1, + _UniformRandomBitGenerator&& __g) + { + _IntType __x = uniform_int_distribution<_IntType>{0, (__b0 * __b1) - 1}(__g); + return std::make_pair(__x / __b1, __x % __b1); + } + + /** * @brief Shuffle the elements of a sequence using a uniform random * number generator. * @ingroup mutating_algorithms @@ -3801,13 +3832,12 @@ while (__i != __last) { const __uc_type __swap_range = __uc_type(__i - __first) + 1; - const __uc_type __comp_range = __swap_range * (__swap_range + 1); - std::uniform_int_distribution<__uc_type> __d{0, __comp_range - 1}; - const __uc_type __pospos = __d(__g); + const std::pair<__uc_type, __uc_type> __pospos = + __gen_two_uniform_ints(__swap_range, __swap_range + 1); - std::iter_swap(__i++, __first + (__pospos % __swap_range)); - std::iter_swap(__i++, __first + (__pospos / __swap_range)); + std::iter_swap(__i++, __first + __pospos.first); + std::iter_swap(__i++, __first + __pospos.second); } return; @@ -5695,9 +5725,51 @@ { using __distrib_type = uniform_int_distribution<_Size>; using __param_type = typename __distrib_type::param_type; + using _USize = typename std::make_unsigned<_Size>::type; + using _Gen = typename std::remove_reference<_UniformRandomBitGenerator>::type; + using __uc_type = typename std::common_type<typename _Gen::result_type, _USize>::type; + __distrib_type __d{}; _Size __unsampled_sz = std::distance(__first, __last); - for (__n = std::min(__n, __unsampled_sz); __n != 0; ++__first) + __n = std::min(__n, __unsampled_sz); + + // If possible, we use __gen_two_uniform_ints to efficiently produce + // two random numbers using a single distribution invocation: + + const __uc_type __urngrange = __g.max() - __g.min(); + if (__urngrange / __uc_type(__unsampled_sz) >= __uc_type(__unsampled_sz)) + // I.e. (__urngrange >= __unsampled_sz * __unsampled_sz) but without wrap issues. + { + while (__n != 0 && __unsampled_sz >= 2) + { + const std::pair<_Size, _Size> p = + __gen_two_uniform_ints(__unsampled_sz, __unsampled_sz - 1, __g); + + --__unsampled_sz; + if (p.first < __n) + { + *__out++ = *__first; + --__n; + } + + ++__first; + + if (__n == 0) break; + + --__unsampled_sz; + if (p.second < __n) + { + *__out++ = *__first; + --__n; + } + + ++__first; + } + } + + // The loop above is otherwise equivalent to this one-at-a-time version: + + for (; __n != 0; ++__first) if (__d(__g, __param_type{0, --__unsampled_sz}) < __n) { *__out++ = *__first;