I like Yibo's stack overflow theory given the "error reading variable", but I did confirm that I can cause a segmentation fault if std::atomic_store / std::atomic_load are unavailable. I simulated this by simply commenting out the specializations rather than actually running against GCC 4.9.2, so it may not be a perfect reproduction. I've attached a patch with my stress test (based on the latest master, commit c697a41ab9c11511113e7387fe4710df920c36ed). Running that stress test while `stress -c 16` is running on my server reproduces it pretty reliably.
Thread 1 (Thread 0x7f6ae05fc700 (LWP 2308757)):
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007f6ae352e859 in __GI_abort () at abort.c:79
#2  0x00007f6ae37fe892 in __gnu_cxx::__verbose_terminate_handler () at /home/conda/feedstock_root/build_artifacts/ctng-compilers_1601682258120/work/.build/x86_64-conda-linux-gnu/src/gcc/libstdc++-v3/libsupc++/vterminate.cc:95
#3  0x00007f6ae37fcf69 in __cxxabiv1::__terminate (handler=<optimized out>) at /home/conda/feedstock_root/build_artifacts/ctng-compilers_1601682258120/work/.build/x86_64-conda-linux-gnu/src/gcc/libstdc++-v3/libsupc++/eh_terminate.cc:48
#4  0x00007f6ae37fcfab in std::terminate () at /home/conda/feedstock_root/build_artifacts/ctng-compilers_1601682258120/work/.build/x86_64-conda-linux-gnu/src/gcc/libstdc++-v3/libsupc++/eh_terminate.cc:58
#5  0x00007f6ae37fd9d0 in __cxxabiv1::__cxa_pure_virtual () at /home/conda/feedstock_root/build_artifacts/ctng-compilers_1601682258120/work/.build/x86_64-conda-linux-gnu/src/gcc/libstdc++-v3/libsupc++/pure.cc:50
#6  0x000055a64bc4400a in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x7f6ad0001160) at /home/pace/anaconda3/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/9.3.0/bits/shared_ptr_base.h:155
#7  0x000055a64bc420f3 in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=0x7f6ae05fa568, __in_chrg=<optimized out>) at /home/pace/anaconda3/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/9.3.0/bits/shared_ptr_base.h:730
#8  0x000055a64bc3a4a2 in std::__shared_ptr<arrow::Array, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (this=0x7f6ae05fa560, __in_chrg=<optimized out>) at /home/pace/anaconda3/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/9.3.0/bits/shared_ptr_base.h:1169
#9  0x000055a64bc3a4be in std::shared_ptr<arrow::Array>::~shared_ptr (this=0x7f6ae05fa560, __in_chrg=<optimized out>) at /home/pace/anaconda3/envs/arrow-dev/x86_64-conda-linux-gnu/include/c++/9.3.0/bits/shared_ptr.h:103
#10 0x000055a64bc557ca in arrow::TestRecordBatch_BatchColumnBoxingStress_Test::<lambda()>::operator()(void) const (__closure=0x55a64d5f5218) at ../src/arrow/record_batch_test.cc:206

As a workaround to see if this is indeed your issue, you can call RecordBatch::column on each of the columns as soon as you create the RecordBatch (from one thread), which will force the boxed columns to materialize (see the sketch below the quoted thread).

-Weston

On Thu, May 20, 2021 at 11:40 AM Wes McKinney <wesmck...@gmail.com> wrote:
>
> Also, is it possible that the field is not an Int64Array?
>
> On Wed, May 19, 2021 at 10:19 PM Yibo Cai <yibo....@arm.com> wrote:
> >
> > On 5/20/21 4:15 AM, Rares Vernica wrote:
> > > Hello,
> > >
> > > I'm using Arrow for accessing data outside the SciDB database engine. It generally works fine but we are running into Segmentation Faults in a corner multi-threaded case. I identified two threads that work on the same Record Batch. I wonder if there is something internal about RecordBatch that might help solve the mystery.
> > >
> > > We are using Arrow 0.16.0. The backtrace of the triggering thread looks like this:
> > >
> > > Program received signal SIGSEGV, Segmentation fault.
> > > [Switching to Thread 0x7fdad5fb4700 (LWP 3748)]
> > > 0x00007fdaa805abe0 in ?? ()
> > > (gdb) thread
> > > [Current thread is 2 (Thread 0x7fdad5fb4700 (LWP 3748))]
> > > (gdb) bt
> > > #0  0x00007fdaa805abe0 in ?? ()
> > > #1  0x0000000000850212 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() ()
> > > #2  0x00007fdae4b1fbf1 in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=0x7fdad5fb1ae8, __in_chrg=<optimized out>) at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/shared_ptr_base.h:666
> > > #3  0x00007fdae4b39d74 in std::__shared_ptr<arrow::Array, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (this=0x7fdad5fb1ae0, __in_chrg=<optimized out>) at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/shared_ptr_base.h:914
> > > #4  0x00007fdae4b39da8 in std::shared_ptr<arrow::Array>::~shared_ptr (this=0x7fdad5fb1ae0, __in_chrg=<optimized out>) at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/shared_ptr.h:93
> > > #5  0x00007fdae4b6a8e1 in scidb::XChunkIterator::getCoord (this=0x7fdaa807f9f0, dim=1, index=1137) at XArray.cpp:358
> > > #6  0x00007fdae4b68ecb in scidb::XChunkIterator::XChunkIterator (this=0x7fdaa807f9f0, chunk=..., iterationMode=0, arrowBatch=<error reading variable: Cannot access memory at address 0xd5fb1b90>) at XArray.cpp:157
> >
> > FWIW, this "error reading variable" looks suspicious. Maybe the argument 'arrowBatch' is trashed accidentally (stack overflow)?
> > https://github.com/Paradigm4/bridge/blob/master/src/XArray.cpp#L132
> >
> > > ...
> > >
> > > The backtrace of the other thread working on exactly the same Record Batch looks like this:
> > >
> > > (gdb) thread
> > > [Current thread is 3 (Thread 0x7fdad61b5700 (LWP 3746))]
> > > (gdb) bt
> > > #0  0x00007fdae3bc1ec7 in arrow::SimpleRecordBatch::column(int) const () from /lib64/libarrow.so.16
> > > #1  0x00007fdae4b6a888 in scidb::XChunkIterator::getCoord (this=0x7fdab00c0bb0, dim=0, index=71) at XArray.cpp:357
> > > #2  0x00007fdae4b6a5a2 in scidb::XChunkIterator::operator++ (this=0x7fdab00c0bb0) at XArray.cpp:305
> > > ...
> > >
> > > In both cases, the last non-Arrow code is the getCoord function:
> > > https://github.com/Paradigm4/bridge/blob/master/src/XArray.cpp#L355
> > >
> > > int64_t XChunkIterator::getCoord(size_t dim, int64_t index)
> > > {
> > >     return std::static_pointer_cast<arrow::Int64Array>(
> > >         _arrowBatch->column(_nAtts + dim))->raw_values()[index];
> > > }
> > > ...
> > > std::shared_ptr<const arrow::RecordBatch> _arrowBatch;
> > >
> > > Do you see anything suspicious about this code? What would trigger the shared_ptr destruction which takes place in thread 2?
> > >
> > > Thank you!
> > > Rares
> > >
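To make the workaround above concrete, here is a minimal sketch; the helper name is mine, not an Arrow API. The idea is simply to call RecordBatch::column on every column from a single thread right after the batch is created, before any other thread touches it.

#include <memory>

#include <arrow/api.h>

// Hypothetical helper illustrating the workaround: touch every column from a
// single thread right after the RecordBatch is created, so the lazily boxed
// arrow::Array objects are materialized before other threads call
// RecordBatch::column() concurrently.
void MaterializeColumns(const std::shared_ptr<arrow::RecordBatch>& batch) {
  for (int i = 0; i < batch->num_columns(); ++i) {
    // The returned shared_ptr is discarded; the call only forces the batch
    // to cache the boxed column internally.
    (void)batch->column(i);
  }
}

Calling something like MaterializeColumns(batch) right after the batch is constructed, while still single-threaded, should populate the boxed columns so that later concurrent column() calls only read an already-cached shared_ptr.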
diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc
index 9de57f183..99322c7f1 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -22,6 +22,7 @@
 
 #include <cstdint>
 #include <memory>
+#include <thread>
 #include <vector>
 
 #include "arrow/array/array_base.h"
@@ -178,6 +179,46 @@ TEST_F(TestRecordBatch, AddColumn) {
   ASSERT_TRUE(new_batch2->schema()->field(1)->nullable());
 }
 
+TEST_F(TestRecordBatch, BatchColumnBoxingStress) {
+  constexpr int length = 10;
+  constexpr int NTRIALS = 100000;
+  constexpr int NTHREADS = 20;
+  constexpr int NUM_FIELDS = 50;
+
+  FieldVector fields;
+  ArrayDataVector arrays;
+  for (int i = 0; i < NUM_FIELDS; i++) {
+    fields.push_back(field("f" + std::to_string(i), int32()));
+    arrays.push_back(MakeRandomArray<Int32Array>(length)->data());
+  }
+
+  auto schema = ::arrow::schema(fields);
+
+  for (int trial_index = 0; trial_index < NTRIALS; trial_index++) {
+    auto batch = RecordBatch::Make(schema, length, arrays);
+    std::vector<std::thread> threads;
+    for (int thread_index = 0; thread_index < NTHREADS; thread_index++) {
+      bool forwards = thread_index % 2 == 0;
+      threads.push_back(std::thread([&batch, &length, forwards] {
+        if (forwards) {
+          for (int column_index = 0; column_index < batch->num_columns();
+               column_index++) {
+            ASSERT_EQ(length, batch->column(column_index)->length());
+          }
+        } else {
+          for (int column_index = batch->num_columns() - 1; column_index >= 0;
+               column_index--) {
+            ASSERT_EQ(length, batch->column(column_index)->length());
+          }
+        }
+      }));
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+  }
+}
+
 TEST_F(TestRecordBatch, SetColumn) {
   const int length = 10;
 
diff --git a/cpp/src/arrow/util/atomic_shared_ptr.h b/cpp/src/arrow/util/atomic_shared_ptr.h
index d93ad921d..9dc6a4cf7 100644
--- a/cpp/src/arrow/util/atomic_shared_ptr.h
+++ b/cpp/src/arrow/util/atomic_shared_ptr.h
@@ -33,10 +33,10 @@ namespace internal {
 template <typename T, typename = void>
 struct is_atomic_load_shared_ptr_available : std::false_type {};
 
-template <typename T>
-struct is_atomic_load_shared_ptr_available<
-    T, void_t<decltype(std::atomic_load(std::declval<const std::shared_ptr<T>*>()))>>
-    : std::true_type {};
+// template <typename T>
+// struct is_atomic_load_shared_ptr_available<
+//     T, void_t<decltype(std::atomic_load(std::declval<const std::shared_ptr<T>*>()))>>
+//     : std::true_type {};
 
 template <typename T>
 using enable_if_atomic_load_shared_ptr_available =
@@ -61,11 +61,11 @@ enable_if_atomic_load_shared_ptr_unavailable<std::shared_ptr<T>> atomic_load(
 template <typename T, typename = void>
 struct is_atomic_store_shared_ptr_available : std::false_type {};
 
-template <typename T>
-struct is_atomic_store_shared_ptr_available<
-    T, void_t<decltype(std::atomic_store(std::declval<std::shared_ptr<T>*>(),
-                                         std::declval<std::shared_ptr<T>>()))>>
-    : std::true_type {};
+// template <typename T>
+// struct is_atomic_store_shared_ptr_available<
+//     T, void_t<decltype(std::atomic_store(std::declval<std::shared_ptr<T>*>(),
+//                                          std::declval<std::shared_ptr<T>>()))>>
+//     : std::true_type {};
 
 template <typename T>
 using enable_if_atomic_store_shared_ptr_available =
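For context on why commenting out those specializations reproduces the crash: without std::atomic_load / std::atomic_store overloads for shared_ptr, the lazily boxed column cache ends up being read and written with plain shared_ptr operations. The sketch below is illustrative only; it is not Arrow's SimpleRecordBatch code, and the class and member names are invented, but it shows the racy pattern the stress test hammers.

#include <memory>
#include <vector>

// Illustrative only -- a simplified stand-in for lazy column boxing, NOT the
// real Arrow implementation.
class LazyColumnCache {
 public:
  explicit LazyColumnCache(int num_columns) : boxed_(num_columns) {}

  // Without std::atomic_load/std::atomic_store on shared_ptr, two threads can
  // run this concurrently: both see an empty slot, both create a value, and
  // both assign to boxed_[i] with a plain (non-atomic) shared_ptr store while
  // the other thread may still be copying it.
  std::shared_ptr<int> column(int i) {
    if (!boxed_[i]) {             // unsynchronized read
      boxed_[i] = MakeColumn(i);  // unsynchronized write
    }
    return boxed_[i];             // unsynchronized copy
  }

 private:
  static std::shared_ptr<int> MakeColumn(int i) { return std::make_shared<int>(i); }

  std::vector<std::shared_ptr<int>> boxed_;
};

Concurrent assignment and copy of the same shared_ptr without the atomic overloads is a data race on the control block, which is consistent with the _Sp_counted_base::_M_release frames in the backtraces above.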