Alexey Serbin created KUDU-3620:
-----------------------------------

             Summary: Race condition in OpDriver::ReplicationFinished()
                 Key: KUDU-3620
                 URL: https://issues.apache.org/jira/browse/KUDU-3620
             Project: Kudu
          Issue Type: Bug
          Components: master, tserver
            Reporter: Alexey Serbin


There is a race condition in {{OpDriver::ReplicationFinished}} that, with 
[1b99da532f52d143c46440c3903785d642fb45a3], manifests itself in the following 
ways when running ts_recovery-itest:
# A tablet server crashing with SIGSEGV
# AddressSanitizer reporting heap-use-after-free errors

The stack trace for item 1:
{noformat}
*** Aborted at 1727269462 (unix time) try "date -d @1727269462" if you are 
using GNU date ***
PC: @                0x0 (unknown)
*** SIGSEGV (@0x30) received by PID 14694 (TID 0x7f734f91b700) from PID 48; 
stack trace: ***
    @     0x7f73830a5980 (unknown) at ??:0
    @     0x7f73848b3db6 kudu::tablet::OpState::tablet_replica() at ??:0
    @     0x7f73848d55c3 kudu::tablet::OpDriver::ReplicationFinished() at ??:0
    @     0x7f73848aa27e 
_ZZN4kudu6tablet13TabletReplica15StartFollowerOpERK13scoped_refptrINS_9consensus14ConsensusRoundEEENKUlRKNS_6StatusEE_clESA_
 at ??:0
    @     0x7f73848b0f41 
_ZNSt17_Function_handlerIFvRKN4kudu6StatusEEZNS0_6tablet13TabletReplica15StartFollowerOpERK13scoped_refptrINS0_9consensus14ConsensusRoundEEEUlS3_E_E9_M_invokeERKSt9_Any_dataS3_
 at ??:0
    @     0x7f7386351325 std::function<>::operator()() at ??:0
    @     0x7f7384407f2b 
kudu::consensus::ConsensusRound::NotifyReplicationFinished() at ??:0
    @     0x7f73843d774b 
kudu::consensus::PendingRounds::AdvanceCommittedIndex() at ??:0
    @     0x7f73843f6888 kudu::consensus::RaftConsensus::UpdateReplica() at ??:0
    @     0x7f73843f1ef5 kudu::consensus::RaftConsensus::Update() at ??:0
    @     0x7f7385467de7 kudu::tserver::ConsensusServiceImpl::UpdateConsensus() 
at ??:0
    @     0x7f7383c95fd2 
_ZZN4kudu9consensus18ConsensusServiceIfC4ERK13scoped_refptrINS_12MetricEntityEERKS2_INS_3rpc13ResultTrackerEEENKUlPKN6google8protobuf7MessageEPSE_PNS7_10RpcContextEE0_clESG_SH_SJ_
 at ??:0
    @     0x7f7383c9a063 
_ZNSt17_Function_handlerIFvPKN6google8protobuf7MessageEPS2_PN4kudu3rpc10RpcContextEEZNS6_9consensus18ConsensusServiceIfC4ERK13scoped_refptrINS6_12MetricEntityEERKSD_INS7_13ResultTrackerEEEUlS4_S5_S9_E0_E9_M_invokeERKSt9_Any_dataOS4_OS5_OS9_
 at ??:0
    @     0x7f73834af4b8 std::function<>::operator()() at ??:0
    @     0x7f73834aed6c kudu::rpc::GeneratedServiceIf::Handle() at ??:0
    @     0x7f73834b1a7d kudu::rpc::ServicePool::RunThread() at ??:0
    @     0x7f73834b03c7 _ZZN4kudu3rpc11ServicePool4InitEiENKUlvE_clEv at ??:0
    @     0x7f73834b1e06 
_ZNSt17_Function_handlerIFvvEZN4kudu3rpc11ServicePool4InitEiEUlvE_E9_M_invokeERKSt9_Any_data
 at ??:0
    @     0x55ab245f526e std::function<>::operator()() at ??:0
    @     0x7f7382853bb1 kudu::Thread::SuperviseThread() at ??:0
    @     0x7f738309a6db start_thread at ??:0
    @     0x7f73805ae71f clone at ??:0
{noformat}

A sample of output for item 2:
{noformat}
==26864==ERROR: AddressSanitizer: heap-use-after-free on address 0x617000212830 
at pc 0x7fd36dc2c636 bp 0x7fd32f986530 sp 0x7fd32f986528
READ of size 8 at 0x617000212830 thread T84 (rpc worker-2694)
    #0 0x7fd36dc2c635 in kudu::tablet::OpState::tablet_replica() const 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/op.h:189:12
    #1 0x7fd36dc70732 in 
kudu::tablet::OpDriver::ReplicationFinished(kudu::Status const&) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/op_driver.cc:443:37
    #2 0x7fd36dc20493 in 
kudu::tablet::TabletReplica::StartFollowerOp(scoped_refptr<kudu::consensus::ConsensusRound>
 const&)::$_7::operator()(kudu::Status const&) const 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/tablet_replica.cc:857:51
    #3 0x7fd36dc202fc in std::_Function_handler<void (kudu::Status const&), 
kudu::tablet::TabletReplica::StartFollowerOp(scoped_refptr<kudu::consensus::ConsensusRound>
 const&)::$_7>::_M_invoke(std::_Any_data const&, kudu::Status const&) 
../../../include/c++/7.5.0/bits/std_function.h:316:2
    #4 0x7fd37460bd0d in std::function<void (kudu::Status 
const&)>::operator()(kudu::Status const&) const 
../../../include/c++/7.5.0/bits/std_function.h:706:14
    #5 0x7fd36c940afc in 
kudu::consensus::ConsensusRound::NotifyReplicationFinished(kudu::Status const&) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/consensus/raft_consensus.cc:3311:3
    #6 0x7fd36c8cdbbc in 
kudu::consensus::PendingRounds::AdvanceCommittedIndex(long) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/consensus/pending_rounds.cc:185:12
    #7 0x7fd36c916f16 in 
kudu::consensus::RaftConsensus::UpdateReplica(kudu::consensus::ConsensusRequestPB
 const*, kudu::consensus::ConsensusResponsePB*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/consensus/raft_consensus.cc:1530:5
    #8 0x7fd36c914e57 in 
kudu::consensus::RaftConsensus::Update(kudu::consensus::ConsensusRequestPB 
const*, kudu::consensus::ConsensusResponsePB*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/consensus/raft_consensus.cc:1097:14
    #9 0x7fd3705ec7ad in 
kudu::tserver::ConsensusServiceImpl::UpdateConsensus(kudu::consensus::ConsensusRequestPB
 const*, kudu::consensus::ConsensusResponsePB*, kudu::rpc::RpcContext*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tserver/tablet_service.cc:1764:25
    #10 0x7fd36ace9b56 in 
kudu::consensus::ConsensusServiceIf::ConsensusServiceIf(scoped_refptr<kudu::MetricEntity>
 const&, scoped_refptr<kudu::rpc::ResultTracker> 
const&)::$_1::operator()(google::protobuf::Message const*, 
google::protobuf::Message*, kudu::rpc::RpcContext*) const 
/home/jenkins-slave/workspace/build_and_test_flaky@2/build/asan/src/kudu/consensus/consensus.service.cc:299:13
    #11 0x7fd36ace9885 in std::_Function_handler<void 
(google::protobuf::Message const*, google::protobuf::Message*, 
kudu::rpc::RpcContext*), 
kudu::consensus::ConsensusServiceIf::ConsensusServiceIf(scoped_refptr<kudu::MetricEntity>
 const&, scoped_refptr<kudu::rpc::ResultTracker> 
const&)::$_1>::_M_invoke(std::_Any_data const&, google::protobuf::Message 
const*&&, google::protobuf::Message*&&, kudu::rpc::RpcContext*&&) 
../../../include/c++/7.5.0/bits/std_function.h:316:2
    #12 0x7fd367dc924e in std::function<void (google::protobuf::Message const*, 
google::protobuf::Message*, 
kudu::rpc::RpcContext*)>::operator()(google::protobuf::Message const*, 
google::protobuf::Message*, kudu::rpc::RpcContext*) const 
../../../include/c++/7.5.0/bits/std_function.h:706:14
    #13 0x7fd367dc812e in 
kudu::rpc::GeneratedServiceIf::Handle(kudu::rpc::InboundCall*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/rpc/service_if.cc:137:3
    #14 0x7fd367dce365 in kudu::rpc::ServicePool::RunThread() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/rpc/service_pool.cc:229:15
    #15 0x7fd367dcec8f in kudu::rpc::ServicePool::Init(int)::$_0::operator()() 
const 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/rpc/service_pool.cc:92:5
    #16 0x7fd367dceab8 in std::_Function_handler<void (), 
kudu::rpc::ServicePool::Init(int)::$_0>::_M_invoke(std::_Any_data const&) 
../../../include/c++/7.5.0/bits/std_function.h:316:2
    #17 0xa86d2c in std::function<void ()>::operator()() const 
../../../include/c++/7.5.0/bits/std_function.h:706:14
    #18 0x7fd36108db5d in kudu::Thread::SuperviseThread(void*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/util/thread.cc:693:3
    #19 0x7fd36446b6da in start_thread 
(/lib/x86_64-linux-gnu/libpthread.so.0+0x76da)
    #20 0x7fd35d1fa71e in clone (/lib/x86_64-linux-gnu/libc.so.6+0x12171e)

0x617000212830 is located 48 bytes inside of 688-byte region 
[0x617000212800,0x617000212ab0)
freed by thread T140 (apply [worker]-) here:
    #0 0x9557b0 in operator delete(void*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/thirdparty/src/llvm-11.0.0.src/projects/compiler-rt/lib/asan/asan_new_delete.cpp:160
    #1 0x7fd36dca4f0a in kudu::tablet::WriteOpState::~WriteOpState() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/write_op.cc:665:31
    #2 0x7fd37472bf41 in 
std::default_delete<kudu::tablet::WriteOpState>::operator()(kudu::tablet::WriteOpState*)
 const ../../../include/c++/7.5.0/bits/unique_ptr.h:78:2
    #3 0x7fd37471974b in std::unique_ptr<kudu::tablet::WriteOpState, 
std::default_delete<kudu::tablet::WriteOpState> >::~unique_ptr() 
../../../include/c++/7.5.0/bits/unique_ptr.h:263:4
    #4 0x7fd36dca9c64 in kudu::tablet::WriteOp::~WriteOp() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/write_op.h:345:7
    #5 0x7fd36dca9ca2 in kudu::tablet::WriteOp::~WriteOp() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/write_op.h:345:7
    #6 0x7fd36dc348d1 in 
std::default_delete<kudu::tablet::Op>::operator()(kudu::tablet::Op*) const 
../../../include/c++/7.5.0/bits/unique_ptr.h:78:2
    #7 0x7fd36dc2700b in std::unique_ptr<kudu::tablet::Op, 
std::default_delete<kudu::tablet::Op> >::~unique_ptr() 
../../../include/c++/7.5.0/bits/unique_ptr.h:263:4
    #8 0x7fd36dc44252 in kudu::tablet::OpDriver::~OpDriver() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/op_driver.h:304:16
    #9 0x7fd36dc4421a in kudu::RefCountedThreadSafe<kudu::tablet::OpDriver, 
kudu::DefaultRefCountedThreadSafeTraits<kudu::tablet::OpDriver> 
>::DeleteInternal(kudu::tablet::OpDriver const*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/gutil/ref_counted.h:153:44
    #10 0x7fd36dc441f0 in 
kudu::DefaultRefCountedThreadSafeTraits<kudu::tablet::OpDriver>::Destruct(kudu::tablet::OpDriver
 const*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/gutil/ref_counted.h:116:5
    #11 0x7fd36dc441be in kudu::RefCountedThreadSafe<kudu::tablet::OpDriver, 
kudu::DefaultRefCountedThreadSafeTraits<kudu::tablet::OpDriver> >::Release() 
const 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/gutil/ref_counted.h:144:7
    #12 0x7fd36dc270e7 in 
scoped_refptr<kudu::tablet::OpDriver>::~scoped_refptr() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/gutil/ref_counted.h:266:13
    #13 0x7fd36dc71f53 in kudu::tablet::OpDriver::ApplyTask() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/op_driver.cc:563:1
    #14 0x7fd36dc74ccb in 
kudu::tablet::OpDriver::ApplyAsync()::$_2::operator()() const 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/tablet/ops/op_driver.cc:504:47
    #15 0x7fd36dc74b48 in std::_Function_handler<void (), 
kudu::tablet::OpDriver::ApplyAsync()::$_2>::_M_invoke(std::_Any_data const&) 
../../../include/c++/7.5.0/bits/std_function.h:316:2
    #16 0xa86d2c in std::function<void ()>::operator()() const 
../../../include/c++/7.5.0/bits/std_function.h:706:14
    #17 0x7fd3610af604 in kudu::ThreadPool::DispatchThread() 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/util/threadpool.cc:776:7
    #18 0x7fd3610b2c2b in kudu::ThreadPool::CreateThread()::$_2::operator()() 
const 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/util/threadpool.cc:849:48
    #19 0x7fd3610b2aa8 in std::_Function_handler<void (), 
kudu::ThreadPool::CreateThread()::$_2>::_M_invoke(std::_Any_data const&) 
../../../include/c++/7.5.0/bits/std_function.h:316:2
    #20 0xa86d2c in std::function<void ()>::operator()() const 
../../../include/c++/7.5.0/bits/std_function.h:706:14
    #21 0x7fd36108db5d in kudu::Thread::SuperviseThread(void*) 
/home/jenkins-slave/workspace/build_and_test_flaky@2/src/kudu/util/thread.cc:693:3
    #22 0x7fd36446b6da in start_thread 
(/lib/x86_64-linux-gnu/libpthread.so.0+0x76da)
{noformat}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to