This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 2abbc7898a3 [Opt](cloud) Add inject points for cloud mow (#48190)
2abbc7898a3 is described below
commit 2abbc7898a3d2f06b89ba783afd875239a6558bd
Author: bobhan1 <[email protected]>
AuthorDate: Tue Feb 25 22:07:38 2025 +0800
[Opt](cloud) Add inject points for cloud mow (#48190)
---
be/src/cloud/cloud_meta_mgr.cpp | 11 ++++
be/src/cloud/cloud_schema_change_job.cpp | 16 +++++
.../cloud/test_cloud_mow_correctness_inject.out | Bin 185 -> 368 bytes
.../cloud/test_cloud_mow_correctness_inject.groovy | 71 ++++++++++++++++++++-
4 files changed, 95 insertions(+), 3 deletions(-)
diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp
index afac47e3645..41e60b5e264 100644
--- a/be/src/cloud/cloud_meta_mgr.cpp
+++ b/be/src/cloud/cloud_meta_mgr.cpp
@@ -1183,6 +1183,17 @@ Status
CloudMetaMgr::cloud_update_delete_bitmap_without_lock(const CloudTablet&
Status CloudMetaMgr::get_delete_bitmap_update_lock(const CloudTablet& tablet,
int64_t lock_id,
int64_t initiator) {
+ DBUG_EXECUTE_IF("get_delete_bitmap_update_lock.inject_fail", {
+ auto p = dp->param("percent", 0.01);
+ std::mt19937 gen {std::random_device {}()};
+ std::bernoulli_distribution inject_fault {p};
+ if (inject_fault(gen)) {
+ return Status::Error<ErrorCode::DELETE_BITMAP_LOCK_ERROR>(
+ "injection error when get get_delete_bitmap_update_lock, "
+ "tablet_id={}, lock_id={}, initiator={}",
+ tablet.tablet_id(), lock_id, initiator);
+ }
+ });
VLOG_DEBUG << "get_delete_bitmap_update_lock , tablet_id: " <<
tablet.tablet_id()
<< ",lock_id:" << lock_id;
GetDeleteBitmapUpdateLockRequest req;
diff --git a/be/src/cloud/cloud_schema_change_job.cpp
b/be/src/cloud/cloud_schema_change_job.cpp
index d12bcdaa01e..7c584d999bf 100644
--- a/be/src/cloud/cloud_schema_change_job.cpp
+++ b/be/src/cloud/cloud_schema_change_job.cpp
@@ -21,6 +21,7 @@
#include <chrono>
#include <memory>
+#include <random>
#include <thread>
#include "cloud/cloud_meta_mgr.h"
@@ -463,6 +464,9 @@ Status CloudSchemaChangeJob::_process_delete_bitmap(int64_t
alter_version,
}
}
+
DBUG_EXECUTE_IF("CloudSchemaChangeJob::_process_delete_bitmap.before_new_inc.block",
+ DBUG_BLOCK);
+
// step 2, process incremental rowset with delete bitmap update lock
RETURN_IF_ERROR(_cloud_storage_engine.meta_mgr().get_delete_bitmap_update_lock(
*_new_tablet, SCHEMA_CHANGE_DELETE_BITMAP_LOCK_ID, initiator));
@@ -484,6 +488,18 @@ Status
CloudSchemaChangeJob::_process_delete_bitmap(int64_t alter_version,
}
}
+
DBUG_EXECUTE_IF("CloudSchemaChangeJob::_process_delete_bitmap.inject_sleep", {
+ auto p = dp->param("percent", 0.01);
+ auto sleep_time = dp->param("sleep", 100);
+ std::mt19937 gen {std::random_device {}()};
+ std::bernoulli_distribution inject_fault {p};
+ if (inject_fault(gen)) {
+ LOG_INFO("injection sleep for {} seconds, tablet_id={}, sc
job_id={}", sleep_time,
+ _new_tablet->tablet_id(), _job_id);
+ std::this_thread::sleep_for(std::chrono::seconds(sleep_time));
+ }
+ });
+
auto& delete_bitmap = tmp_tablet->tablet_meta()->delete_bitmap();
// step4, store delete bitmap
diff --git
a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out
b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out
index 79839efff32..57619853130 100644
Binary files
a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out
and
b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out
differ
diff --git
a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy
b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy
index 3c6ce3e8294..fa447e131d9 100644
---
a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy
+++
b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy
@@ -39,8 +39,7 @@ suite("test_cloud_mow_correctness_inject", "nonConcurrent") {
PROPERTIES (
"enable_mow_light_delete" = "false",
"enable_unique_key_merge_on_write" = "true",
- "disable_auto_compaction" = "true",
- "replication_num" = "1"); """
+ "disable_auto_compaction" = "true"); """
sql "insert into ${table1} values(1,1,1);"
sql "insert into ${table1} values(2,2,2);"
@@ -48,10 +47,22 @@ suite("test_cloud_mow_correctness_inject", "nonConcurrent")
{
sql "sync;"
qt_sql "select * from ${table1} order by k1;"
+ def waitForSC = {
+ Awaitility.await().atMost(30, TimeUnit.SECONDS).pollDelay(100,
TimeUnit.MILLISECONDS).pollInterval(1000, TimeUnit.MILLISECONDS).until(() -> {
+ def res = sql_return_maparray "SHOW ALTER TABLE COLUMN WHERE
TableName='${table1}' ORDER BY createtime DESC LIMIT 1"
+ assert res.size() == 1
+ if (res[0].State == "FINISHED" || res[0].State == "CANCELLED") {
+ return true;
+ }
+ return false;
+ });
+ }
+
def customFeConfig = [
delete_bitmap_lock_expiration_seconds : 10,
calculate_delete_bitmap_task_timeout_seconds : 2,
- mow_calculate_delete_bitmap_retry_times : 3
+ mow_calculate_delete_bitmap_retry_times : 3,
+ enable_schema_change_retry_in_cloud_mode : false // turn off to
shorten the test's time consumption
]
setFeConfigTemporary(customFeConfig) {
@@ -90,5 +101,59 @@ suite("test_cloud_mow_correctness_inject", "nonConcurrent")
{
GetDebugPoint().clearDebugPointsForAllBEs()
}
+
+ try {
+
GetDebugPoint().enableDebugPointForAllBEs("get_delete_bitmap_update_lock.inject_fail",
[percent: "1.0"])
+
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep")
+ sql "alter table ${table1} modify column c2 varchar(100);"
+ Thread.sleep(1000)
+ sql "insert into ${table1} values(10,10,10);"
+ qt_sql "select * from ${table1} order by k1;"
+ Thread.sleep(200)
+
GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep")
+
+ waitForSC()
+
+ def res = sql_return_maparray "SHOW ALTER TABLE COLUMN WHERE
TableName='${table1}' ORDER BY createtime DESC LIMIT 1"
+ assert res[0].State == "CANCELLED"
+ assert res[0].Msg.contains("injection error when get
get_delete_bitmap_update_lock")
+
+ qt_sql "select * from ${table1} order by k1;"
+ } catch(Exception e) {
+ logger.info(e.getMessage())
+ throw e
+ } finally {
+ GetDebugPoint().clearDebugPointsForAllBEs()
+ }
+
+
+ try {
+ // sleep long enough to let sc's delete bitmap lock expire
+
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_process_delete_bitmap.inject_sleep",
[percent: "1.0", sleep: "20"])
+
GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_process_delete_bitmap.before_new_inc.block")
+ sql "alter table ${table1} modify column c2 varchar(100);"
+ Thread.sleep(3000)
+ sql "insert into ${table1} values(11,11,11);"
+ qt_sql "select * from ${table1} order by k1;"
+ Thread.sleep(1000)
+
GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob::_process_delete_bitmap.before_new_inc.block")
+
+ // wait until sc's delete bitmap lock has expired
+ Thread.sleep(10000)
+ sql "insert into ${table1} values(12,12,12);"
+
+ waitForSC()
+
+ def res = sql_return_maparray "SHOW ALTER TABLE COLUMN WHERE
TableName='${table1}' ORDER BY createtime DESC LIMIT 1"
+ assert res[0].State == "CANCELLED"
+ assert res[0].Msg.contains("[DELETE_BITMAP_LOCK_ERROR]lock expired
when update delete bitmap")
+
+ qt_sql "select * from ${table1} order by k1;"
+ } catch(Exception e) {
+ logger.info(e.getMessage())
+ throw e
+ } finally {
+ GetDebugPoint().clearDebugPointsForAllBEs()
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]