gavinchou commented on code in PR #37199:
URL: https://github.com/apache/doris/pull/37199#discussion_r1676840335
##########
be/src/io/fs/s3_file_reader.cpp:
##########
@@ -120,65 +112,25 @@ Status S3FileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_rea
     if (!client) {
         return Status::InternalError("init s3 client error");
     }
-    // // clang-format off
-    // auto resp = client->get_object( { .bucket = _bucket, .key = _key, },
-    //         to, offset, bytes_req, bytes_read);
-    // // clang-format on
-    // if (resp.status.code != ErrorCode::OK) {
-    //     return std::move(Status(resp.status.code, std::move(resp.status.msg))
-    //                              .append(fmt::format("failed to read from {}", _path.native())));
-    // }
-    // if (*bytes_read != bytes_req) {
-    //     return Status::InternalError("failed to read from {}(bytes read: {}, bytes req: {})",
-    //                                  _path.native(), *bytes_read, bytes_req);
-    SCOPED_BVAR_LATENCY(s3_bvar::s3_get_latency);
-
-    int retry_count = 0;
-    const int base_wait_time = config::s3_read_base_wait_time_ms; // Base wait time in milliseconds
-    const int max_wait_time = config::s3_read_max_wait_time_ms; // Maximum wait time in milliseconds
-    const int max_retries = config::max_s3_client_retry; // wait 1s, 2s, 4s, 8s for each backoff
-
-    int total_sleep_time = 0;
-    while (retry_count <= max_retries) {
-        s3_file_reader_read_counter << 1;
-        // clang-format off
-        auto resp = client->get_object( { .bucket = _bucket, .key = _key, },
-                to, offset, bytes_req, bytes_read);
-        // clang-format on
-        _s3_stats.total_get_request_counter++;
-        if (resp.status.code != ErrorCode::OK) {
-            if (resp.http_code ==
-                static_cast<int>(Aws::Http::HttpResponseCode::TOO_MANY_REQUESTS)) {
-                s3_file_reader_too_many_request_counter << 1;
-                retry_count++;
-                int wait_time = std::min(base_wait_time * (1 << retry_count),
-                                         max_wait_time); // Exponential backoff
-                std::this_thread::sleep_for(std::chrono::milliseconds(wait_time));
-                _s3_stats.too_many_request_err_counter++;
-                _s3_stats.too_many_request_sleep_time_ms += wait_time;
-                total_sleep_time += wait_time;
-                continue;
-            } else {
-                // Handle other errors
-                return std::move(Status(resp.status.code, std::move(resp.status.msg))
-                                         .append("failed to read"));
-            }
-        }
-        if (*bytes_read != bytes_req) {
-            return Status::InternalError("failed to read (bytes read: {}, bytes req: {})",
-                                         *bytes_read, bytes_req);
-        }
-        _s3_stats.total_bytes_read += bytes_req;
-        s3_bytes_read_total << bytes_req;
-        s3_bytes_per_read << bytes_req;
-        DorisMetrics::instance()->s3_bytes_read_total->increment(bytes_req);
-        if (retry_count > 0) {
-            LOG(INFO) << fmt::format("read s3 file {} succeed after {} times with {} ms sleeping",
-                                     _path.native(), retry_count, total_sleep_time);
-        }
-        return Status::OK();
+    // clang-format off
+    auto resp = client->get_object( { .bucket = _bucket, .key = _key, },
Review Comment:
Consider creating a new client with a `CustomRetryStrategy` that can carry enough context,
such as the number of retries and the total duration of all retries. A rough sketch of what I
mean is below.
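Sketch only, not code from this PR: the class name, members, and accessors below are
hypothetical; it assumes the aws-sdk-cpp `DefaultRetryStrategy` (protected `m_maxRetries`,
public `CalculateDelayBeforeNextRetry`). Since `ShouldRetry()` is const, the context lives in
mutable atomics that the reader could query and log after a slow read.

```cpp
#include <aws/core/client/AWSError.h>
#include <aws/core/client/CoreErrors.h>
#include <aws/core/client/DefaultRetryStrategy.h>
#include <aws/core/http/HttpResponse.h>

#include <atomic>

// Hypothetical retry strategy that carries retry context across attempts.
class ContextRetryStrategy final : public Aws::Client::DefaultRetryStrategy {
public:
    explicit ContextRetryStrategy(int max_retries) : DefaultRetryStrategy(max_retries) {}

    bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error,
                     long attempted_retries) const override {
        if (attempted_retries < m_maxRetries &&
            error.GetResponseCode() == Aws::Http::HttpResponseCode::TOO_MANY_REQUESTS) {
            // Carry context: how many times we retried and roughly how long we backed off.
            _num_retries.fetch_add(1, std::memory_order_relaxed);
            _backoff_ms.fetch_add(CalculateDelayBeforeNextRetry(error, attempted_retries),
                                  std::memory_order_relaxed);
            return true;
        }
        return DefaultRetryStrategy::ShouldRetry(error, attempted_retries);
    }

    // Hypothetical accessors the caller could use when logging after the request.
    long num_retries() const { return _num_retries.load(std::memory_order_relaxed); }
    long backoff_ms() const { return _backoff_ms.load(std::memory_order_relaxed); }

private:
    mutable std::atomic<long> _num_retries {0};
    mutable std::atomic<long> _backoff_ms {0};
};
```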
##########
be/src/util/s3_util.cpp:
##########
@@ -73,6 +73,20 @@ bool to_int(std::string_view str, int& res) {
     return ec == std::errc {};
 }
+class CustomRetryStrategy final : public Aws::Client::DefaultRetryStrategy {
+public:
+    CustomRetryStrategy(int maxRetries) : DefaultRetryStrategy(maxRetries) {}
+
+    bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error,
+                     long attemptedRetries) const override {
+        if (attemptedRetries < m_maxRetries &&
+            error.GetResponseCode() == Aws::Http::HttpResponseCode::TOO_MANY_REQUESTS) {
+            return true;
Review Comment:
We should record something here when we retry, e.g. a retry counter or some other signal that
lets us observe the status of requests to the object storage service. Something along the lines
of the sketch below.
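Sketch only: the bvar name below is hypothetical, not an existing Doris metric; it just mirrors
the existing `s3_file_reader_*` bvars so that 429 throttling retries show up at the bvar
endpoint.

```cpp
#include <aws/core/client/AWSError.h>
#include <aws/core/client/CoreErrors.h>
#include <aws/core/client/DefaultRetryStrategy.h>
#include <aws/core/http/HttpResponse.h>

#include <bvar/reducer.h>

// Hypothetical process-wide counter of retries triggered by HTTP 429 responses.
static bvar::Adder<int64_t> s3_too_many_request_retry_total("s3_file_reader",
                                                            "too_many_request_retry_total");

class CustomRetryStrategy final : public Aws::Client::DefaultRetryStrategy {
public:
    explicit CustomRetryStrategy(int max_retries) : DefaultRetryStrategy(max_retries) {}

    bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error,
                     long attempted_retries) const override {
        if (attempted_retries < m_maxRetries &&
            error.GetResponseCode() == Aws::Http::HttpResponseCode::TOO_MANY_REQUESTS) {
            // Record the retry before taking it, so throttling is observable.
            s3_too_many_request_retry_total << 1;
            return true;
        }
        return DefaultRetryStrategy::ShouldRetry(error, attempted_retries);
    }
};
```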