This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 669009edfc ARROW-16521 [C++][Python] Configure curl timeout policy for 
S3 (#13385)
669009edfc is described below

commit 669009edfc4f7f1fe85a1de641e16f831eee45d6
Author: Ziheng Wang <[email protected]>
AuthorDate: Thu Aug 4 00:27:23 2022 -0700

    ARROW-16521 [C++][Python] Configure curl timeout policy for S3 (#13385)
    
    added timeout options to c++ and python s3fs
    
    Lead-authored-by: Ziheng Wang <[email protected]>
    Co-authored-by: Ziheng Wang <[email protected]>
    Co-authored-by: Ziheng Wang <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 cpp/src/arrow/filesystem/s3fs.cc        | 17 +++++++++++++----
 cpp/src/arrow/filesystem/s3fs.h         | 11 +++++++++++
 python/pyarrow/_s3fs.pyx                | 22 ++++++++++++++++++----
 python/pyarrow/includes/libarrow_fs.pxd |  2 ++
 python/pyarrow/tests/test_fs.py         |  9 +++++++++
 5 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index dd3973ba77..aafb9a7cbf 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -386,10 +386,11 @@ bool S3Options::Equals(const S3Options& other) const {
       default_metadata_size
           ? (other.default_metadata && 
other.default_metadata->Equals(*default_metadata))
           : (!other.default_metadata || other.default_metadata->size() == 0);
-  return (region == other.region && endpoint_override == 
other.endpoint_override &&
-          scheme == other.scheme && role_arn == other.role_arn &&
-          session_name == other.session_name && external_id == 
other.external_id &&
-          load_frequency == other.load_frequency &&
+  return (region == other.region && connect_timeout == other.connect_timeout &&
+          request_timeout == other.request_timeout &&
+          endpoint_override == other.endpoint_override && scheme == 
other.scheme &&
+          role_arn == other.role_arn && session_name == other.session_name &&
+          external_id == other.external_id && load_frequency == 
other.load_frequency &&
           proxy_options.Equals(other.proxy_options) &&
           credentials_kind == other.credentials_kind &&
           background_writes == other.background_writes &&
@@ -718,6 +719,14 @@ class ClientBuilder {
     if (!options_.region.empty()) {
       client_config_.region = ToAwsString(options_.region);
     }
+    if (options_.request_timeout > 0) {
+      // Use ceil() to avoid setting it to 0 as that probably means no timeout.
+      client_config_.requestTimeoutMs = ceil(options_.request_timeout * 1000);
+    }
+    if (options_.connect_timeout > 0) {
+      client_config_.connectTimeoutMs = ceil(options_.connect_timeout * 1000);
+    }
+
     client_config_.endpointOverride = ToAwsString(options_.endpoint_override);
     if (options_.scheme == "http") {
       client_config_.scheme = Aws::Http::Scheme::HTTP;
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 05fa404162..3f578aedb2 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -103,6 +103,17 @@ struct ARROW_EXPORT S3Options {
   /// server).
   std::string region;
 
+  /// \brief Socket connection timeout, in seconds
+  ///
+  /// If negative, the AWS SDK default value is used (typically 1 second).
+  double connect_timeout = -1;
+
+  /// \brief Socket read timeout on Windows and macOS, in seconds
+  ///
+  /// If negative, the AWS SDK default value is used (typically 3 seconds).
+  /// This option is ignored on non-Windows, non-macOS systems.
+  double request_timeout = -1;
+
   /// If non-empty, override region with a connect string such as 
"localhost:9000"
   // XXX perhaps instead take a URL like "http://localhost:9000";?
   std::string endpoint_override;
diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx
index f668038e62..47cb87c23d 100644
--- a/python/pyarrow/_s3fs.pyx
+++ b/python/pyarrow/_s3fs.pyx
@@ -137,6 +137,13 @@ cdef class S3FileSystem(FileSystem):
         assumed role session will be refreshed.
     region : str, default 'us-east-1'
         AWS region to connect to.
+    request_timeout : double, default None
+        Socket read timeouts on Windows and macOS, in seconds.
+        If omitted, the AWS SDK default value is used (typically 3 seconds).
+        This option is ignored on non-Windows, non-macOS systems.
+    connect_timeout : double, default None
+        Socket connection timeout, in seconds.
+        If omitted, the AWS SDK default value is used (typically 1 second).
     scheme : str, default 'https'
         S3 connection transport scheme.
     endpoint_override : str, default None
@@ -183,10 +190,11 @@ cdef class S3FileSystem(FileSystem):
         CS3FileSystem* s3fs
 
     def __init__(self, *, access_key=None, secret_key=None, session_token=None,
-                 bint anonymous=False, region=None, scheme=None,
-                 endpoint_override=None, bint background_writes=True,
-                 default_metadata=None, role_arn=None, session_name=None,
-                 external_id=None, load_frequency=900, proxy_options=None,
+                 bint anonymous=False, region=None, request_timeout=None,
+                 connect_timeout=None, scheme=None, endpoint_override=None,
+                 bint background_writes=True, default_metadata=None,
+                 role_arn=None, session_name=None, external_id=None,
+                 load_frequency=900, proxy_options=None,
                  allow_bucket_creation=False, allow_bucket_deletion=False):
         cdef:
             CS3Options options
@@ -254,6 +262,10 @@ cdef class S3FileSystem(FileSystem):
 
         if region is not None:
             options.region = tobytes(region)
+        if request_timeout is not None:
+            options.request_timeout = request_timeout
+        if connect_timeout is not None:
+            options.connect_timeout = connect_timeout
         if scheme is not None:
             options.scheme = tobytes(scheme)
         if endpoint_override is not None:
@@ -324,6 +336,8 @@ cdef class S3FileSystem(FileSystem):
                            CS3CredentialsKind_Anonymous),
                 region=frombytes(opts.region),
                 scheme=frombytes(opts.scheme),
+                connect_timeout=opts.connect_timeout,
+                request_timeout=opts.request_timeout,
                 endpoint_override=frombytes(opts.endpoint_override),
                 role_arn=frombytes(opts.role_arn),
                 session_name=frombytes(opts.session_name),
diff --git a/python/pyarrow/includes/libarrow_fs.pxd 
b/python/pyarrow/includes/libarrow_fs.pxd
index d47b462aa5..69d5dc0ebe 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -152,6 +152,8 @@ cdef extern from "arrow/filesystem/api.h" namespace 
"arrow::fs" nogil:
 
     cdef cppclass CS3Options "arrow::fs::S3Options":
         c_string region
+        double connect_timeout
+        double request_timeout
         c_string endpoint_override
         c_string scheme
         c_bool background_writes
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 05ebf4ed4c..238bcb73b6 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -1131,6 +1131,15 @@ def test_s3_options():
     assert isinstance(fs, S3FileSystem)
     assert pickle.loads(pickle.dumps(fs)) == fs
 
+    fs = S3FileSystem(request_timeout=0.5, connect_timeout=0.25)
+    assert isinstance(fs, S3FileSystem)
+    assert pickle.loads(pickle.dumps(fs)) == fs
+
+    fs2 = S3FileSystem(request_timeout=0.25, connect_timeout=0.5)
+    assert isinstance(fs2, S3FileSystem)
+    assert pickle.loads(pickle.dumps(fs2)) == fs2
+    assert fs2 != fs
+
     with pytest.raises(ValueError):
         S3FileSystem(access_key='access')
     with pytest.raises(ValueError):

Reply via email to