This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 669009edfc ARROW-16521 [C++][Python] Configure curl timeout policy for
S3 (#13385)
669009edfc is described below
commit 669009edfc4f7f1fe85a1de641e16f831eee45d6
Author: Ziheng Wang <[email protected]>
AuthorDate: Thu Aug 4 00:27:23 2022 -0700
ARROW-16521 [C++][Python] Configure curl timeout policy for S3 (#13385)
added timeout options to c++ and python s3fs
Lead-authored-by: Ziheng Wang <[email protected]>
Co-authored-by: Ziheng Wang <[email protected]>
Co-authored-by: Ziheng Wang <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
cpp/src/arrow/filesystem/s3fs.cc | 17 +++++++++++++----
cpp/src/arrow/filesystem/s3fs.h | 11 +++++++++++
python/pyarrow/_s3fs.pyx | 22 ++++++++++++++++++----
python/pyarrow/includes/libarrow_fs.pxd | 2 ++
python/pyarrow/tests/test_fs.py | 9 +++++++++
5 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index dd3973ba77..aafb9a7cbf 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -386,10 +386,11 @@ bool S3Options::Equals(const S3Options& other) const {
default_metadata_size
? (other.default_metadata &&
other.default_metadata->Equals(*default_metadata))
: (!other.default_metadata || other.default_metadata->size() == 0);
- return (region == other.region && endpoint_override ==
other.endpoint_override &&
- scheme == other.scheme && role_arn == other.role_arn &&
- session_name == other.session_name && external_id ==
other.external_id &&
- load_frequency == other.load_frequency &&
+ return (region == other.region && connect_timeout == other.connect_timeout &&
+ request_timeout == other.request_timeout &&
+ endpoint_override == other.endpoint_override && scheme ==
other.scheme &&
+ role_arn == other.role_arn && session_name == other.session_name &&
+ external_id == other.external_id && load_frequency ==
other.load_frequency &&
proxy_options.Equals(other.proxy_options) &&
credentials_kind == other.credentials_kind &&
background_writes == other.background_writes &&
@@ -718,6 +719,14 @@ class ClientBuilder {
if (!options_.region.empty()) {
client_config_.region = ToAwsString(options_.region);
}
+ if (options_.request_timeout > 0) {
+ // Use ceil() to avoid setting it to 0 as that probably means no timeout.
+ client_config_.requestTimeoutMs = ceil(options_.request_timeout * 1000);
+ }
+ if (options_.connect_timeout > 0) {
+ client_config_.connectTimeoutMs = ceil(options_.connect_timeout * 1000);
+ }
+
client_config_.endpointOverride = ToAwsString(options_.endpoint_override);
if (options_.scheme == "http") {
client_config_.scheme = Aws::Http::Scheme::HTTP;
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 05fa404162..3f578aedb2 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -103,6 +103,17 @@ struct ARROW_EXPORT S3Options {
/// server).
std::string region;
+ /// \brief Socket connection timeout, in seconds
+ ///
+ /// If negative, the AWS SDK default value is used (typically 1 second).
+ double connect_timeout = -1;
+
+ /// \brief Socket read timeout on Windows and macOS, in seconds
+ ///
+ /// If negative, the AWS SDK default value is used (typically 3 seconds).
+ /// This option is ignored on non-Windows, non-macOS systems.
+ double request_timeout = -1;
+
/// If non-empty, override region with a connect string such as
"localhost:9000"
// XXX perhaps instead take a URL like "http://localhost:9000"?
std::string endpoint_override;
diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx
index f668038e62..47cb87c23d 100644
--- a/python/pyarrow/_s3fs.pyx
+++ b/python/pyarrow/_s3fs.pyx
@@ -137,6 +137,13 @@ cdef class S3FileSystem(FileSystem):
assumed role session will be refreshed.
region : str, default 'us-east-1'
AWS region to connect to.
+ request_timeout : double, default None
+ Socket read timeouts on Windows and macOS, in seconds.
+ If omitted, the AWS SDK default value is used (typically 3 seconds).
+ This option is ignored on non-Windows, non-macOS systems.
+ connect_timeout : double, default None
+ Socket connection timeout, in seconds.
+ If omitted, the AWS SDK default value is used (typically 1 second).
scheme : str, default 'https'
S3 connection transport scheme.
endpoint_override : str, default None
@@ -183,10 +190,11 @@ cdef class S3FileSystem(FileSystem):
CS3FileSystem* s3fs
def __init__(self, *, access_key=None, secret_key=None, session_token=None,
- bint anonymous=False, region=None, scheme=None,
- endpoint_override=None, bint background_writes=True,
- default_metadata=None, role_arn=None, session_name=None,
- external_id=None, load_frequency=900, proxy_options=None,
+ bint anonymous=False, region=None, request_timeout=None,
+ connect_timeout=None, scheme=None, endpoint_override=None,
+ bint background_writes=True, default_metadata=None,
+ role_arn=None, session_name=None, external_id=None,
+ load_frequency=900, proxy_options=None,
allow_bucket_creation=False, allow_bucket_deletion=False):
cdef:
CS3Options options
@@ -254,6 +262,10 @@ cdef class S3FileSystem(FileSystem):
if region is not None:
options.region = tobytes(region)
+ if request_timeout is not None:
+ options.request_timeout = request_timeout
+ if connect_timeout is not None:
+ options.connect_timeout = connect_timeout
if scheme is not None:
options.scheme = tobytes(scheme)
if endpoint_override is not None:
@@ -324,6 +336,8 @@ cdef class S3FileSystem(FileSystem):
CS3CredentialsKind_Anonymous),
region=frombytes(opts.region),
scheme=frombytes(opts.scheme),
+ connect_timeout=opts.connect_timeout,
+ request_timeout=opts.request_timeout,
endpoint_override=frombytes(opts.endpoint_override),
role_arn=frombytes(opts.role_arn),
session_name=frombytes(opts.session_name),
diff --git a/python/pyarrow/includes/libarrow_fs.pxd
b/python/pyarrow/includes/libarrow_fs.pxd
index d47b462aa5..69d5dc0ebe 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -152,6 +152,8 @@ cdef extern from "arrow/filesystem/api.h" namespace
"arrow::fs" nogil:
cdef cppclass CS3Options "arrow::fs::S3Options":
c_string region
+ double connect_timeout
+ double request_timeout
c_string endpoint_override
c_string scheme
c_bool background_writes
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 05ebf4ed4c..238bcb73b6 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -1131,6 +1131,15 @@ def test_s3_options():
assert isinstance(fs, S3FileSystem)
assert pickle.loads(pickle.dumps(fs)) == fs
+ fs = S3FileSystem(request_timeout=0.5, connect_timeout=0.25)
+ assert isinstance(fs, S3FileSystem)
+ assert pickle.loads(pickle.dumps(fs)) == fs
+
+ fs2 = S3FileSystem(request_timeout=0.25, connect_timeout=0.5)
+ assert isinstance(fs2, S3FileSystem)
+ assert pickle.loads(pickle.dumps(fs2)) == fs2
+ assert fs2 != fs
+
with pytest.raises(ValueError):
S3FileSystem(access_key='access')
with pytest.raises(ValueError):