This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new a32b94021f [python] Add doc and refine config for using pyjindosdk in
pypaimon (#7565)
a32b94021f is described below
commit a32b94021fefa95e43d8d935b2db337da3e75ed9
Author: timmyyao <[email protected]>
AuthorDate: Tue Mar 31 20:18:27 2026 +0800
[python] Add doc and refine config for using pyjindosdk in pypaimon (#7565)
Use pyjindosdk as the default OSS implementation for pypaimon whenever
pyjindosdk is installed. The doc describes how to fall back to the legacy implementation.
---
docs/content/pypaimon/pyjindosdk-support.md | 56 ++++++++++++++++++++++
paimon-python/pypaimon/common/options/config.py | 4 +-
.../pypaimon/filesystem/pyarrow_file_io.py | 23 +++++++--
paimon-python/pypaimon/tests/file_io_test.py | 2 +
4 files changed, 78 insertions(+), 7 deletions(-)
diff --git a/docs/content/pypaimon/pyjindosdk-support.md
b/docs/content/pypaimon/pyjindosdk-support.md
new file mode 100644
index 0000000000..7db012c928
--- /dev/null
+++ b/docs/content/pypaimon/pyjindosdk-support.md
@@ -0,0 +1,56 @@
+---
+title: "PyJindoSDK Support"
+weight: 8
+type: docs
+aliases:
+ - /pypaimon/pyjindosdk-support.html
+---
+
+# PyJindoSDK Support
+
+## Introduction
+
+[JindoSDK](https://github.com/aliyun/alibabacloud-jindodata) is a high-performance storage SDK developed by Alibaba Cloud for accessing OSS (Object Storage Service) and other cloud storage systems. It provides optimized I/O performance and deep integration with the Alibaba Cloud ecosystem.
+
+PyPaimon now supports using [PyJindoSDK](https://github.com/aliyun/alibabacloud-jindodata) (the Python binding of JindoSDK) to access OSS. Compared to the legacy implementation based on PyArrow's S3FileSystem, PyJindoSDK offers better performance and compatibility when working with OSS.
+
+## Usage
+
+### Installation
+
+Install `pyjindosdk` via pip:
+
+```shell
+pip install pyjindosdk
+```
+
+Once installed, PyPaimon will automatically use PyJindoSDK as the default file I/O implementation for accessing OSS. No additional configuration is required.
+
+### Fallback to Legacy Implementation
+
+Since JindoSDK is a native implementation, pre-built Python packages may not be available for every operating system or platform version. If you need to fall back to the legacy PyArrow-based implementation for any reason, there are two ways to do so:
+
+**Option 1: Set catalog option `fs.oss.impl` to `legacy`**
+
+```python
+from pypaimon import CatalogFactory
+
+catalog_options = {
+ 'metastore': 'rest',
+ 'uri': 'http://rest-server:8080',
+ 'warehouse': 'oss://my-bucket/warehouse',
+
+ # Fall back to the legacy PyArrow S3FileSystem implementation
+ 'fs.oss.impl': 'legacy',
+}
+
+catalog = CatalogFactory.create(catalog_options)
+```
+
+**Option 2: Uninstall pyjindosdk**
+
+Simply uninstalling the `pyjindosdk` package will cause PyPaimon to automatically fall back to the legacy implementation:
+
+```shell
+pip uninstall pyjindosdk
+```
diff --git a/paimon-python/pypaimon/common/options/config.py
b/paimon-python/pypaimon/common/options/config.py
index 83d46c85bf..249ad810a2 100644
--- a/paimon-python/pypaimon/common/options/config.py
+++ b/paimon-python/pypaimon/common/options/config.py
@@ -18,8 +18,8 @@ from pypaimon.common.options.config_options import
ConfigOptions
class OssOptions:
- OSS_IMPL =
ConfigOptions.key("fs.oss.impl").string_type().default_value("default").with_description(
- "OSS filesystem implementation: default or jindo")
+ OSS_IMPL =
ConfigOptions.key("fs.oss.impl").string_type().default_value("jindo").with_description(
+ "OSS filesystem implementation: legacy or jindo")
OSS_ACCESS_KEY_ID =
ConfigOptions.key("fs.oss.accessKeyId").string_type().no_default_value().with_description(
"OSS access key ID")
OSS_ACCESS_KEY_SECRET = ConfigOptions.key(
diff --git a/paimon-python/pypaimon/filesystem/pyarrow_file_io.py
b/paimon-python/pypaimon/filesystem/pyarrow_file_io.py
index 2f4f8fd974..db06eb6f99 100644
--- a/paimon-python/pypaimon/filesystem/pyarrow_file_io.py
+++ b/paimon-python/pypaimon/filesystem/pyarrow_file_io.py
@@ -34,7 +34,7 @@ from pypaimon.common.file_io import FileIO
from pypaimon.common.options import Options
from pypaimon.common.options.config import OssOptions, S3Options
from pypaimon.common.uri_reader import UriReaderFactory
-from pypaimon.filesystem.jindo_file_system_handler import
JindoFileSystemHandler
+from pypaimon.filesystem.jindo_file_system_handler import
JindoFileSystemHandler, JINDO_AVAILABLE
from pypaimon.schema.data_types import (AtomicType, DataField,
PyarrowFieldParser)
from pypaimon.table.row.blob import Blob, BlobData, BlobDescriptor
@@ -57,12 +57,24 @@ class PyArrowFileIO(FileIO):
self.uri_reader_factory = UriReaderFactory(catalog_options)
self._is_oss = scheme in {"oss"}
self._oss_bucket = None
- self._oss_impl = self.properties.get(OssOptions.OSS_IMPL)
+ _oss_impl = self.properties.get(OssOptions.OSS_IMPL)
+ self._use_jindo = False
+
if self._is_oss:
self._oss_bucket = self._extract_oss_bucket(path)
- if self._oss_impl == "jindo":
+ if _oss_impl not in ("jindo", "legacy"):
+ raise ValueError(
+ f"Unsupported fs.oss.impl value: '{_oss_impl}'. "
+ f"Supported values are 'jindo' and 'legacy'.")
+ if _oss_impl == "legacy":
+ self.filesystem = self._initialize_oss_fs(path)
+ elif JINDO_AVAILABLE:
self.filesystem = self._initialize_jindo_fs(path)
else:
+ self.logger.info(
+ "fs.oss.impl is 'jindo' but pyjindosdk is not installed. "
+ "Falling back to legacy PyArrow S3FileSystem
implementation. "
+ "Install pyjindosdk for better performance: pip install
pyjindosdk")
self.filesystem = self._initialize_oss_fs(path)
elif scheme in {"s3", "s3a", "s3n"}:
self.filesystem = self._initialize_s3_fs()
@@ -126,6 +138,7 @@ class PyArrowFileIO(FileIO):
self.logger.info(f"Initializing JindoFileSystem for OSS access:
{path}")
root_path = f"oss://{self._oss_bucket}/"
fs_handler = JindoFileSystemHandler(root_path, self.properties)
+ self._use_jindo = True
return pafs.PyFileSystem(fs_handler)
def _initialize_oss_fs(self, path) -> FileSystem:
@@ -210,7 +223,7 @@ class PyArrowFileIO(FileIO):
def new_output_stream(self, path: str):
path_str = self.to_filesystem_path(path)
- if self._oss_impl == "jindo":
+ if self._use_jindo:
pass
elif self._is_oss and not self._pyarrow_gte_7:
# For PyArrow 6.x + OSS, path_str is already just the key part
@@ -592,7 +605,7 @@ class PyArrowFileIO(FileIO):
path_part = normalized_path.lstrip('/')
return f"{drive_letter}:/{path_part}" if path_part else
f"{drive_letter}:"
- if self._oss_impl == "jindo":
+ if self._use_jindo:
# For JindoFileSystem, pass key only
path_part = normalized_path.lstrip('/')
return path_part if path_part else '.'
diff --git a/paimon-python/pypaimon/tests/file_io_test.py
b/paimon-python/pypaimon/tests/file_io_test.py
index 5cc4d7a821..d39d0c6461 100644
--- a/paimon-python/pypaimon/tests/file_io_test.py
+++ b/paimon-python/pypaimon/tests/file_io_test.py
@@ -70,6 +70,7 @@ class FileIOTest(unittest.TestCase):
OssOptions.OSS_ENDPOINT.key(): 'oss-cn-hangzhou.aliyuncs.com',
OssOptions.OSS_ACCESS_KEY_ID.key(): 'test-key',
OssOptions.OSS_ACCESS_KEY_SECRET.key(): 'test-secret',
+ OssOptions.OSS_IMPL.key(): 'legacy',
}))
got = oss_io.to_filesystem_path("oss://test-bucket/path/to/file.txt")
self.assertEqual(got, "path/to/file.txt" if lt7 else
"test-bucket/path/to/file.txt")
@@ -291,6 +292,7 @@ class FileIOTest(unittest.TestCase):
OssOptions.OSS_ENDPOINT.key(): 'oss-cn-hangzhou.aliyuncs.com',
OssOptions.OSS_ACCESS_KEY_ID.key(): 'test-key',
OssOptions.OSS_ACCESS_KEY_SECRET.key(): 'test-secret',
+ OssOptions.OSS_IMPL.key(): 'legacy',
}))
mock_fs = MagicMock()
mock_fs.get_file_info.return_value = [