This is an automated email from the ASF dual-hosted git repository.
marong pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new ad4393e32c [VL] Separate filesystem configuration initialization (#10540)
ad4393e32c is described below
commit ad4393e32c7307071be5b9ca6a03aba2adafa296
Author: Rong Ma <[email protected]>
AuthorDate: Fri Aug 29 09:31:14 2025 +0100
[VL] Separate filesystem configuration initialization (#10540)
---
.../operators/writer/VeloxParquetDataSourceABFS.h | 5 +-
.../operators/writer/VeloxParquetDataSourceHDFS.h | 5 +-
.../operators/writer/VeloxParquetDataSourceS3.h | 5 +-
cpp/velox/utils/ConfigExtractor.cc | 76 +++++++++++++++++-----
cpp/velox/utils/ConfigExtractor.h | 5 +-
5 files changed, 72 insertions(+), 24 deletions(-)
diff --git a/cpp/velox/operators/writer/VeloxParquetDataSourceABFS.h b/cpp/velox/operators/writer/VeloxParquetDataSourceABFS.h
index f8f6e5878c..6dd7027cea 100644
--- a/cpp/velox/operators/writer/VeloxParquetDataSourceABFS.h
+++ b/cpp/velox/operators/writer/VeloxParquetDataSourceABFS.h
@@ -43,8 +43,9 @@ class VeloxParquetDataSourceABFS final : public VeloxParquetDataSource {
: VeloxParquetDataSource(filePath, veloxPool, sinkPool, schema) {}
void initSink(const std::unordered_map<std::string, std::string>&
sparkConfs) override {
- auto hiveConf =
getHiveConfig(std::make_shared<facebook::velox::config::ConfigBase>(
- std::unordered_map<std::string, std::string>(sparkConfs)));
+ auto hiveConf = getHiveConfig(
+
std::make_shared<facebook::velox::config::ConfigBase>(std::unordered_map<std::string,
std::string>(sparkConfs)),
+ FileSystemType::kAbfs);
auto fileSystem = filesystems::getFileSystem(filePath_, hiveConf);
auto* abfsFileSystem =
dynamic_cast<filesystems::AbfsFileSystem*>(fileSystem.get());
sink_ = std::make_unique<dwio::common::WriteFileSink>(
diff --git a/cpp/velox/operators/writer/VeloxParquetDataSourceHDFS.h b/cpp/velox/operators/writer/VeloxParquetDataSourceHDFS.h
index 5f61d9145f..2e7b313118 100644
--- a/cpp/velox/operators/writer/VeloxParquetDataSourceHDFS.h
+++ b/cpp/velox/operators/writer/VeloxParquetDataSourceHDFS.h
@@ -43,8 +43,9 @@ class VeloxParquetDataSourceHDFS final : public VeloxParquetDataSource {
: VeloxParquetDataSource(filePath, veloxPool, sinkPool, schema) {}
void initSink(const std::unordered_map<std::string, std::string>&
sparkConfs) override {
- auto hiveConf =
getHiveConfig(std::make_shared<facebook::velox::config::ConfigBase>(
- std::unordered_map<std::string, std::string>(sparkConfs)));
+ auto hiveConf = getHiveConfig(
+
std::make_shared<facebook::velox::config::ConfigBase>(std::unordered_map<std::string,
std::string>(sparkConfs)),
+ FileSystemType::kHdfs);
sink_ = dwio::common::FileSink::create(filePath_, {.connectorProperties =
hiveConf, .pool = sinkPool_.get()});
}
};
diff --git a/cpp/velox/operators/writer/VeloxParquetDataSourceS3.h b/cpp/velox/operators/writer/VeloxParquetDataSourceS3.h
index 788eda19e5..f366953422 100644
--- a/cpp/velox/operators/writer/VeloxParquetDataSourceS3.h
+++ b/cpp/velox/operators/writer/VeloxParquetDataSourceS3.h
@@ -43,8 +43,9 @@ class VeloxParquetDataSourceS3 final : public VeloxParquetDataSource {
: VeloxParquetDataSource(filePath, veloxPool, sinkPool, schema) {}
void initSink(const std::unordered_map<std::string, std::string>&
sparkConfs) override {
- auto hiveConf =
getHiveConfig(std::make_shared<facebook::velox::config::ConfigBase>(
- std::unordered_map<std::string, std::string>(sparkConfs)));
+ auto hiveConf = getHiveConfig(
+
std::make_shared<facebook::velox::config::ConfigBase>(std::unordered_map<std::string,
std::string>(sparkConfs)),
+ FileSystemType::kS3);
sink_ = dwio::common::FileSink::create(filePath_, {.connectorProperties =
hiveConf, .pool = sinkPool_.get()});
}
};
diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc
index 2e1aa92ff7..96da5069a2 100644
--- a/cpp/velox/utils/ConfigExtractor.cc
+++ b/cpp/velox/utils/ConfigExtractor.cc
@@ -22,29 +22,18 @@
#include "config/VeloxConfig.h"
#include "utils/Exception.h"
+#include "utils/Macros.h"
#include "velox/connectors/hive/HiveConfig.h"
#include "velox/connectors/hive/storage_adapters/s3fs/S3Config.h"
namespace gluten {
-std::string getConfigValue(
- const std::unordered_map<std::string, std::string>& confMap,
- const std::string& key,
- const std::optional<std::string>& fallbackValue) {
- auto got = confMap.find(key);
- if (got == confMap.end()) {
- if (fallbackValue == std::nullopt) {
- throw std::runtime_error("No such config key: " + key);
- }
- return fallbackValue.value();
- }
- return got->second;
-}
-
-std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
- std::shared_ptr<facebook::velox::config::ConfigBase> conf) {
- std::unordered_map<std::string, std::string> hiveConfMap;
+namespace {
+void getS3HiveConfig(
+ std::shared_ptr<facebook::velox::config::ConfigBase> conf,
+ FileSystemType fsType,
+ std::unordered_map<std::string, std::string>& hiveConfMap) {
#ifdef ENABLE_S3
using namespace facebook::velox::filesystems;
std::string_view kSparkHadoopS3Prefix = "spark.hadoop.fs.s3a.";
@@ -161,7 +150,12 @@ std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
}
}
#endif
+}
+void getGcsHiveConfig(
+ std::shared_ptr<facebook::velox::config::ConfigBase> conf,
+ FileSystemType fsType,
+ std::unordered_map<std::string, std::string>& hiveConfMap) {
#ifdef ENABLE_GCS
// https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/CONFIGURATION.md#api-client-configuration
auto gsStorageRootUrl =
conf->get<std::string>("spark.hadoop.fs.gs.storage.root.url");
@@ -204,7 +198,12 @@ std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
throw GlutenException("Conf spark.hadoop.fs.gs.auth.type is missing or incorrect");
}
#endif
+}
+void getAbfsHiveConfig(
+ std::shared_ptr<facebook::velox::config::ConfigBase> conf,
+ FileSystemType fsType,
+ std::unordered_map<std::string, std::string>& hiveConfMap) {
#ifdef ENABLE_ABFS
std::string_view kSparkHadoopPrefix = "spark.hadoop.";
std::string_view kSparkHadoopAbfsPrefix = "spark.hadoop.fs.azure.";
@@ -215,6 +214,49 @@ std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
}
}
#endif
+}
+
+} // namespace
+
+std::string getConfigValue(
+ const std::unordered_map<std::string, std::string>& confMap,
+ const std::string& key,
+ const std::optional<std::string>& fallbackValue) {
+ auto got = confMap.find(key);
+ if (got == confMap.end()) {
+ if (fallbackValue == std::nullopt) {
+ throw std::runtime_error("No such config key: " + key);
+ }
+ return fallbackValue.value();
+ }
+ return got->second;
+}
+
+std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
+ std::shared_ptr<facebook::velox::config::ConfigBase> conf,
+ FileSystemType fsType) {
+ std::unordered_map<std::string, std::string> hiveConfMap;
+
+ switch (fsType) {
+ case FileSystemType::kS3:
+ getS3HiveConfig(conf, fsType, hiveConfMap);
+ break;
+ case FileSystemType::kAbfs:
+ getAbfsHiveConfig(conf, fsType, hiveConfMap);
+ break;
+ case FileSystemType::kGcs:
+ getGcsHiveConfig(conf, fsType, hiveConfMap);
+ break;
+ case FileSystemType::kHdfs:
+ break;
+ case FileSystemType::kAll:
+ getS3HiveConfig(conf, fsType, hiveConfMap);
+ getAbfsHiveConfig(conf, fsType, hiveConfMap);
+ getGcsHiveConfig(conf, fsType, hiveConfMap);
+ break;
+ default:
+ GLUTEN_UNREACHABLE();
+ }
hiveConfMap[facebook::velox::connector::hive::HiveConfig::kEnableFileHandleCache]
=
conf->get<bool>(kVeloxFileHandleCacheEnabled,
kVeloxFileHandleCacheEnabledDefault) ? "true" : "false";
diff --git a/cpp/velox/utils/ConfigExtractor.h b/cpp/velox/utils/ConfigExtractor.h
index 4cbfdf991f..5b10c714f2 100644
--- a/cpp/velox/utils/ConfigExtractor.h
+++ b/cpp/velox/utils/ConfigExtractor.h
@@ -28,12 +28,15 @@
namespace gluten {
+enum class FileSystemType : uint8_t { kHdfs, kS3, kAbfs, kGcs, kAll };
+
std::string getConfigValue(
const std::unordered_map<std::string, std::string>& confMap,
const std::string& key,
const std::optional<std::string>& fallbackValue);
std::shared_ptr<facebook::velox::config::ConfigBase> getHiveConfig(
- std::shared_ptr<facebook::velox::config::ConfigBase> conf);
+ std::shared_ptr<facebook::velox::config::ConfigBase> conf,
+ FileSystemType fsType = FileSystemType::kAll);
} // namespace gluten
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]