This is an automated email from the ASF dual-hosted git repository.
xiaokang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-graphar.git
The following commit(s) were added to refs/heads/main by this push:
new fe013bc5 feat(C++): remove hardcoded row group size, keep 64M default (#872)
fe013bc5 is described below
commit fe013bc59d058d80742a47c2404c058b53ff6129
Author: Jason <[email protected]>
AuthorDate: Wed Feb 25 14:17:18 2026 +0800
feat(C++): remove hardcoded row group size, keep 64M default (#872)
---
cpp/src/graphar/filesystem.cc | 6 ++++--
cpp/src/graphar/writer_util.cc | 7 +++++++
cpp/src/graphar/writer_util.h | 3 ++-
3 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/cpp/src/graphar/filesystem.cc b/cpp/src/graphar/filesystem.cc
index 83f9b7dc..e1a5ef56 100644
--- a/cpp/src/graphar/filesystem.cc
+++ b/cpp/src/graphar/filesystem.cc
@@ -266,8 +266,9 @@ Status FileSystem::WriteTableToFile(
}
case FileType::PARQUET: {
auto schema = table->schema();
+ auto row_group_size = options->getParquetMaxRowGroupLength();
RETURN_NOT_ARROW_OK(parquet::arrow::WriteTable(
- *table, arrow::default_memory_pool(), output_stream, 64 * 1024 * 1024,
+ *table, arrow::default_memory_pool(), output_stream, row_group_size,
options->getParquetWriterProperties(),
options->getArrowWriterProperties()));
break;
@@ -300,8 +301,9 @@ Status FileSystem::WriteLabelTableToFile(
parquet::WriterProperties::Builder builder;
builder.compression(arrow::Compression::type::ZSTD); // enable compression
builder.encoding(parquet::Encoding::RLE);
+ auto row_group_size = builder.build()->max_row_group_length();
RETURN_NOT_ARROW_OK(parquet::arrow::WriteTable(
- *table, arrow::default_memory_pool(), output_stream, 64 * 1024 * 1024,
+ *table, arrow::default_memory_pool(), output_stream, row_group_size,
builder.build(), parquet::default_arrow_writer_properties()));
return Status::OK();
}
diff --git a/cpp/src/graphar/writer_util.cc b/cpp/src/graphar/writer_util.cc
index 380e3df0..4a2ecd03 100644
--- a/cpp/src/graphar/writer_util.cc
+++ b/cpp/src/graphar/writer_util.cc
@@ -86,6 +86,13 @@ WriterOptions::getParquetWriterProperties() const {
return builder.build();
}
+int64_t WriterOptions::getParquetMaxRowGroupLength() const {
+ if (parquetOption_) {
+ return parquetOption_->max_row_group_length;
+ }
+ return parquet::WriterProperties::Builder().build()->max_row_group_length();
+}
+
std::shared_ptr<parquet::ArrowWriterProperties>
WriterOptions::getArrowWriterProperties() const {
parquet::ArrowWriterProperties::Builder builder;
diff --git a/cpp/src/graphar/writer_util.h b/cpp/src/graphar/writer_util.h
index 1d2e6b12..457e3b41 100644
--- a/cpp/src/graphar/writer_util.h
+++ b/cpp/src/graphar/writer_util.h
@@ -94,7 +94,7 @@ class WriterOptions {
std::vector<::parquet::SortingColumn> sorting_columns;
int64_t dictionary_pagesize_limit = 1024 * 1024;
int64_t write_batch_size = 1024;
- int64_t max_row_group_length = 1024 * 1024;
+ int64_t max_row_group_length = 64 * 1024 * 1024;
int64_t data_pagesize = 1024 * 1024;
size_t max_statistics_size = 4096;
int compression_level = std::numeric_limits<int>::min();
@@ -429,6 +429,7 @@ class WriterOptions {
std::shared_ptr<parquet::WriterProperties> getParquetWriterProperties()
const;
std::shared_ptr<parquet::ArrowWriterProperties> getArrowWriterProperties()
const;
+ int64_t getParquetMaxRowGroupLength() const;
#ifdef ARROW_ORC
arrow::adapters::orc::WriteOptions getOrcOption() const;
#endif
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]