This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 80db102088 GH-48467: [C++][Parquet] Add BufferedStats API to RowGroupWriter (#49527)
80db102088 is described below

commit 80db1020881f461af3a300653cfd2333ef10a45e
Author: Wechar Yu <[email protected]>
AuthorDate: Fri Mar 27 22:34:16 2026 +0800

    GH-48467: [C++][Parquet] Add BufferedStats API to RowGroupWriter (#49527)
    
    ### Rationale for this change
    Expose an API that reports the buffered bytes of values and levels in RowGroupWriter; it is useful when deciding whether a new row group is needed.
    Discussion: https://github.com/apache/arrow/pull/48468#issuecomment-4047682751
    
    ### What changes are included in this PR?
    Add a new API to return `BufferedStats` from `RowGroupWriter`.
    
    ### Are these changes tested?
    Yes, tested locally.
    
    ### Are there any user-facing changes?
    Yes, users can use the new API to implement a customized row group strategy.
    * GitHub Issue: #48467
    
    Lead-authored-by: wecharyu <[email protected]>
    Co-authored-by: Gang Wu <[email protected]>
    Co-authored-by: Zehua Zou <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
---
 cpp/src/parquet/column_writer.cc | 15 +++++++++++++++
 cpp/src/parquet/column_writer.h  |  9 +++++++++
 cpp/src/parquet/file_writer.cc   | 20 ++++++++++++++++++++
 cpp/src/parquet/file_writer.h    | 15 +++++++++++++++
 4 files changed, 59 insertions(+)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 1f1197f95a..61507c2ba1 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1489,6 +1489,21 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
     return current_encoder_->EstimatedDataEncodedSize();
   }
 
+  int64_t estimated_buffered_def_level_bytes() const override {
+    return definition_levels_sink_.length();  // current length of the def-level sink
+  }
+
+  int64_t estimated_buffered_rep_level_bytes() const override {
+    return repetition_levels_sink_.length();  // current length of the rep-level sink
+  }
+
+  int64_t estimated_buffered_dict_bytes() const override {
+    if (current_dict_encoder_) {  // a dictionary encoder is present
+      return current_dict_encoder_->dict_encoded_size();
+    }
+    return 0;  // no dictionary encoder: report zero dictionary bytes
+  }
+
  protected:
   std::shared_ptr<Buffer> GetValuesBuffer() override {
     return current_encoder_->FlushValues();
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index e0d4b5234b..5ad58c5ecf 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -165,6 +165,15 @@ class PARQUET_EXPORT ColumnWriter {
   /// \brief Estimated size of the values that are not written to a page yet.
   virtual int64_t estimated_buffered_value_bytes() const = 0;
 
+  /// \brief Estimated size of the definition levels that are not written to a page yet.
+  virtual int64_t estimated_buffered_def_level_bytes() const = 0;
+
+  /// \brief Estimated size of the repetition levels that are not written to a page yet.
+  virtual int64_t estimated_buffered_rep_level_bytes() const = 0;
+
+  /// \brief Estimated size of the dictionary that is not written to a page yet.
+  virtual int64_t estimated_buffered_dict_bytes() const = 0;
+
   /// \brief The file-level writer properties
   virtual const WriterProperties* properties() = 0;
 
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index 842e667e8a..ec303408f3 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -69,6 +69,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
   return contents_->total_compressed_bytes_written();
 }
 
+RowGroupWriter::BufferedStats RowGroupWriter::estimated_buffered_stats() const {
+  return contents_->EstimatedBufferedStats();  // forwarded to the Contents implementation
+}
+
 bool RowGroupWriter::buffered() const { return contents_->buffered(); }
 
 int RowGroupWriter::current_column() { return contents_->current_column(); }
@@ -198,6 +202,22 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
     return total_compressed_bytes_written;
   }
 
+  RowGroupWriter::BufferedStats EstimatedBufferedStats() const override {
+    RowGroupWriter::BufferedStats stats;  // every field starts at zero
+    if (closed_) {
+      return stats;  // closed row group: report all-zero stats
+    }
+    for (const auto& column_writer : column_writers_) {
+      if (column_writer) {  // skip slots that hold no column writer
+        stats.def_level_bytes += column_writer->estimated_buffered_def_level_bytes();
+        stats.rep_level_bytes += column_writer->estimated_buffered_rep_level_bytes();
+        stats.value_bytes += column_writer->estimated_buffered_value_bytes();
+        stats.dict_bytes += column_writer->estimated_buffered_dict_bytes();
+      }
+    }
+    return stats;
+  }
+
   bool buffered() const override { return buffered_row_group_; }
 
   void Close() override {
diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h
index d5ea1d7c98..636a7e9957 100644
--- a/cpp/src/parquet/file_writer.h
+++ b/cpp/src/parquet/file_writer.h
@@ -36,6 +36,15 @@ static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
 
 class PARQUET_EXPORT RowGroupWriter {
  public:
+  // Estimated uncompressed byte sizes of data buffered by column writers
+  // that have not yet been serialized into pages.
+  struct BufferedStats {
+    int64_t def_level_bytes = 0;  // buffered definition levels
+    int64_t rep_level_bytes = 0;  // buffered repetition levels
+    int64_t value_bytes = 0;      // buffered values
+    int64_t dict_bytes = 0;       // buffered dictionary
+  };
+
   // Forward declare a virtual class 'Contents' to aid dependency injection and more
   // easily create test fixtures
   // An implementation of the Contents class is defined in the .cc file
@@ -58,6 +67,9 @@ class PARQUET_EXPORT RowGroupWriter {
     virtual int64_t total_compressed_bytes() const = 0;
     /// \brief total compressed bytes written by the page writer
     virtual int64_t total_compressed_bytes_written() const = 0;
+    /// \brief Estimated sizes of buffered data (levels, values, dict) not yet
+    /// written to pages.
+    virtual BufferedStats EstimatedBufferedStats() const = 0;
 
     virtual bool buffered() const = 0;
   };
@@ -99,6 +111,9 @@ class PARQUET_EXPORT RowGroupWriter {
   int64_t total_compressed_bytes() const;
   /// \brief total compressed bytes written by the page writer
   int64_t total_compressed_bytes_written() const;
+  /// \brief Estimated sizes of buffered data (levels, values, dict) not yet
+  /// written to pages.
+  BufferedStats estimated_buffered_stats() const;
 
   /// Returns whether the current RowGroupWriter is in the buffered mode and is created
   /// by calling ParquetFileWriter::AppendBufferedRowGroup.

Reply via email to