This is an automated email from the ASF dual-hosted git repository.

wgtmac pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-format.git


The following commit(s) were added to refs/heads/master by this push:
     new c4b3ef2  Fix typos, grammar, and comment inconsistencies in parquet.thrift (#573)
c4b3ef2 is described below

commit c4b3ef2f79e1c6ae7418f5cb7b17dcc067b1f7ab
Author: Ismaël Mejía <[email protected]>
AuthorDate: Mon Jun 8 15:55:55 2026 +0200

    Fix typos, grammar, and comment inconsistencies in parquet.thrift (#573)
    
    - Fix typos: "to be be", "documention", "not necessary"
    - Remove off-by-one in DataPageHeaderV2 is_compressed comment
    - Fix article agreement ("a element" -> "an element", "a OffsetIndex" ->
      "an OffsetIndex")
    - Disambiguate compressed_page_size comment in PageLocation
    - Format "edges" as code ("edges interpolation" -> "`edges` interpolation") in Geospatial comments
    - Capitalize proper nouns: Hive, Pig; normalize GZIP casing
    - Add terminal periods for consistency
    - Mark BIT_PACKED as DEPRECATED and clarify it is superseded by RLE (cross-reference Encodings.md)
    - Add missing space before parenthesis in frameworks list
    
    Thrift validation passes after these edits.
---
 src/main/thrift/parquet.thrift | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 225f85f..fe259d6 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -41,7 +41,7 @@ enum Type {
 }
 
 /**
- * DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet.
+ * DEPRECATED: Common types used by frameworks (e.g. Hive, Pig) using parquet.
  * ConvertedType is superseded by LogicalType.  This enum should not be 
extended.
  *
  * See LogicalTypes.md for conversion between ConvertedType and LogicalType.
@@ -431,7 +431,7 @@ enum EdgeInterpolationAlgorithm {
 /**
  * Embedded Geometry logical type annotation
  *
- * Geospatial features in the Well-Known Binary (WKB) format and edges interpolation
+ * Geospatial features in the Well-Known Binary (WKB) format and `edges` interpolation
  * is always linear/planar.
  *
  * A custom CRS can be set by the crs field. If unset, it defaults to 
"OGC:CRS84",
@@ -450,13 +450,13 @@ struct GeometryType {
  * Embedded Geography logical type annotation
  *
  * Geospatial features in the WKB format with an explicit 
(non-linear/non-planar)
- * edges interpolation algorithm.
+ * `edges` interpolation algorithm.
  *
  * A custom geographic CRS can be set by the crs field, where longitudes are
  * bound by [-180, 180] and latitudes are bound by [-90, 90]. If unset, the CRS
  * defaults to "OGC:CRS84".
  *
- * An optional algorithm can be set to correctly interpret edges interpolation
+ * An optional algorithm can be set to correctly interpret `edges` interpolation
  * of the geometries. If unset, the algorithm defaults to SPHERICAL.
  *
  * Allowed for physical type: BYTE_ARRAY.
@@ -504,7 +504,7 @@ union LogicalType {
 }
 
 /**
- * Represents a element inside a schema definition.
+ * Represents an element inside a schema definition.
  *  - if it is a group (inner node) then type is undefined and num_children is 
defined
  *  - if it is a primitive type (leaf) then type is defined and num_children 
is undefined
  * the nodes are listed in depth first traversal order.
@@ -583,15 +583,15 @@ enum Encoding {
   PLAIN = 0;
 
   /** Group VarInt encoding for INT32/INT64.
-   * This encoding is deprecated. It was never used
+   * This encoding is deprecated. It was never used.
    */
   //  GROUP_VAR_INT = 1;
 
   /**
-   * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
+   * DEPRECATED: Dictionary encoding. The values in the dictionary are encoded in the
    * plain type.
-   * in a data page use RLE_DICTIONARY instead.
-   * in a Dictionary page use PLAIN instead
+   * For a data page use RLE_DICTIONARY instead.
+   * For a Dictionary page use PLAIN instead.
    */
   PLAIN_DICTIONARY = 2;
 
@@ -600,8 +600,9 @@ enum Encoding {
    */
   RLE = 3;
 
-  /** Bit packed encoding.  This can only be used if the data has a known max
+  /** DEPRECATED: Bit packed encoding.  This can only be used if the data has a known max
    * width.  Usable for definition/repetition levels encoding.
+   * Superseded by RLE (which is a hybrid of RLE and bit packing); see Encodings.md.
    */
   BIT_PACKED = 4;
 
@@ -679,7 +680,7 @@ struct DataPageHeader {
   /**
    * Number of values, including NULLs, in this data page.
    *
-   * If a OffsetIndex is present, a page must begin at a row
+   * If an OffsetIndex is present, a page must begin at a row
    * boundary (repetition_level = 0). Otherwise, pages may begin
    * within a row (repetition_level > 0).
    **/
@@ -752,7 +753,7 @@ struct DataPageHeaderV2 {
 
   /**  Whether the values are compressed.
   Which means the section of the page between
-  definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
+  definition_levels_byte_length + repetition_levels_byte_length and compressed_page_size (included)
   is compressed with the compression_codec.
   If missing it is considered compressed */
   7: optional bool is_compressed = true;
@@ -816,10 +817,10 @@ struct PageHeader {
   /** Compressed (and potentially encrypted) page size in bytes, not including 
this header **/
   3: required i32 compressed_page_size
 
-  /** The 32-bit CRC checksum for the page, to be be calculated as follows:
+  /** The 32-bit CRC checksum for the page, to be calculated as follows:
    *
    * - The standard CRC32 algorithm is used (with polynomial 0x04C11DB7,
-   *   the same as in e.g. GZip).
+   *   the same as in e.g. GZIP).
    * - All page types can have a CRC (v1 and v2 data pages, dictionary pages,
    *   etc.).
    * - The CRC is computed on the serialization binary representation of the 
page
@@ -988,7 +989,7 @@ struct ColumnChunk {
     **/
   1: optional string file_path
 
-  /** Deprecated: Byte offset in file_path to the ColumnMetaData
+  /** DEPRECATED: Byte offset in file_path to the ColumnMetaData
    *
    * Past use of this field has been inconsistent, with some implementations
    * using it to point to the ColumnMetaData and some using it to point to
@@ -1201,8 +1202,8 @@ struct PageLocation {
   1: required i64 offset
 
   /**
-   * Size of the page, including header. Sum of compressed_page_size and header
-   * length
+   * Size of the page, including header. Equal to the sum of the page's
+   * PageHeader.compressed_page_size and the size of the serialized PageHeader.
    */
   2: required i32 compressed_page_size
 
@@ -1230,7 +1231,7 @@ struct OffsetIndex {
   /**
    * Unencoded/uncompressed size for BYTE_ARRAY types.
    *
-   * See documention for unencoded_byte_array_data_bytes in SizeStatistics for
+   * See documentation for unencoded_byte_array_data_bytes in SizeStatistics for
    * more details on this field.
    */
   2: optional list<i64> unencoded_byte_array_data_bytes
@@ -1260,7 +1261,7 @@ struct ColumnIndex {
    * Two lists containing lower and upper bounds for the values of each page
    * determined by the ColumnOrder of the column. These may be the actual
    * minimum and maximum values found on a page, but can also be (more compact)
-   * values that do not exist on a page. For example, instead of storing ""Blart
+   * values that do not exist on a page. For example, instead of storing "Blart
    * Versenwald III", a writer may set min_values[i]="B", max_values[i]="C".
    * Such more compact values must still be valid values within the column's
    * logical type. Readers must make sure that list entries are populated 
before
@@ -1399,7 +1400,7 @@ struct FileMetaData {
    * Sort order used for the min_value and max_value fields in the Statistics
    * objects and the min_values and max_values fields in the ColumnIndex
    * objects of each column in this file. Sort orders are listed in the order
-   * matching the columns in the schema. The indexes are not necessary the same
+   * matching the columns in the schema. The indexes are not necessarily the same
    * though, because only leaf nodes of the schema are represented in the list
    * of sort orders.
    *

Reply via email to