This is an automated email from the ASF dual-hosted git repository.
wgtmac pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-format.git
The following commit(s) were added to refs/heads/master by this push:
new c4b3ef2 Fix typos, grammar, and comment inconsistencies in parquet.thrift (#573)
c4b3ef2 is described below
commit c4b3ef2f79e1c6ae7418f5cb7b17dcc067b1f7ab
Author: Ismaël Mejía <[email protected]>
AuthorDate: Mon Jun 8 15:55:55 2026 +0200
Fix typos, grammar, and comment inconsistencies in parquet.thrift (#573)
- Fix typos: "to be be", "documention", "not necessary"
- Remove off-by-one in DataPageHeaderV2 is_compressed comment
- Fix article agreement ("a element" -> "an element", "a OffsetIndex" ->
"an OffsetIndex")
- Disambiguate compressed_page_size comment in PageLocation
- Fix "edges interpolation" -> "`edges` interpolation" in Geospatial comments
- Capitalize proper nouns: Hive, Pig; normalize GZIP casing
- Add terminal periods for consistency
- Clarify BIT_PACKED is superseded by RLE (cross-reference Encodings.md)
- Add missing space before parenthesis in frameworks list
Thrift validation passes after these edits.
---
src/main/thrift/parquet.thrift | 41 +++++++++++++++++++++--------------------
1 file changed, 21 insertions(+), 20 deletions(-)
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 225f85f..fe259d6 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -41,7 +41,7 @@ enum Type {
}
/**
- * DEPRECATED: Common types used by frameworks(e.g. hive, pig) using parquet.
+ * DEPRECATED: Common types used by frameworks (e.g. Hive, Pig) using parquet.
* ConvertedType is superseded by LogicalType. This enum should not be extended.
*
* See LogicalTypes.md for conversion between ConvertedType and LogicalType.
@@ -431,7 +431,7 @@ enum EdgeInterpolationAlgorithm {
/**
* Embedded Geometry logical type annotation
*
- * Geospatial features in the Well-Known Binary (WKB) format and edges interpolation
+ * Geospatial features in the Well-Known Binary (WKB) format and `edges` interpolation
* is always linear/planar.
*
* A custom CRS can be set by the crs field. If unset, it defaults to "OGC:CRS84",
@@ -450,13 +450,13 @@ struct GeometryType {
* Embedded Geography logical type annotation
*
* Geospatial features in the WKB format with an explicit (non-linear/non-planar)
- * edges interpolation algorithm.
+ * `edges` interpolation algorithm.
*
* A custom geographic CRS can be set by the crs field, where longitudes are
* bound by [-180, 180] and latitudes are bound by [-90, 90]. If unset, the CRS
* defaults to "OGC:CRS84".
*
- * An optional algorithm can be set to correctly interpret edges interpolation
+ * An optional algorithm can be set to correctly interpret `edges` interpolation
* of the geometries. If unset, the algorithm defaults to SPHERICAL.
*
* Allowed for physical type: BYTE_ARRAY.
@@ -504,7 +504,7 @@ union LogicalType {
}
/**
- * Represents a element inside a schema definition.
+ * Represents an element inside a schema definition.
* - if it is a group (inner node) then type is undefined and num_children is defined
* - if it is a primitive type (leaf) then type is defined and num_children is undefined
* the nodes are listed in depth first traversal order.
@@ -583,15 +583,15 @@ enum Encoding {
PLAIN = 0;
/** Group VarInt encoding for INT32/INT64.
- * This encoding is deprecated. It was never used
+ * This encoding is deprecated. It was never used.
*/
// GROUP_VAR_INT = 1;
/**
- * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
+ * DEPRECATED: Dictionary encoding. The values in the dictionary are encoded in the
* plain type.
- * in a data page use RLE_DICTIONARY instead.
- * in a Dictionary page use PLAIN instead
+ * For a data page use RLE_DICTIONARY instead.
+ * For a Dictionary page use PLAIN instead.
*/
PLAIN_DICTIONARY = 2;
@@ -600,8 +600,9 @@ enum Encoding {
*/
RLE = 3;
- /** Bit packed encoding. This can only be used if the data has a known max
+ /** DEPRECATED: Bit packed encoding. This can only be used if the data has a known max
* width. Usable for definition/repetition levels encoding.
+ * Superseded by RLE (which is a hybrid of RLE and bit packing); see Encodings.md.
*/
BIT_PACKED = 4;
@@ -679,7 +680,7 @@ struct DataPageHeader {
/**
* Number of values, including NULLs, in this data page.
*
- * If a OffsetIndex is present, a page must begin at a row
+ * If an OffsetIndex is present, a page must begin at a row
* boundary (repetition_level = 0). Otherwise, pages may begin
* within a row (repetition_level > 0).
**/
@@ -752,7 +753,7 @@ struct DataPageHeaderV2 {
/** Whether the values are compressed.
Which means the section of the page between
- definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
+ definition_levels_byte_length + repetition_levels_byte_length and compressed_page_size (included)
is compressed with the compression_codec.
If missing it is considered compressed */
7: optional bool is_compressed = true;
@@ -816,10 +817,10 @@ struct PageHeader {
/** Compressed (and potentially encrypted) page size in bytes, not including this header **/
3: required i32 compressed_page_size
- /** The 32-bit CRC checksum for the page, to be be calculated as follows:
+ /** The 32-bit CRC checksum for the page, to be calculated as follows:
*
* - The standard CRC32 algorithm is used (with polynomial 0x04C11DB7,
- * the same as in e.g. GZip).
+ * the same as in e.g. GZIP).
* - All page types can have a CRC (v1 and v2 data pages, dictionary pages,
* etc.).
* - The CRC is computed on the serialization binary representation of the page
@@ -988,7 +989,7 @@ struct ColumnChunk {
**/
1: optional string file_path
- /** Deprecated: Byte offset in file_path to the ColumnMetaData
+ /** DEPRECATED: Byte offset in file_path to the ColumnMetaData
*
* Past use of this field has been inconsistent, with some implementations
* using it to point to the ColumnMetaData and some using it to point to
@@ -1201,8 +1202,8 @@ struct PageLocation {
1: required i64 offset
/**
- * Size of the page, including header. Sum of compressed_page_size and header
- * length
+ * Size of the page, including header. Equal to the sum of the page's
+ * PageHeader.compressed_page_size and the size of the serialized PageHeader.
*/
2: required i32 compressed_page_size
@@ -1230,7 +1231,7 @@ struct OffsetIndex {
/**
* Unencoded/uncompressed size for BYTE_ARRAY types.
*
- * See documention for unencoded_byte_array_data_bytes in SizeStatistics for
+ * See documentation for unencoded_byte_array_data_bytes in SizeStatistics for
* more details on this field.
*/
2: optional list<i64> unencoded_byte_array_data_bytes
@@ -1260,7 +1261,7 @@ struct ColumnIndex {
* Two lists containing lower and upper bounds for the values of each page
* determined by the ColumnOrder of the column. These may be the actual
* minimum and maximum values found on a page, but can also be (more compact)
- * values that do not exist on a page. For example, instead of storing ""Blart
+ * values that do not exist on a page. For example, instead of storing "Blart
* Versenwald III", a writer may set min_values[i]="B", max_values[i]="C".
* Such more compact values must still be valid values within the column's
* logical type. Readers must make sure that list entries are populated before
@@ -1399,7 +1400,7 @@ struct FileMetaData {
* Sort order used for the min_value and max_value fields in the Statistics
* objects and the min_values and max_values fields in the ColumnIndex
* objects of each column in this file. Sort orders are listed in the order
- * matching the columns in the schema. The indexes are not necessary the same
+ * matching the columns in the schema. The indexes are not necessarily the same
* though, because only leaf nodes of the schema are represented in the list
* of sort orders.
*