rok commented on code in PR #544: URL: https://github.com/apache/parquet-format/pull/544#discussion_r2915242577
########## src/main/flatbuf/parquet3.fbs: ########## @@ -0,0 +1,604 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +namespace parquet.format; + +// The FlatBuffers footer preserves the same information as the Thrift Parquet footer, +// while removing duplicated fields, unused details, and inefficient encodings that +// waste space and memory. +// It can currently be attached as a footer extension, and may fully replace the +// Thrift footer in the future. +// +// Optimization notes: +// 1. Statistics use fixed-width integral types when possible; otherwise they are +// encoded as prefix + suffix. +// 2. ColumnChunk file_path and file_offset are removed since they are unused. +// 3. ColumnMetaData.encoding_stats are removed and replaced by +// ColumnMetaData.is_fully_dict_encoded. +// 4. ColumnMetaData.path_in_schema is removed since it can be derived from the schema. +// 5. ConvertedType is fully dropped as it is superseded by LogicalType. +// 6. Offset and column indexes are removed since they are small and their offsets +// alone take comparable space. + +/** + * Types supported by Parquet. These types are intended to be used in combination + * with the encodings to control the on disk storage format. + * For example INT16 is not included as a type since a good encoding of INT32 + * would handle this. + */ +enum Type : byte { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, // deprecated, new Parquet writers should not write data in INT96 + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7, +} + +/** + * Representation of Schemas + */ +enum FieldRepetitionType : byte { + /** This field is required (can not be null) and each row has exactly 1 value. */ + REQUIRED = 0, + + /** The field is optional (can be null) and each row has 0 or 1 values. */ + OPTIONAL = 1, + + /** The field is repeated and can contain 0 or more values */ + REPEATED = 2, +} + +/** + * Encodings supported by Parquet. Not all encodings are valid for all types. These + * enums are also used to specify the encoding of definition and repetition levels. + * See the accompanying doc for the details of the more complicated encodings. + * Note: Match the thrift enum values so that we can cast between them. + */ +enum Encoding : byte { + /** Default encoding. + * BOOLEAN - 1 bit per value. 0 is false; 1 is true. + * INT32 - 4 bytes per value. Stored as little-endian. + * INT64 - 8 bytes per value. Stored as little-endian. + * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. + * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. + * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + * FIXED_LEN_BYTE_ARRAY - Just the bytes. + */ + PLAIN = 0, + + /** Group VarInt encoding for INT32/INT64. + * This encoding is deprecated. It was never used + */ + // GROUP_VAR_INT = 1, + + /** + * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the + * plain type. + * in a data page use RLE_DICTIONARY instead. + * in a Dictionary page use PLAIN instead + */ + PLAIN_DICTIONARY = 2, + + /** Group packed run length encoding. Usable for definition/repetition levels + * encoding and Booleans (on one bit: 0 is false; 1 is true.) + */ + RLE = 3, + + /** Bit packed encoding. This can only be used if the data has a known max + * width. Usable for definition/repetition levels encoding. + * This encoding is deprecated and is replaced by the RLE/bit-packing hybrid encoding. + */ + // BIT_PACKED = 4, + + /** Delta encoding for integers. This can be used for int columns and works best + * on sorted data + */ + DELTA_BINARY_PACKED = 5, + + /** Encoding for byte arrays to separate the length values and the data. The lengths + * are encoded using DELTA_BINARY_PACKED + */ + DELTA_LENGTH_BYTE_ARRAY = 6, + + /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. + * Suffixes are stored as delta length byte arrays. + */ + DELTA_BYTE_ARRAY = 7, + + /** Dictionary encoding: the ids are encoded using the RLE encoding + */ + RLE_DICTIONARY = 8, + + /** Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY). + K byte-streams are created where K is the size in bytes of the data type. + The individual bytes of a value are scattered to the corresponding stream and + the streams are concatenated. + This itself does not reduce the size of the data but can lead to better compression + afterwards. + + Added in 2.8 for FLOAT and DOUBLE. + Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. + */ + BYTE_STREAM_SPLIT = 9, +} + +/** + * Supported compression algorithms. + * + * Codecs added in format version X.Y can be read by readers based on X.Y and later. + * Codec support may vary between readers based on the format version and + * libraries available at runtime. + * + * See Compression.md for a detailed specification of these algorithms. + * Note: Match the thrift enum values so that we can cast between them. + */ +enum CompressionCodec : byte { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + BROTLI = 4, // Added in 2.4 + LZ4 = 5, // DEPRECATED (Added in 2.4) + ZSTD = 6, // Added in 2.4 + LZ4_RAW = 7, // Added in 2.9 +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Logical types. +/////////////////////////////////////////////////////////////////////////////////////////////////// + +table Empty {} + +/** + * Decimal logical type annotation + * + * Scale must be zero or a positive integer less than or equal to the precision. + * Precision must be a non-zero positive integer. + * + * To maintain forward-compatibility in v1, implementations using this logical + * type must also set scale and precision on the annotated SchemaElement. + * + * Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. + */ +table DecimalOptions { + precision: int; + scale: int; +} + +/** Time units for logical types */ +enum TimeUnit : byte { + MILLIS = 0, + MICROS = 1, + NANOS = 2, +} + +/** + * Timestamp logical type annotation + * + * Allowed for physical types: INT64 + */ +table TimeOptions { + is_adjusted_to_utc: bool; + unit: TimeUnit; +} + +/** + * Integer logical type annotation + * + * bitWidth must be 8, 16, 32, or 64. + * + * Allowed for physical types: INT32, INT64 + */ +table IntOptions { + bit_width: byte = 8; + is_signed: bool; +} + +/** + * Embedded Variant logical type annotation + */ +table VariantType { + // The version of the variant specification that the variant was + // written with. + specification_version: byte = null; +} + +/** Edge interpolation algorithm for Geography logical type */ +enum EdgeInterpolationAlgorithm : byte { + SPHERICAL = 0, + VINCENTY = 1, + THOMAS = 2, + ANDOYER = 3, + KARNEY = 4, +} + +/** + * Embedded Geometry logical type annotation + * + * Geospatial features in the Well-Known Binary (WKB) format and edges interpolation + * is always linear/planar. + * + * A custom CRS can be set by the crs field. If unset, it defaults to "OGC:CRS84", + * which means that the geometries must be stored in longitude, latitude based on + * the WGS84 datum. + * + * Allowed for physical type: BYTE_ARRAY. + * + * See Geospatial.md for details. + */ +table GeometryType { + crs: string; +} + +/** + * Embedded Geography logical type annotation + * + * Geospatial features in the WKB format with an explicit (non-linear/non-planar) + * edges interpolation algorithm. + * + * A custom geographic CRS can be set by the crs field, where longitudes are + * bound by [-180, 180] and latitudes are bound by [-90, 90]. If unset, the CRS + * defaults to "OGC:CRS84". + * + * An optional algorithm can be set to correctly interpret edges interpolation + * of the geometries. If unset, the algorithm defaults to SPHERICAL. + * + * Allowed for physical type: BYTE_ARRAY. + * + * See Geospatial.md for details. + */ +table GeographyType { + crs: string; + algorithm: EdgeInterpolationAlgorithm; +} + +/** + * LogicalType annotations to replace ConvertedType. + */ +union LogicalType { + StringType:Empty, + MapType:Empty, + ListType:Empty, + EnumType:Empty, + DecimalType:DecimalOptions, + DateType:Empty, + TimeType:TimeOptions, + TimestampType:TimeOptions, + IntType:IntOptions, + NullType:Empty, + JsonType:Empty, + BsonType:Empty, + UUIDType:Empty, + Float16Type:Empty, + VariantType:VariantType, + GeometryType:GeometryType, + GeographyType:GeographyType, +} + +table Statistics { + null_count: int = null; + // Store min/max values as fixed-width entities depending on the physical type. + // If min_len/max_len is present then the corresponding min/max value is present. + // + // - BOOLEAN: none + // - INT32/FLOAT: min_lo4/max_lo4 (little-endian, 4 bytes) + // - INT64/DOUBLE: min_lo8/max_lo8 (little-endian, 8 bytes) + // - INT96: lo4 contains the low 4 bytes, lo8 contains the high 8 bytes (little-endian, 12 bytes total) + // - FIXED_LEN_BYTE_ARRAY: + // - BYTE_ARRAY: + // prefix: the longest common prefix of min and max values + // lo8+hi8: zero-padded 16 bytes (big-endian) of the suffix after removing the prefix + // min_len/max_len: the length of the suffix of the original value after removing the prefix. + // If > 16 then the value stored in lo8+hi8 is a truncated approximation (inexact). + // If <= 16 then the value is exact. + // + // Example for BYTE_ARRAY with min="apple" and max="application": + // prefix = "appl" (longest common prefix) + // min suffix = "e" (1 byte), max suffix = "ication" (7 bytes) + // min_lo8 = big-endian encoding of "e" zero-padded to 16 bytes + // min_len = 1 (exact, since 1 <= 16) + // max_lo8 = big-endian encoding of "ication" zero-padded to 16 bytes + // max_len = 7 (exact, since 7 <= 16) + // + // Example for INT32 with min=42: + // min_lo4 = 0x2A000000 (42 in little-endian) + min_lo4: uint; + min_lo8: ulong; + min_hi8: ulong; + min_len: byte = null; + max_lo4: uint; + max_lo8: ulong; + max_hi8: ulong; + max_len: byte = null; + prefix: string; Review Comment: Shouldn't this be `binary`? ########## src/main/flatbuf/parquet3.fbs: ########## @@ -0,0 +1,604 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +namespace parquet.format; + +// The FlatBuffers footer preserves the same information as the Thrift Parquet footer, +// while removing duplicated fields, unused details, and inefficient encodings that +// waste space and memory. +// It can currently be attached as a footer extension, and may fully replace the +// Thrift footer in the future. +// +// Optimization notes: +// 1. Statistics use fixed-width integral types when possible; otherwise they are +// encoded as prefix + suffix. +// 2. ColumnChunk file_path and file_offset are removed since they are unused. +// 3. ColumnMetaData.encoding_stats are removed and replaced by +// ColumnMetaData.is_fully_dict_encoded. +// 4. ColumnMetaData.path_in_schema is removed since it can be derived from the schema. +// 5. ConvertedType is fully dropped as it is superseded by LogicalType. +// 6. Offset and column indexes are removed since they are small and their offsets +// alone take comparable space. + +/** + * Types supported by Parquet. These types are intended to be used in combination + * with the encodings to control the on disk storage format. + * For example INT16 is not included as a type since a good encoding of INT32 + * would handle this. + */ +enum Type : byte { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, // deprecated, new Parquet writers should not write data in INT96 + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7, +} + +/** + * Representation of Schemas + */ +enum FieldRepetitionType : byte { + /** This field is required (can not be null) and each row has exactly 1 value. */ + REQUIRED = 0, + + /** The field is optional (can be null) and each row has 0 or 1 values. */ + OPTIONAL = 1, + + /** The field is repeated and can contain 0 or more values */ + REPEATED = 2, +} + +/** + * Encodings supported by Parquet. Not all encodings are valid for all types. These + * enums are also used to specify the encoding of definition and repetition levels. + * See the accompanying doc for the details of the more complicated encodings. + * Note: Match the thrift enum values so that we can cast between them. + */ +enum Encoding : byte { + /** Default encoding. + * BOOLEAN - 1 bit per value. 0 is false; 1 is true. + * INT32 - 4 bytes per value. Stored as little-endian. + * INT64 - 8 bytes per value. Stored as little-endian. + * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. + * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. + * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + * FIXED_LEN_BYTE_ARRAY - Just the bytes. + */ + PLAIN = 0, + + /** Group VarInt encoding for INT32/INT64. + * This encoding is deprecated. It was never used + */ + // GROUP_VAR_INT = 1, + + /** + * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the + * plain type. + * in a data page use RLE_DICTIONARY instead. + * in a Dictionary page use PLAIN instead + */ + PLAIN_DICTIONARY = 2, + + /** Group packed run length encoding. Usable for definition/repetition levels + * encoding and Booleans (on one bit: 0 is false; 1 is true.) + */ + RLE = 3, + + /** Bit packed encoding. This can only be used if the data has a known max + * width. Usable for definition/repetition levels encoding. + * This encoding is deprecated and is replaced by the RLE/bit-packing hybrid encoding. + */ + // BIT_PACKED = 4, + + /** Delta encoding for integers. This can be used for int columns and works best + * on sorted data + */ + DELTA_BINARY_PACKED = 5, + + /** Encoding for byte arrays to separate the length values and the data. The lengths + * are encoded using DELTA_BINARY_PACKED + */ + DELTA_LENGTH_BYTE_ARRAY = 6, + + /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. + * Suffixes are stored as delta length byte arrays. + */ + DELTA_BYTE_ARRAY = 7, + + /** Dictionary encoding: the ids are encoded using the RLE encoding + */ + RLE_DICTIONARY = 8, + + /** Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY). + K byte-streams are created where K is the size in bytes of the data type. + The individual bytes of a value are scattered to the corresponding stream and + the streams are concatenated. + This itself does not reduce the size of the data but can lead to better compression + afterwards. + + Added in 2.8 for FLOAT and DOUBLE. + Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. + */ + BYTE_STREAM_SPLIT = 9, +} + +/** + * Supported compression algorithms. + * + * Codecs added in format version X.Y can be read by readers based on X.Y and later. + * Codec support may vary between readers based on the format version and + * libraries available at runtime. + * + * See Compression.md for a detailed specification of these algorithms. + * Note: Match the thrift enum values so that we can cast between them. + */ +enum CompressionCodec : byte { + UNCOMPRESSED = 0, + SNAPPY = 1, + GZIP = 2, + LZO = 3, + BROTLI = 4, // Added in 2.4 + LZ4 = 5, // DEPRECATED (Added in 2.4) + ZSTD = 6, // Added in 2.4 + LZ4_RAW = 7, // Added in 2.9 +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Logical types. +/////////////////////////////////////////////////////////////////////////////////////////////////// + +table Empty {} + +/** + * Decimal logical type annotation + * + * Scale must be zero or a positive integer less than or equal to the precision. + * Precision must be a non-zero positive integer. + * + * To maintain forward-compatibility in v1, implementations using this logical + * type must also set scale and precision on the annotated SchemaElement. + * + * Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. + */ +table DecimalOptions { + precision: int; + scale: int; +} Review Comment: In thrift these are in reverse order. Any reason to change that here? https://github.com/apache/parquet-format/blob/38818fa0e7efd54b535001a4448030a40619c2a3/src/main/thrift/parquet.thrift#L343-L346 ########## src/main/flatbuf/parquet3.fbs: ########## @@ -0,0 +1,604 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +namespace parquet.format; + +// The FlatBuffers footer preserves the same information as the Thrift Parquet footer, +// while removing duplicated fields, unused details, and inefficient encodings that +// waste space and memory. +// It can currently be attached as a footer extension, and may fully replace the +// Thrift footer in the future. +// +// Optimization notes: +// 1. Statistics use fixed-width integral types when possible; otherwise they are +// encoded as prefix + suffix. Review Comment: `SizeStatistics` and `Statistics.distinct_count` are removed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
