gene-db commented on code in PR #3197:
URL: https://github.com/apache/parquet-java/pull/3197#discussion_r2056800469


##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+/**
+ * This Variant class holds the Variant-encoded value and metadata binary 
values.
+ */
+public final class Variant {
+  /** The buffer that contains the Variant value. */
+  final ByteBuffer value;
+
+  /** The buffer that contains the Variant metadata. */
+  final ByteBuffer metadata;
+
+  /**
+   * The threshold to switch from linear search to binary search when looking 
up a field by key in
+   * an object. This is a performance optimization to avoid the overhead of 
binary search for a
+   * short list.
+   */
+  static final int BINARY_SEARCH_THRESHOLD = 32;
+
+  public Variant(byte[] value, byte[] metadata) {
+    this(value, 0, value.length, metadata, 0, metadata.length);
+  }
+
+  public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, 
int metadataPos, int metadataLength) {
+    this(ByteBuffer.wrap(value, valuePos, valueLength), 
ByteBuffer.wrap(metadata, metadataPos, metadataLength));
+  }
+
+  public Variant(ByteBuffer value, ByteBuffer metadata) {
+    // The buffers are read single-byte at a time, so the endianness of the 
input buffers

Review Comment:
   Fixed.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+/**
+ * This Variant class holds the Variant-encoded value and metadata binary 
values.
+ */
+public final class Variant {
+  /** The buffer that contains the Variant value. */
+  final ByteBuffer value;
+
+  /** The buffer that contains the Variant metadata. */
+  final ByteBuffer metadata;
+
+  /**
+   * The threshold to switch from linear search to binary search when looking 
up a field by key in
+   * an object. This is a performance optimization to avoid the overhead of 
binary search for a
+   * short list.
+   */
+  static final int BINARY_SEARCH_THRESHOLD = 32;
+
+  public Variant(byte[] value, byte[] metadata) {
+    this(value, 0, value.length, metadata, 0, metadata.length);
+  }
+
+  public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, 
int metadataPos, int metadataLength) {
+    this(ByteBuffer.wrap(value, valuePos, valueLength), 
ByteBuffer.wrap(metadata, metadataPos, metadataLength));
+  }
+
+  public Variant(ByteBuffer value, ByteBuffer metadata) {
+    // The buffers are read single-byte at a time, so the endianness of the 
input buffers
+    // is not important.
+    this.value = value.asReadOnlyBuffer();
+    this.metadata = metadata.asReadOnlyBuffer();
+
+    // There is currently only one allowed version.
+    if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != 
VariantUtil.VERSION) {
+      throw new UnsupportedOperationException(String.format(
+          "Unsupported variant metadata version: %d",
+          metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
+    }
+  }
+
+  /**
+   * @return the boolean value
+   */
+  public boolean getBoolean() {
+    return VariantUtil.getBoolean(value);
+  }
+
+  /**
+   * @return the byte value
+   */
+  public byte getByte() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Byte.MIN_VALUE || longValue > Byte.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for byte: " + 
longValue);
+    }
+    return (byte) longValue;
+  }
+
+  /**
+   * @return the short value
+   */
+  public short getShort() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Short.MIN_VALUE || longValue > Short.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for short: " + 
longValue);
+    }
+    return (short) longValue;
+  }
+
+  /**
+   * @return the int value
+   */
+  public int getInt() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Integer.MIN_VALUE || longValue > Integer.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for int: " + 
longValue);
+    }
+    return (int) longValue;
+  }
+
+  /**
+   * @return the long value
+   */
+  public long getLong() {
+    return VariantUtil.getLong(value);
+  }
+
+  /**
+   * @return the double value
+   */
+  public double getDouble() {
+    return VariantUtil.getDouble(value);
+  }
+
+  /**
+   * @return the decimal value
+   */
+  public BigDecimal getDecimal() {
+    return VariantUtil.getDecimal(value);
+  }
+
+  /**
+   * @return the float value
+   */
+  public float getFloat() {
+    return VariantUtil.getFloat(value);
+  }
+
+  /**
+   * @return the binary value
+   */
+  public ByteBuffer getBinary() {
+    return VariantUtil.getBinary(value);
+  }
+
+  /**
+   * @return the UUID value
+   */
+  public UUID getUUID() {
+    return VariantUtil.getUUID(value);
+  }
+
+  /**
+   * @return the string value
+   */
+  public String getString() {
+    return VariantUtil.getString(value);
+  }
+
+  /**
+   * The value type of Variant value. It is determined by the header byte.
+   */
+  public enum Type {
+    OBJECT,
+    ARRAY,
+    NULL,
+    BOOLEAN,
+    BYTE,
+    SHORT,
+    INT,
+    LONG,
+    STRING,
+    DOUBLE,
+    DECIMAL4,
+    DECIMAL8,
+    DECIMAL16,
+    DATE,
+    TIMESTAMP_TZ,
+    TIMESTAMP_NTZ,
+    FLOAT,
+    BINARY,
+    TIME,
+    TIMESTAMP_NANOS,
+    TIMESTAMP_NANOS_NTZ,
+    UUID
+  }
+
+  /**
+   * @return the type of the variant value
+   */
+  public Type getType() {
+    return VariantUtil.getType(value);
+  }
+
+  /**
+   * @return the number of object fields in the variant. `getType()` must be 
`Type.OBJECT`.
+   */
+  public int numObjectElements() {
+    return VariantUtil.getObjectInfo(value).numElements;
+  }
+
+  /**
+   * Returns the object field Variant value whose key is equal to `key`.
+   * Return null if the key is not found. `getType()` must be `Type.OBJECT`.
+   * @param key the key to look up
+   * @return the field value whose key is equal to `key`, or null if key is 
not found
+   */
+  public Variant getFieldByKey(String key) {
+    VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value);
+    // Use linear search for a short list. Switch to binary search when the 
length reaches
+    // `BINARY_SEARCH_THRESHOLD`.
+    if (info.numElements < BINARY_SEARCH_THRESHOLD) {
+      for (int i = 0; i < info.numElements; ++i) {
+        ObjectField field = getFieldAtIndex(
+            i,
+            value,
+            metadata,
+            info.idSize,
+            info.offsetSize,
+            value.position() + info.idStartOffset,
+            value.position() + info.offsetStartOffset,
+            value.position() + info.dataStartOffset);
+        if (field.key.equals(key)) {
+          return field.value;
+        }
+      }
+    } else {
+      int low = 0;
+      int high = info.numElements - 1;
+      while (low <= high) {
+        // Use unsigned right shift to compute the middle of `low` and `high`. 
This is not only a
+        // performance optimization; it also properly handles the case 
where `low + high`
+        // overflows int.
+        int mid = (low + high) >>> 1;
+        ObjectField field = getFieldAtIndex(
+            mid,
+            value,
+            metadata,
+            info.idSize,
+            info.offsetSize,
+            value.position() + info.idStartOffset,
+            value.position() + info.offsetStartOffset,
+            value.position() + info.dataStartOffset);
+        int cmp = field.key.compareTo(key);
+        if (cmp < 0) {
+          low = mid + 1;
+        } else if (cmp > 0) {
+          high = mid - 1;
+        } else {
+          return field.value;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * A field in a Variant object.
+   */
+  public static final class ObjectField {

Review Comment:
   Updated.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+/**
+ * This Variant class holds the Variant-encoded value and metadata binary 
values.
+ */
+public final class Variant {
+  /** The buffer that contains the Variant value. */
+  final ByteBuffer value;
+
+  /** The buffer that contains the Variant metadata. */
+  final ByteBuffer metadata;
+
+  /**
+   * The threshold to switch from linear search to binary search when looking 
up a field by key in
+   * an object. This is a performance optimization to avoid the overhead of 
binary search for a
+   * short list.
+   */
+  static final int BINARY_SEARCH_THRESHOLD = 32;
+
+  public Variant(byte[] value, byte[] metadata) {
+    this(value, 0, value.length, metadata, 0, metadata.length);
+  }
+
+  public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, 
int metadataPos, int metadataLength) {
+    this(ByteBuffer.wrap(value, valuePos, valueLength), 
ByteBuffer.wrap(metadata, metadataPos, metadataLength));
+  }
+
+  public Variant(ByteBuffer value, ByteBuffer metadata) {
+    // The buffers are read single-byte at a time, so the endianness of the 
input buffers
+    // is not important.

Review Comment:
   Fixed.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+/**
+ * This Variant class holds the Variant-encoded value and metadata binary 
values.
+ */
+public final class Variant {
+  /** The buffer that contains the Variant value. */
+  final ByteBuffer value;
+
+  /** The buffer that contains the Variant metadata. */
+  final ByteBuffer metadata;
+
+  /**
+   * The threshold to switch from linear search to binary search when looking 
up a field by key in
+   * an object. This is a performance optimization to avoid the overhead of 
binary search for a
+   * short list.
+   */
+  static final int BINARY_SEARCH_THRESHOLD = 32;
+
+  public Variant(byte[] value, byte[] metadata) {
+    this(value, 0, value.length, metadata, 0, metadata.length);
+  }
+
+  public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, 
int metadataPos, int metadataLength) {
+    this(ByteBuffer.wrap(value, valuePos, valueLength), 
ByteBuffer.wrap(metadata, metadataPos, metadataLength));
+  }
+
+  public Variant(ByteBuffer value, ByteBuffer metadata) {
+    // The buffers are read single-byte at a time, so the endianness of the 
input buffers
+    // is not important.
+    this.value = value.asReadOnlyBuffer();
+    this.metadata = metadata.asReadOnlyBuffer();
+
+    // There is currently only one allowed version.
+    if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != 
VariantUtil.VERSION) {
+      throw new UnsupportedOperationException(String.format(
+          "Unsupported variant metadata version: %d",
+          metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
+    }
+  }
+
+  /**
+   * @return the boolean value
+   */
+  public boolean getBoolean() {
+    return VariantUtil.getBoolean(value);
+  }
+
+  /**
+   * @return the byte value
+   */
+  public byte getByte() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Byte.MIN_VALUE || longValue > Byte.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for byte: " + 
longValue);
+    }
+    return (byte) longValue;
+  }
+
+  /**
+   * @return the short value
+   */
+  public short getShort() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Short.MIN_VALUE || longValue > Short.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for short: " + 
longValue);
+    }
+    return (short) longValue;
+  }
+
+  /**
+   * @return the int value
+   */
+  public int getInt() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Integer.MIN_VALUE || longValue > Integer.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for int: " + 
longValue);
+    }
+    return (int) longValue;
+  }
+
+  /**
+   * @return the long value
+   */
+  public long getLong() {
+    return VariantUtil.getLong(value);
+  }
+
+  /**
+   * @return the double value
+   */
+  public double getDouble() {
+    return VariantUtil.getDouble(value);
+  }
+
+  /**
+   * @return the decimal value
+   */
+  public BigDecimal getDecimal() {
+    return VariantUtil.getDecimal(value);
+  }
+
+  /**
+   * @return the float value
+   */
+  public float getFloat() {
+    return VariantUtil.getFloat(value);
+  }
+
+  /**
+   * @return the binary value
+   */
+  public ByteBuffer getBinary() {
+    return VariantUtil.getBinary(value);
+  }
+
+  /**
+   * @return the UUID value
+   */
+  public UUID getUUID() {
+    return VariantUtil.getUUID(value);
+  }
+
+  /**
+   * @return the string value
+   */
+  public String getString() {
+    return VariantUtil.getString(value);
+  }
+
+  /**
+   * The value type of Variant value. It is determined by the header byte.
+   */
+  public enum Type {
+    OBJECT,
+    ARRAY,
+    NULL,
+    BOOLEAN,
+    BYTE,
+    SHORT,
+    INT,
+    LONG,
+    STRING,
+    DOUBLE,
+    DECIMAL4,
+    DECIMAL8,
+    DECIMAL16,
+    DATE,
+    TIMESTAMP_TZ,
+    TIMESTAMP_NTZ,
+    FLOAT,
+    BINARY,
+    TIME,
+    TIMESTAMP_NANOS,
+    TIMESTAMP_NANOS_NTZ,
+    UUID
+  }
+
+  /**
+   * @return the type of the variant value
+   */
+  public Type getType() {
+    return VariantUtil.getType(value);
+  }
+
+  /**
+   * @return the number of object fields in the variant. `getType()` must be 
`Type.OBJECT`.

Review Comment:
   This will throw an `IllegalArgumentException`. Updated the comments.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/MalformedVariantException.java:
##########
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+/**
+ * An exception indicating that the Variant is malformed.
+ */
+public class MalformedVariantException extends RuntimeException {

Review Comment:
   Oops, forgot this one. Removed.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size; 
instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of field ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (the most significant bit is 0), where:
+   *   - b4: the integer type of size. When it is 0/1, `size` is a 
little-endian 1/4-byte
+   *         unsigned integer.
+   *   - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list 
contains
+   *           1/2/3-byte little-endian unsigned integers.
+   *   - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the 
offset list contains
+   *           1/2/3-byte little-endian unsigned integers.
+   */
+  static final int OBJECT = 2;
+  /**
+   * Array value.
+   * The content contains a size, a list of field offsets, and the actual 
element values.
+   * It is similar to an object without the id list. The length of the offset 
list
+   * is `size + 1`, where the last offset represents the total size of the 
element data.
+   * Its type info is: 000_b2_b1b0:
+   *   - b2: the type of size.
+   *   - b1b0: the integer type of offset.
+   */
+  static final int ARRAY = 3;
+
+  // The primitive types
+
+  /** JSON Null value. Empty content. */
+  static final int NULL = 0;
+  /** True value. Empty content. */
+  static final int TRUE = 1;
+  /** False value. Empty content. */
+  static final int FALSE = 2;
+  /** 1-byte little-endian signed integer. */
+  static final int INT8 = 3;
+  /** 2-byte little-endian signed integer. */
+  static final int INT16 = 4;
+  /** 4-byte little-endian signed integer. */
+  static final int INT32 = 5;
+  /** 8-byte little-endian signed integer. */
+  static final int INT64 = 6;
+  /** 8-byte IEEE double. */
+  static final int DOUBLE = 7;
+  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed 
integer. */
+  static final int DECIMAL4 = 8;
+  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed 
integer. */
+  static final int DECIMAL8 = 9;
+  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed 
integer. */
+  static final int DECIMAL16 = 10;
+  /**
+   * Date value. Content is 4-byte little-endian signed integer that 
represents the
+   * number of days from the Unix epoch.
+   */
+  static final int DATE = 11;
+  /**
+   * Timestamp value. Content is 8-byte little-endian signed integer that 
represents the number of
+   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is 
displayed to users in
+   * their local time zones and may be displayed differently depending on the 
execution environment.
+   */
+  static final int TIMESTAMP_TZ = 12;
+  /**
+   * Timestamp_ntz value. It has the same content as `TIMESTAMP_TZ` but should 
always be interpreted
+   * as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NTZ = 13;
+  /** 4-byte IEEE float. */
+  static final int FLOAT = 14;
+  /**
+   * Binary value. The content is (4-byte little-endian unsigned integer 
representing the binary
+   * size) + (size bytes of binary content).
+   */
+  static final int BINARY = 15;
+  /**
+   * Long string value. The content is (4-byte little-endian unsigned integer 
representing the
+   * string size) + (size bytes of string content).
+   */
+  static final int LONG_STR = 16;
+  /**
+   * Time value. Values can be from 00:00:00 to 23:59:59.999999.
+   * Content is 8-byte little-endian unsigned integer that represents the 
number of microseconds
+   * since midnight.
+   */
+  static final int TIME = 17;
+  /**
+   * Timestamp nanos value. Similar to `TIMESTAMP_TZ`, but represents the number 
of nanoseconds
+   * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
+   */
+  static final int TIMESTAMP_NANOS = 18;

Review Comment:
   Renamed.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size; 
instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of field ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (the most significant bit is 0), where:

Review Comment:
   It was supposed to mean "most significant bit".



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size, 
instead, we compute it
+ * with `offsets[i + 1] - offsets[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of field ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (MSB is 0), where:
+   *   - b4: the integer type of size. When it is 0/1, `size` is a 
little-endian 1/4-byte
+   *         unsigned integer.
+   *   - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list 
contains
+   *           1/2/3-byte little-endian unsigned integers.
+   *   - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the 
offset list contains
+   *           1/2/3-byte little-endian unsigned integers.
+   */
+  static final int OBJECT = 2;
+  /**
+   * Array value.
+   * The content contains a size, a list of element offsets, and the actual 
element values.
+   * It is similar to an object without the id list. The length of the offset 
list
+   * is `size + 1`, where the last offset represents the total size of the 
element data.
+   * Its type info is: 000_b2_b1b0:
+   *   - b2: the type of size.
+   *   - b1b0: the integer type of offset.
+   */
+  static final int ARRAY = 3;
+
+  // The primitive types
+
+  /** JSON Null value. Empty content. */
+  static final int NULL = 0;
+  /** True value. Empty content. */
+  static final int TRUE = 1;
+  /** False value. Empty content. */
+  static final int FALSE = 2;
+  /** 1-byte little-endian signed integer. */
+  static final int INT8 = 3;
+  /** 2-byte little-endian signed integer. */
+  static final int INT16 = 4;
+  /** 4-byte little-endian signed integer. */
+  static final int INT32 = 5;
+  /** 8-byte little-endian signed integer. */
+  static final int INT64 = 6;
+  /** 8-byte IEEE double. */
+  static final int DOUBLE = 7;
+  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed 
integer. */
+  static final int DECIMAL4 = 8;
+  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed 
integer. */
+  static final int DECIMAL8 = 9;
+  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed 
integer. */
+  static final int DECIMAL16 = 10;
+  /**
+   * Date value. Content is 4-byte little-endian signed integer that 
represents the
+   * number of days from the Unix epoch.
+   */
+  static final int DATE = 11;
+  /**
+   * Timestamp value. Content is 8-byte little-endian signed integer that 
represents the number of
+   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is 
displayed to users in
+   * their local time zones and may be displayed differently depending on the 
execution environment.
+   */
+  static final int TIMESTAMP_TZ = 12;
+  /**
+   * Timestamp_ntz value. It has the same content as `TIMESTAMP_TZ` but should 
always be interpreted
+   * as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NTZ = 13;
+  /** 4-byte IEEE float. */
+  static final int FLOAT = 14;
+  /**
+   * Binary value. The content is (4-byte little-endian unsigned integer 
representing the binary
+   * size) + (size bytes of binary content).
+   */
+  static final int BINARY = 15;
+  /**
+   * Long string value. The content is (4-byte little-endian unsigned integer 
representing the
+   * string size) + (size bytes of string content).
+   */
+  static final int LONG_STR = 16;
+  /**
+   * Time value. Values can be from 00:00:00 to 23:59:59.999999.
+   * Content is 8-byte little-endian unsigned integer that represents the 
number of microseconds
+   * since midnight.
+   */
+  static final int TIME = 17;
+  /**
+   * Timestamp nanos value. Similar to `TIMESTAMP_TZ`, but represents the number 
of nanoseconds
+   * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
+   */
+  static final int TIMESTAMP_NANOS = 18;
+  /**
+   * Timestamp nanos (without time zone) value. It has the same content as 
`TIMESTAMP_NANOS` but
+   * should always be interpreted as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NANOS_NTZ = 19;
+  /**
+   * UUID value. The content is a 16-byte binary, encoded using big-endian.
+   * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the 
bytes
+   * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff.
+   */
+  static final int UUID = 20;
+
+  // The metadata version.
+  static final byte VERSION = 1;
+  // The lower 4 bits of the first metadata byte contain the version.
+  static final byte VERSION_MASK = 0x0F;
+
+  // Constants for various unsigned integer sizes.
+  static final int U8_MAX = 0xFF;
+  static final int U16_MAX = 0xFFFF;
+  static final int U24_MAX = 0xFFFFFF;
+  static final int U8_SIZE = 1;
+  static final int U16_SIZE = 2;
+  static final int U24_SIZE = 3;
+  static final int U32_SIZE = 4;
+
+  // Max decimal precision for each decimal type.
+  static final int MAX_DECIMAL4_PRECISION = 9;
+  static final int MAX_DECIMAL8_PRECISION = 18;
+  static final int MAX_DECIMAL16_PRECISION = 38;
+
+  // The size (in bytes) of a UUID.
+  static final int UUID_SIZE = 16;
+
+  static byte primitiveHeader(int type) {
+    return (byte) (type << 2 | PRIMITIVE);
+  }
+
+  static byte shortStrHeader(int size) {
+    return (byte) (size << 2 | SHORT_STR);
+  }
+
+  static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4))
+        | ((idSize - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offsetSize - 1) << BASIC_TYPE_BITS)
+        | OBJECT);
+  }
+
+  static byte arrayHeader(boolean largeSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | 
((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+  }
+
+  /**
+   * Check the validity of an array index `pos`.
+   * @param pos The index to check
+   * @param length The length of the array
+   * @throws IllegalArgumentException if the index is out of bound
+   */
+  static void checkIndex(int pos, int length) {
+    if (pos < 0 || pos >= length) {
+      throw new IllegalArgumentException(
+          String.format("Invalid byte-array offset (%d). length: %d", pos, 
length));
+    }
+  }
+
+  /**
+   * Reads a little-endian signed long value from `buffer[pos, pos + 
numBytes)`.
+   * @param buffer The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read
+   * @return The long value
+   */
+  static long readLong(ByteBuffer buffer, int pos, int numBytes) {
+    checkIndex(pos, buffer.limit());
+    checkIndex(pos + numBytes - 1, buffer.limit());
+    long result = 0;
+    // All bytes except the most significant byte should be unsigned-extended 
and shifted
+    // (so we need `& 0xFF`). The most significant byte should be sign-extended 
and is handled
+    // after the loop.
+    for (int i = 0; i < numBytes - 1; ++i) {
+      long unsignedByteValue = buffer.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    long signedByteValue = buffer.get(pos + numBytes - 1);
+    result |= signedByteValue << (8 * (numBytes - 1));
+    return result;
+  }
+
+  /**
+   * Read a little-endian unsigned int value from `bytes[pos, pos + 
numBytes)`. The value must fit
+   * into a non-negative int (`[0, Integer.MAX_VALUE]`).
+   */
+  static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) {
+    checkIndex(pos, bytes.limit());
+    checkIndex(pos + numBytes - 1, bytes.limit());
+    int result = 0;
+    // Similar to the `readLong` loop, but all bytes should be 
unsigned-extended.
+    for (int i = 0; i < numBytes; ++i) {
+      int unsignedByteValue = bytes.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    if (result < 0) {
+      throw new IllegalArgumentException(String.format("Failed to read 
unsigned int. numBytes: %d", numBytes));
+    }
+    return result;
+  }
+
+  /**
+   * Returns the value type of Variant value `value[pos...]`. It is only legal 
to call `get*` if
+   * `getType` returns the corresponding type. For example, it is only legal 
to call
+   * `getLong` if this method returns `Type.LONG`.
+   * @param value The Variant value to get the type from
+   * @return The type of the Variant value
+   */
+  static Variant.Type getType(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    switch (basicType) {
+      case SHORT_STR:
+        return Variant.Type.STRING;
+      case OBJECT:
+        return Variant.Type.OBJECT;
+      case ARRAY:
+        return Variant.Type.ARRAY;
+      default:
+        switch (typeInfo) {
+          case NULL:
+            return Variant.Type.NULL;
+          case TRUE:
+          case FALSE:
+            return Variant.Type.BOOLEAN;
+          case INT8:
+            return Variant.Type.BYTE;
+          case INT16:
+            return Variant.Type.SHORT;
+          case INT32:
+            return Variant.Type.INT;
+          case INT64:
+            return Variant.Type.LONG;
+          case DOUBLE:
+            return Variant.Type.DOUBLE;
+          case DECIMAL4:
+            return Variant.Type.DECIMAL4;
+          case DECIMAL8:
+            return Variant.Type.DECIMAL8;
+          case DECIMAL16:
+            return Variant.Type.DECIMAL16;
+          case DATE:
+            return Variant.Type.DATE;
+          case TIMESTAMP_TZ:
+            return Variant.Type.TIMESTAMP_TZ;
+          case TIMESTAMP_NTZ:
+            return Variant.Type.TIMESTAMP_NTZ;
+          case FLOAT:
+            return Variant.Type.FLOAT;
+          case BINARY:
+            return Variant.Type.BINARY;
+          case LONG_STR:
+            return Variant.Type.STRING;
+          case TIME:
+            return Variant.Type.TIME;
+          case TIMESTAMP_NANOS:
+            return Variant.Type.TIMESTAMP_NANOS;
+          case TIMESTAMP_NANOS_NTZ:
+            return Variant.Type.TIMESTAMP_NANOS_NTZ;
+          case UUID:
+            return Variant.Type.UUID;
+          default:
+            throw new UnsupportedOperationException(
+                String.format("Unknown type in Variant. primitive type: %d", 
typeInfo));
+        }
+    }
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type type) {
+    return new IllegalArgumentException("Expected type to be " + type);

Review Comment:
   Yeah, updated the message.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size, 
instead, we compute it
+ * with `offsets[i + 1] - offsets[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of field ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (MSB is 0), where:
+   *   - b4: the integer type of size. When it is 0/1, `size` is a 
little-endian 1/4-byte
+   *         unsigned integer.
+   *   - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list 
contains
+   *           1/2/3-byte little-endian unsigned integers.
+   *   - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the 
offset list contains
+   *           1/2/3-byte little-endian unsigned integers.
+   */
+  static final int OBJECT = 2;
+  /**
+   * Array value.
+   * The content contains a size, a list of element offsets, and the actual 
element values.
+   * It is similar to an object without the id list. The length of the offset 
list
+   * is `size + 1`, where the last offset represents the total size of the 
element data.
+   * Its type info is: 000_b2_b1b0:
+   *   - b2: the type of size.
+   *   - b1b0: the integer type of offset.
+   */
+  static final int ARRAY = 3;
+
+  // The primitive types
+
+  /** JSON Null value. Empty content. */
+  static final int NULL = 0;
+  /** True value. Empty content. */
+  static final int TRUE = 1;
+  /** False value. Empty content. */
+  static final int FALSE = 2;
+  /** 1-byte little-endian signed integer. */
+  static final int INT8 = 3;
+  /** 2-byte little-endian signed integer. */
+  static final int INT16 = 4;
+  /** 4-byte little-endian signed integer. */
+  static final int INT32 = 5;
+  /** 8-byte little-endian signed integer. */
+  static final int INT64 = 6;
+  /** 8-byte IEEE double. */
+  static final int DOUBLE = 7;
+  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed 
integer. */
+  static final int DECIMAL4 = 8;
+  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed 
integer. */
+  static final int DECIMAL8 = 9;
+  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed 
integer. */
+  static final int DECIMAL16 = 10;
+  /**
+   * Date value. Content is 4-byte little-endian signed integer that 
represents the
+   * number of days from the Unix epoch.
+   */
+  static final int DATE = 11;
+  /**
+   * Timestamp value. Content is 8-byte little-endian signed integer that 
represents the number of
+   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is 
displayed to users in
+   * their local time zones and may be displayed differently depending on the 
execution environment.
+   */
+  static final int TIMESTAMP_TZ = 12;
+  /**
+   * Timestamp_ntz value. It has the same content as `TIMESTAMP_TZ` but should 
always be interpreted
+   * as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NTZ = 13;
+  /** 4-byte IEEE float. */
+  static final int FLOAT = 14;
+  /**
+   * Binary value. The content is (4-byte little-endian unsigned integer 
representing the binary
+   * size) + (size bytes of binary content).
+   */
+  static final int BINARY = 15;
+  /**
+   * Long string value. The content is (4-byte little-endian unsigned integer 
representing the
+   * string size) + (size bytes of string content).
+   */
+  static final int LONG_STR = 16;
+  /**
+   * Time value. Values can be from 00:00:00 to 23:59:59.999999.
+   * Content is 8-byte little-endian unsigned integer that represents the 
number of microseconds
+   * since midnight.
+   */
+  static final int TIME = 17;
+  /**
+   * Timestamp nanos value. Similar to `TIMESTAMP_TZ`, but represents the number 
of nanoseconds
+   * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
+   */
+  static final int TIMESTAMP_NANOS = 18;
+  /**
+   * Timestamp nanos (without time zone) value. It has the same content as 
`TIMESTAMP_NANOS` but
+   * should always be interpreted as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NANOS_NTZ = 19;
+  /**
+   * UUID value. The content is a 16-byte binary, encoded using big-endian.
+   * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the 
bytes
+   * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff.
+   */
+  static final int UUID = 20;
+
+  // The metadata version.
+  static final byte VERSION = 1;
+  // The lower 4 bits of the first metadata byte contain the version.
+  static final byte VERSION_MASK = 0x0F;
+
+  // Constants for various unsigned integer sizes.
+  static final int U8_MAX = 0xFF;
+  static final int U16_MAX = 0xFFFF;
+  static final int U24_MAX = 0xFFFFFF;
+  static final int U8_SIZE = 1;
+  static final int U16_SIZE = 2;
+  static final int U24_SIZE = 3;
+  static final int U32_SIZE = 4;
+
+  // Max decimal precision for each decimal type.
+  static final int MAX_DECIMAL4_PRECISION = 9;
+  static final int MAX_DECIMAL8_PRECISION = 18;
+  static final int MAX_DECIMAL16_PRECISION = 38;
+
+  // The size (in bytes) of a UUID.
+  static final int UUID_SIZE = 16;
+
+  static byte primitiveHeader(int type) {
+    return (byte) (type << 2 | PRIMITIVE);
+  }
+
+  static byte shortStrHeader(int size) {
+    return (byte) (size << 2 | SHORT_STR);
+  }
+
+  static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4))
+        | ((idSize - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offsetSize - 1) << BASIC_TYPE_BITS)
+        | OBJECT);
+  }
+
+  static byte arrayHeader(boolean largeSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | 
((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+  }
+
+  /**
+   * Check the validity of an array index `pos`.
+   * @param pos The index to check
+   * @param length The length of the array
+   * @throws IllegalArgumentException if the index is out of bound
+   */
+  static void checkIndex(int pos, int length) {
+    if (pos < 0 || pos >= length) {
+      throw new IllegalArgumentException(
+          String.format("Invalid byte-array offset (%d). length: %d", pos, 
length));
+    }
+  }
+
+  /**
+   * Reads a little-endian signed long value from `buffer[pos, pos + 
numBytes)`.
+   * @param buffer The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read
+   * @return The long value
+   */
+  static long readLong(ByteBuffer buffer, int pos, int numBytes) {
+    checkIndex(pos, buffer.limit());
+    checkIndex(pos + numBytes - 1, buffer.limit());
+    long result = 0;
+    // All bytes except the most significant byte should be unsigned-extended 
and shifted
+    // (so we need `& 0xFF`). The most significant byte should be sign-extended 
and is handled
+    // after the loop.
+    for (int i = 0; i < numBytes - 1; ++i) {
+      long unsignedByteValue = buffer.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    long signedByteValue = buffer.get(pos + numBytes - 1);
+    result |= signedByteValue << (8 * (numBytes - 1));
+    return result;
+  }
+
+  /**
+   * Read a little-endian unsigned int value from `bytes[pos, pos + 
numBytes)`. The value must fit
+   * into a non-negative int (`[0, Integer.MAX_VALUE]`).
+   */
+  static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) {
+    checkIndex(pos, bytes.limit());
+    checkIndex(pos + numBytes - 1, bytes.limit());
+    int result = 0;
+    // Similar to the `readLong` loop, but all bytes should be 
unsigned-extended.
+    for (int i = 0; i < numBytes; ++i) {
+      int unsignedByteValue = bytes.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    if (result < 0) {
+      throw new IllegalArgumentException(String.format("Failed to read 
unsigned int. numBytes: %d", numBytes));
+    }
+    return result;
+  }
+
+  /**
+   * Returns the value type of Variant value `value[pos...]`. It is only legal 
to call `get*` if
+   * `getType` returns the corresponding type. For example, it is only legal 
to call
+   * `getLong` if this method returns `Type.LONG`.
+   * @param value The Variant value to get the type from
+   * @return The type of the Variant value
+   */
+  static Variant.Type getType(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    switch (basicType) {
+      case SHORT_STR:
+        return Variant.Type.STRING;
+      case OBJECT:
+        return Variant.Type.OBJECT;
+      case ARRAY:
+        return Variant.Type.ARRAY;
+      default:
+        switch (typeInfo) {
+          case NULL:
+            return Variant.Type.NULL;
+          case TRUE:
+          case FALSE:
+            return Variant.Type.BOOLEAN;
+          case INT8:
+            return Variant.Type.BYTE;
+          case INT16:
+            return Variant.Type.SHORT;
+          case INT32:
+            return Variant.Type.INT;
+          case INT64:
+            return Variant.Type.LONG;
+          case DOUBLE:
+            return Variant.Type.DOUBLE;
+          case DECIMAL4:
+            return Variant.Type.DECIMAL4;
+          case DECIMAL8:
+            return Variant.Type.DECIMAL8;
+          case DECIMAL16:
+            return Variant.Type.DECIMAL16;
+          case DATE:
+            return Variant.Type.DATE;
+          case TIMESTAMP_TZ:
+            return Variant.Type.TIMESTAMP_TZ;
+          case TIMESTAMP_NTZ:
+            return Variant.Type.TIMESTAMP_NTZ;
+          case FLOAT:
+            return Variant.Type.FLOAT;
+          case BINARY:
+            return Variant.Type.BINARY;
+          case LONG_STR:
+            return Variant.Type.STRING;
+          case TIME:
+            return Variant.Type.TIME;
+          case TIMESTAMP_NANOS:
+            return Variant.Type.TIMESTAMP_NANOS;
+          case TIMESTAMP_NANOS_NTZ:
+            return Variant.Type.TIMESTAMP_NANOS_NTZ;
+          case UUID:
+            return Variant.Type.UUID;
+          default:
+            throw new UnsupportedOperationException(
+                String.format("Unknown type in Variant. primitive type: %d", 
typeInfo));
+        }
+    }
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type type) {
+    return new IllegalArgumentException("Expected type to be " + type);
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type[] types) 
{
+    return new IllegalArgumentException("Expected type to be one of: " + 
Arrays.toString(types));
+  }
+
+  static boolean getBoolean(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) {
+      throw unexpectedType(Variant.Type.BOOLEAN);
+    }
+    return typeInfo == TRUE;
+  }
+
+  /**
+   * Returns a long value from Variant value `value[pos...]`.
+   * It is only legal to call it if `getType` returns one of Type.BYTE, SHORT, 
INT, LONG,
+   * DATE, TIMESTAMP_TZ, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, 
TIMESTAMP_NANOS_NTZ.
+   * If the type is `DATE`, the return value is guaranteed to fit into an int 
and
+   * represents the number of days from the Unix epoch.
+   * If the type is `TIMESTAMP_TZ/TIMESTAMP_NTZ`, the return value represents the 
number of
+   * microseconds from the Unix epoch.
+   * If the type is `TIME`, the return value represents the number of 
microseconds since midnight.
+   * If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value 
represents the number of
+   * nanoseconds from the Unix epoch.
+   * @param value The Variant value
+   * @return The long value
+   */
+  static long getLong(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE) {
+      throw new IllegalStateException(
+          "Expect type to be one of: BYTE, SHORT, INT, LONG, TIMESTAMP, 
TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ");
+    }
+    switch (typeInfo) {
+      case INT8:
+        return readLong(value, value.position() + 1, 1);
+      case INT16:
+        return readLong(value, value.position() + 1, 2);
+      case INT32:
+      case DATE:
+        return readLong(value, value.position() + 1, 4);
+      case INT64:
+      case TIMESTAMP_TZ:
+      case TIMESTAMP_NTZ:
+      case TIME:
+      case TIMESTAMP_NANOS:
+      case TIMESTAMP_NANOS_NTZ:
+        return readLong(value, value.position() + 1, 8);
+      default:
+        throw new IllegalStateException(
+            "Expect type to be one of: BYTE, SHORT, INT, LONG, TIMESTAMP, 
TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ");
+    }
+  }
+
+  static double getDouble(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || typeInfo != DOUBLE) {
+      throw unexpectedType(Variant.Type.DOUBLE);
+    }
+    return Double.longBitsToDouble(readLong(value, value.position() + 1, 8));
+  }
+
+  static BigDecimal getDecimalWithOriginalScale(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE) {
+      throw unexpectedType(
+          new Variant.Type[] {Variant.Type.DECIMAL4, Variant.Type.DECIMAL8, 
Variant.Type.DECIMAL16});
+    }
+    // Interpret the scale byte as unsigned. If it is a negative byte, the 
unsigned value must be
+    // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in 
`checkDecimal`.
+    int scale = value.get(value.position() + 1) & 0xFF;
+    BigDecimal result;
+    switch (typeInfo) {
+      case DECIMAL4:
+        result = BigDecimal.valueOf(readLong(value, value.position() + 2, 4), 
scale);
+        break;
+      case DECIMAL8:
+        result = BigDecimal.valueOf(readLong(value, value.position() + 2, 8), 
scale);
+        break;
+      case DECIMAL16:
+        checkIndex(value.position() + 17, value.limit());
+        byte[] bytes = new byte[16];
+        // Copy the bytes reversely because the `BigInteger` constructor 
expects a big-endian
+        // representation.
+        for (int i = 0; i < 16; ++i) {
+          bytes[i] = value.get(value.position() + 17 - i);
+        }
+        result = new BigDecimal(new BigInteger(bytes), scale);
+        break;
+      default:
+        throw unexpectedType(
+            new Variant.Type[] {Variant.Type.DECIMAL4, Variant.Type.DECIMAL8, 
Variant.Type.DECIMAL16});
+    }
+    return result;
+  }
+
+  static BigDecimal getDecimal(ByteBuffer value) {
+    return getDecimalWithOriginalScale(value);
+  }
+
+  static float getFloat(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || typeInfo != FLOAT) {
+      throw unexpectedType(Variant.Type.FLOAT);
+    }
+    return Float.intBitsToFloat((int) readLong(value, value.position() + 1, 
4));
+  }
+
+  static ByteBuffer getBinary(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || typeInfo != BINARY) {
+      throw unexpectedType(Variant.Type.BINARY);
+    }
+    int start = value.position() + 1 + U32_SIZE;
+    int length = readUnsigned(value, value.position() + 1, U32_SIZE);
+    checkIndex(start + length - 1, value.limit());
+    return slice(value, start);
+  }
+
+  /**
+   * Decodes a string located at the buffer's current position. Two encodings are accepted:
+   * a SHORT_STR, whose length is the header's type info and whose bytes directly follow the
+   * header, and a LONG_STR primitive, whose content is a 4-byte little-endian unsigned length
+   * followed by the string bytes. The bytes are always decoded as UTF-8, independent of the
+   * JVM's default charset.
+   * @param value The Variant value
+   * @return The decoded string
+   * @throws IllegalArgumentException if the value is neither a SHORT_STR nor a LONG_STR, or the
+   *     declared string bytes do not fit within the buffer limit
+   */
+  static String getString(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK;
+    boolean isLongStr = basicType == PRIMITIVE && typeInfo == LONG_STR;
+    if (basicType != SHORT_STR && !isLongStr) {
+      throw unexpectedType(Variant.Type.STRING);
+    }
+    int start;
+    int length;
+    if (basicType == SHORT_STR) {
+      start = value.position() + 1;
+      length = typeInfo;
+    } else {
+      start = value.position() + 1 + U32_SIZE;
+      length = readUnsigned(value, value.position() + 1, U32_SIZE);
+    }
+    checkIndex(start + length - 1, value.limit());
+    if (value.hasArray()) {
+      // If the buffer is backed by an array, we can decode from the array directly.
+      return new String(
+          value.array(), value.arrayOffset() + start, length, java.nio.charset.StandardCharsets.UTF_8);
+    }
+    // If the buffer is not backed by an array, we need to copy the bytes into a new array.
+    byte[] valueArray = new byte[length];
+    slice(value, start).get(valueArray);
+    return new String(valueArray, java.nio.charset.StandardCharsets.UTF_8);
+  }
+
+  static java.util.UUID getUUID(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || typeInfo != UUID) {
+      throw unexpectedType(Variant.Type.UUID);

Review Comment:
   Updated the message to include the actual type.
   
   There is no compile time guarantee, but I believe all the `get*` methods 
check the type.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size, 
instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of fields ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (MSB is 0), where:
+   *   - b4: the integer type of size. When it is 0/1, `size` is a 
little-endian 1/4-byte
+   *         unsigned integer.
+   *   - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list 
contains
+   *           1/2/3-byte little-endian unsigned integers.
+   *   - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the 
offset list contains
+   *           1/2/3-byte little-endian unsigned integers.
+   */
+  static final int OBJECT = 2;
+  /**
+   * Array value.
+   * The content contains a size, a list of field offsets, and the actual 
element values.
+   * It is similar to an object without the id list. The length of the offset 
list
+   * is `size + 1`, where the last offset represents the total size of the element data.
+   * Its type info is: 000_b2_b1b0:
+   *   - b2: the type of size.
+   *   - b1b0: the integer type of offset.
+   */
+  static final int ARRAY = 3;
+
+  // The primitive types
+
+  /** JSON Null value. Empty content. */
+  static final int NULL = 0;
+  /** True value. Empty content. */
+  static final int TRUE = 1;
+  /** False value. Empty content. */
+  static final int FALSE = 2;
+  /** 1-byte little-endian signed integer. */
+  static final int INT8 = 3;
+  /** 2-byte little-endian signed integer. */
+  static final int INT16 = 4;
+  /** 4-byte little-endian signed integer. */
+  static final int INT32 = 5;
+  /** 8-byte little-endian signed integer. */
+  static final int INT64 = 6;
+  /** 8-byte IEEE double. */
+  static final int DOUBLE = 7;
+  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed 
integer. */
+  static final int DECIMAL4 = 8;
+  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed 
integer. */
+  static final int DECIMAL8 = 9;
+  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed 
integer. */
+  static final int DECIMAL16 = 10;
+  /**
+   * Date value. Content is 4-byte little-endian signed integer that 
represents the
+   * number of days from the Unix epoch.
+   */
+  static final int DATE = 11;
+  /**
+   * Timestamp value. Content is 8-byte little-endian signed integer that 
represents the number of
+   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is 
displayed to users in
+   * their local time zones and may be displayed differently depending on the 
execution environment.
+   */
+  static final int TIMESTAMP_TZ = 12;
+  /**
+   * Timestamp_ntz value. It has the same content as `TIMESTAMP` but should 
always be interpreted
+   * as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NTZ = 13;
+  /** 4-byte IEEE float. */
+  static final int FLOAT = 14;
+  /**
+   * Binary value. The content is (4-byte little-endian unsigned integer 
representing the binary
+   * size) + (size bytes of binary content).
+   */
+  static final int BINARY = 15;
+  /**
+   * Long string value. The content is (4-byte little-endian unsigned integer 
representing the
+   * string size) + (size bytes of string content).
+   */
+  static final int LONG_STR = 16;
+  /**
+   * Time value. Values can be from 00:00:00 to 23:59:59.999999.
+   * Content is 8-byte little-endian unsigned integer that represents the 
number of microseconds
+   * since midnight.
+   */
+  static final int TIME = 17;
+  /**
+   * Timestamp nanos value. Similar to `TIMESTAMP`, but represents the number 
of nanoseconds
+   * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
+   */
+  static final int TIMESTAMP_NANOS = 18;
+  /**
+   * Timestamp nanos (without timezone) value. It has the same content as `TIMESTAMP_NANOS` but
+   * should always be interpreted as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NANOS_NTZ = 19;
+  /**
+   * UUID value. The content is a 16-byte binary, encoded using big-endian.
+   * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the 
bytes
+   * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff.
+   */
+  static final int UUID = 20;
+
+  // The metadata version.
+  static final byte VERSION = 1;
+  // The lower 4 bits of the first metadata byte contain the version.
+  static final byte VERSION_MASK = 0x0F;
+
+  // Constants for various unsigned integer sizes.
+  static final int U8_MAX = 0xFF;
+  static final int U16_MAX = 0xFFFF;
+  static final int U24_MAX = 0xFFFFFF;
+  static final int U8_SIZE = 1;
+  static final int U16_SIZE = 2;
+  static final int U24_SIZE = 3;
+  static final int U32_SIZE = 4;
+
+  // Max decimal precision for each decimal type.
+  static final int MAX_DECIMAL4_PRECISION = 9;
+  static final int MAX_DECIMAL8_PRECISION = 18;
+  static final int MAX_DECIMAL16_PRECISION = 38;
+
+  // The size (in bytes) of a UUID.
+  static final int UUID_SIZE = 16;
+
+  /**
+   * Builds the one-byte header of a primitive value: the primitive type id goes in the upper
+   * 6 bits (type info) and `PRIMITIVE` in the lower `BASIC_TYPE_BITS` bits (basic type).
+   * @param type The primitive type id (one of the "primitive types" constants)
+   * @return The header byte
+   */
+  static byte primitiveHeader(int type) {
+    // Shift by the named constant rather than a literal 2, matching objectHeader/arrayHeader.
+    return (byte) ((type << BASIC_TYPE_BITS) | PRIMITIVE);
+  }
+
+  /**
+   * Builds the one-byte header of a short string: the string size goes in the upper 6 bits
+   * (type info) and `SHORT_STR` in the lower `BASIC_TYPE_BITS` bits (basic type).
+   * The size is not validated here; bits that do not fit in the byte are dropped by the cast.
+   * @param size The string size in bytes, expected to be in `[0, MAX_SHORT_STR_SIZE]`
+   * @return The header byte
+   */
+  static byte shortStrHeader(int size) {
+    // Shift by the named constant rather than a literal 2, matching objectHeader/arrayHeader.
+    return (byte) ((size << BASIC_TYPE_BITS) | SHORT_STR);
+  }
+
+  static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4))
+        | ((idSize - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offsetSize - 1) << BASIC_TYPE_BITS)
+        | OBJECT);
+  }
+
+  static byte arrayHeader(boolean largeSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | 
((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+  }
+
+  /**
+   * Check the validity of an array index `pos`.
+   * @param pos The index to check
+   * @param length The length of the array
+   * @throws IllegalArgumentException if the index is out of bound
+   */
+  static void checkIndex(int pos, int length) {
+    if (pos < 0 || pos >= length) {
+      throw new IllegalArgumentException(
+          String.format("Invalid byte-array offset (%d). length: %d", pos, 
length));
+    }
+  }
+
+  /**
+   * Reads a little-endian signed long value from `buffer[pos, pos + numBytes)`.
+   * Only absolute reads are used, so the buffer's position is left untouched.
+   * @param buffer The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read, in `[1, 8]`
+   * @return The long value, sign-extended from its most significant byte
+   * @throws IllegalArgumentException if `numBytes` is outside `[1, 8]` or the byte range is not
+   *     fully within the buffer limit
+   */
+  static long readLong(ByteBuffer buffer, int pos, int numBytes) {
+    // Reject widths the shifts below cannot represent: 0 would read `pos - 1` with a wrapped
+    // shift count, and anything above 8 would wrap shift counts and corrupt the result.
+    if (numBytes < 1 || numBytes > Long.BYTES) {
+      throw new IllegalArgumentException(
+          String.format("Invalid number of bytes to read as a long: %d", numBytes));
+    }
+    checkIndex(pos, buffer.limit());
+    checkIndex(pos + numBytes - 1, buffer.limit());
+    long result = 0;
+    // All bytes except the most significant byte should be unsigned-extended and shifted
+    // (so we need `& 0xFF`). The most significant byte should be sign-extended and is handled
+    // after the loop.
+    for (int i = 0; i < numBytes - 1; ++i) {
+      long unsignedByteValue = buffer.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    long signedByteValue = buffer.get(pos + numBytes - 1);
+    result |= signedByteValue << (8 * (numBytes - 1));
+    return result;
+  }
+
+  /**
+   * Read a little-endian unsigned int value from `bytes[pos, pos + numBytes)`. The value must fit
+   * into a non-negative int (`[0, Integer.MAX_VALUE]`).
+   * Only absolute reads are used, so the buffer's position is left untouched.
+   * @param bytes The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read, in `[1, U32_SIZE]`
+   * @return The unsigned value as a non-negative int
+   * @throws IllegalArgumentException if `numBytes` is outside `[1, U32_SIZE]`, the byte range is
+   *     not fully within the buffer limit, or the value does not fit in a non-negative int
+   */
+  static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) {
+    // Reject widths an int cannot hold: more than 4 bytes would wrap the shift counts below and
+    // silently corrupt the result, and 0 would make the bounds check inspect `pos - 1`.
+    if (numBytes < 1 || numBytes > U32_SIZE) {
+      throw new IllegalArgumentException(
+          String.format("Invalid number of bytes to read as an unsigned int: %d", numBytes));
+    }
+    checkIndex(pos, bytes.limit());
+    checkIndex(pos + numBytes - 1, bytes.limit());
+    int result = 0;
+    // Similar to the `readLong` loop, but all bytes should be unsigned-extended.
+    for (int i = 0; i < numBytes; ++i) {
+      int unsignedByteValue = bytes.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    // A negative result means a 4-byte value had its top bit set and exceeds Integer.MAX_VALUE.
+    if (result < 0) {
+      throw new IllegalArgumentException(String.format("Failed to read unsigned int. numBytes: %d", numBytes));
+    }
+    return result;
+  }
+
+  /**
+   * Returns the value type of Variant value `value[pos...]`. It is only legal 
to call `get*` if
+   * `getType` returns the corresponding type. For example, it is only legal 
to call
+   * `getLong` if this method returns `Type.Long`.
+   * @param value The Variant value to get the type from
+   * @return The type of the Variant value
+   */
+  static Variant.Type getType(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    switch (basicType) {
+      case SHORT_STR:
+        return Variant.Type.STRING;
+      case OBJECT:
+        return Variant.Type.OBJECT;
+      case ARRAY:
+        return Variant.Type.ARRAY;
+      default:
+        switch (typeInfo) {
+          case NULL:
+            return Variant.Type.NULL;
+          case TRUE:
+          case FALSE:
+            return Variant.Type.BOOLEAN;
+          case INT8:
+            return Variant.Type.BYTE;
+          case INT16:
+            return Variant.Type.SHORT;
+          case INT32:
+            return Variant.Type.INT;
+          case INT64:
+            return Variant.Type.LONG;
+          case DOUBLE:
+            return Variant.Type.DOUBLE;
+          case DECIMAL4:
+            return Variant.Type.DECIMAL4;
+          case DECIMAL8:
+            return Variant.Type.DECIMAL8;
+          case DECIMAL16:
+            return Variant.Type.DECIMAL16;
+          case DATE:
+            return Variant.Type.DATE;
+          case TIMESTAMP_TZ:
+            return Variant.Type.TIMESTAMP_TZ;
+          case TIMESTAMP_NTZ:
+            return Variant.Type.TIMESTAMP_NTZ;
+          case FLOAT:
+            return Variant.Type.FLOAT;
+          case BINARY:
+            return Variant.Type.BINARY;
+          case LONG_STR:
+            return Variant.Type.STRING;
+          case TIME:
+            return Variant.Type.TIME;
+          case TIMESTAMP_NANOS:
+            return Variant.Type.TIMESTAMP_NANOS;
+          case TIMESTAMP_NANOS_NTZ:
+            return Variant.Type.TIMESTAMP_NANOS_NTZ;
+          case UUID:
+            return Variant.Type.UUID;
+          default:
+            throw new UnsupportedOperationException(
+                String.format("Unknown type in Variant. primitive type: %d", 
typeInfo));
+        }
+    }
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type type) {
+    return new IllegalArgumentException("Expected type to be " + type);
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type[] types) 
{
+    return new IllegalArgumentException("Expected type to be one of: " + 
Arrays.toString(types));
+  }
+
+  static boolean getBoolean(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) {
+      throw unexpectedType(Variant.Type.BOOLEAN);
+    }
+    return typeInfo == TRUE;
+  }
+
+  /**
+   * Returns a long value from Variant value `value[pos...]`.
+   * It is only legal to call it if `getType` returns one of Type.BYTE, SHORT, 
INT, LONG,
+   * DATE, TIMESTAMP, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, 
TIMESTAMP_NANOS_NTZ.
+   * If the type is `DATE`, the return value is guaranteed to fit into an int 
and
+   * represents the number of days from the Unix epoch.
+   * If the type is `TIMESTAMP/TIMESTAMP_NTZ`, the return value represents the 
number of
+   * microseconds from the Unix epoch.
+   * If the type is `TIME`, the return value represents the number of 
microseconds since midnight.
+   * If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value 
represents the number of
+   * nanoseconds from the Unix epoch.
+   * @param value The Variant value
+   * @return The long value
+   */
+  static long getLong(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE) {
+      throw new IllegalStateException(
+          "Expect type to be one of: BYTE, SHORT, INT, LONG, TIMESTAMP, 
TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ");
+    }
+    switch (typeInfo) {
+      case INT8:
+        return readLong(value, value.position() + 1, 1);

Review Comment:
   I added `getShort` (and others) to `VariantUtil` to clean this up.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size, 
instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of fields ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (MSB is 0), where:
+   *   - b4: the integer type of size. When it is 0/1, `size` is a 
little-endian 1/4-byte
+   *         unsigned integer.
+   *   - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list 
contains
+   *           1/2/3-byte little-endian unsigned integers.
+   *   - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the 
offset list contains
+   *           1/2/3-byte little-endian unsigned integers.
+   */
+  static final int OBJECT = 2;
+  /**
+   * Array value.
+   * The content contains a size, a list of field offsets, and the actual 
element values.
+   * It is similar to an object without the id list. The length of the offset 
list
+   * is `size + 1`, where the last offset represents the total size of the element data.
+   * Its type info is: 000_b2_b1b0:
+   *   - b2: the type of size.
+   *   - b1b0: the integer type of offset.
+   */
+  static final int ARRAY = 3;
+
+  // The primitive types
+
+  /** JSON Null value. Empty content. */
+  static final int NULL = 0;
+  /** True value. Empty content. */
+  static final int TRUE = 1;
+  /** False value. Empty content. */
+  static final int FALSE = 2;
+  /** 1-byte little-endian signed integer. */
+  static final int INT8 = 3;
+  /** 2-byte little-endian signed integer. */
+  static final int INT16 = 4;
+  /** 4-byte little-endian signed integer. */
+  static final int INT32 = 5;
+  /** 8-byte little-endian signed integer. */
+  static final int INT64 = 6;
+  /** 8-byte IEEE double. */
+  static final int DOUBLE = 7;
+  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed 
integer. */
+  static final int DECIMAL4 = 8;
+  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed 
integer. */
+  static final int DECIMAL8 = 9;
+  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed 
integer. */
+  static final int DECIMAL16 = 10;
+  /**
+   * Date value. Content is 4-byte little-endian signed integer that 
represents the
+   * number of days from the Unix epoch.
+   */
+  static final int DATE = 11;
+  /**
+   * Timestamp value. Content is 8-byte little-endian signed integer that 
represents the number of
+   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is 
displayed to users in
+   * their local time zones and may be displayed differently depending on the 
execution environment.
+   */
+  static final int TIMESTAMP_TZ = 12;
+  /**
+   * Timestamp_ntz value. It has the same content as `TIMESTAMP` but should 
always be interpreted
+   * as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NTZ = 13;
+  /** 4-byte IEEE float. */
+  static final int FLOAT = 14;
+  /**
+   * Binary value. The content is (4-byte little-endian unsigned integer 
representing the binary
+   * size) + (size bytes of binary content).
+   */
+  static final int BINARY = 15;
+  /**
+   * Long string value. The content is (4-byte little-endian unsigned integer 
representing the
+   * string size) + (size bytes of string content).
+   */
+  static final int LONG_STR = 16;
+  /**
+   * Time value. Values can be from 00:00:00 to 23:59:59.999999.
+   * Content is 8-byte little-endian unsigned integer that represents the 
number of microseconds
+   * since midnight.
+   */
+  static final int TIME = 17;
+  /**
+   * Timestamp nanos value. Similar to `TIMESTAMP`, but represents the number 
of nanoseconds
+   * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
+   */
+  static final int TIMESTAMP_NANOS = 18;
+  /**
+   * Timestamp nanos (without timezone) value. It has the same content as `TIMESTAMP_NANOS` but
+   * should always be interpreted as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NANOS_NTZ = 19;
+  /**
+   * UUID value. The content is a 16-byte binary, encoded using big-endian.
+   * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the 
bytes
+   * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff.
+   */
+  static final int UUID = 20;
+
+  // The metadata version.
+  static final byte VERSION = 1;
+  // The lower 4 bits of the first metadata byte contain the version.
+  static final byte VERSION_MASK = 0x0F;
+
+  // Constants for various unsigned integer sizes.
+  static final int U8_MAX = 0xFF;
+  static final int U16_MAX = 0xFFFF;
+  static final int U24_MAX = 0xFFFFFF;
+  static final int U8_SIZE = 1;
+  static final int U16_SIZE = 2;
+  static final int U24_SIZE = 3;
+  static final int U32_SIZE = 4;
+
+  // Max decimal precision for each decimal type.
+  static final int MAX_DECIMAL4_PRECISION = 9;
+  static final int MAX_DECIMAL8_PRECISION = 18;
+  static final int MAX_DECIMAL16_PRECISION = 38;
+
+  // The size (in bytes) of a UUID.
+  static final int UUID_SIZE = 16;
+
+  /**
+   * Builds the one-byte header of a primitive value: the primitive type id goes in the upper
+   * 6 bits (type info) and `PRIMITIVE` in the lower `BASIC_TYPE_BITS` bits (basic type).
+   * @param type The primitive type id (one of the "primitive types" constants)
+   * @return The header byte
+   */
+  static byte primitiveHeader(int type) {
+    // Shift by the named constant rather than a literal 2, matching objectHeader/arrayHeader.
+    return (byte) ((type << BASIC_TYPE_BITS) | PRIMITIVE);
+  }
+
+  /**
+   * Builds the one-byte header of a short string: the string size goes in the upper 6 bits
+   * (type info) and `SHORT_STR` in the lower `BASIC_TYPE_BITS` bits (basic type).
+   * The size is not validated here; bits that do not fit in the byte are dropped by the cast.
+   * @param size The string size in bytes, expected to be in `[0, MAX_SHORT_STR_SIZE]`
+   * @return The header byte
+   */
+  static byte shortStrHeader(int size) {
+    // Shift by the named constant rather than a literal 2, matching objectHeader/arrayHeader.
+    return (byte) ((size << BASIC_TYPE_BITS) | SHORT_STR);
+  }
+
+  static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4))
+        | ((idSize - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offsetSize - 1) << BASIC_TYPE_BITS)
+        | OBJECT);
+  }
+
+  static byte arrayHeader(boolean largeSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | 
((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+  }
+
+  /**
+   * Check the validity of an array index `pos`.
+   * @param pos The index to check
+   * @param length The length of the array
+   * @throws IllegalArgumentException if the index is out of bound
+   */
+  static void checkIndex(int pos, int length) {
+    if (pos < 0 || pos >= length) {
+      throw new IllegalArgumentException(
+          String.format("Invalid byte-array offset (%d). length: %d", pos, 
length));
+    }
+  }
+
+  /**
+   * Reads a little-endian signed long value from `buffer[pos, pos + numBytes)`.
+   * Only absolute reads are used, so the buffer's position is left untouched.
+   * @param buffer The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read, in `[1, 8]`
+   * @return The long value, sign-extended from its most significant byte
+   * @throws IllegalArgumentException if `numBytes` is outside `[1, 8]` or the byte range is not
+   *     fully within the buffer limit
+   */
+  static long readLong(ByteBuffer buffer, int pos, int numBytes) {
+    // Reject widths the shifts below cannot represent: 0 would read `pos - 1` with a wrapped
+    // shift count, and anything above 8 would wrap shift counts and corrupt the result.
+    if (numBytes < 1 || numBytes > Long.BYTES) {
+      throw new IllegalArgumentException(
+          String.format("Invalid number of bytes to read as a long: %d", numBytes));
+    }
+    checkIndex(pos, buffer.limit());
+    checkIndex(pos + numBytes - 1, buffer.limit());
+    long result = 0;
+    // All bytes except the most significant byte should be unsigned-extended and shifted
+    // (so we need `& 0xFF`). The most significant byte should be sign-extended and is handled
+    // after the loop.
+    for (int i = 0; i < numBytes - 1; ++i) {
+      long unsignedByteValue = buffer.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    long signedByteValue = buffer.get(pos + numBytes - 1);
+    result |= signedByteValue << (8 * (numBytes - 1));
+    return result;
+  }
+
+  /**
+   * Read a little-endian unsigned int value from `bytes[pos, pos + numBytes)`. The value must fit
+   * into a non-negative int (`[0, Integer.MAX_VALUE]`).
+   * Only absolute reads are used, so the buffer's position is left untouched.
+   * @param bytes The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read, in `[1, U32_SIZE]`
+   * @return The unsigned value as a non-negative int
+   * @throws IllegalArgumentException if `numBytes` is outside `[1, U32_SIZE]`, the byte range is
+   *     not fully within the buffer limit, or the value does not fit in a non-negative int
+   */
+  static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) {
+    // Reject widths an int cannot hold: more than 4 bytes would wrap the shift counts below and
+    // silently corrupt the result, and 0 would make the bounds check inspect `pos - 1`.
+    if (numBytes < 1 || numBytes > U32_SIZE) {
+      throw new IllegalArgumentException(
+          String.format("Invalid number of bytes to read as an unsigned int: %d", numBytes));
+    }
+    checkIndex(pos, bytes.limit());
+    checkIndex(pos + numBytes - 1, bytes.limit());
+    int result = 0;
+    // Similar to the `readLong` loop, but all bytes should be unsigned-extended.
+    for (int i = 0; i < numBytes; ++i) {
+      int unsignedByteValue = bytes.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    // A negative result means a 4-byte value had its top bit set and exceeds Integer.MAX_VALUE.
+    if (result < 0) {
+      throw new IllegalArgumentException(String.format("Failed to read unsigned int. numBytes: %d", numBytes));
+    }
+    return result;
+  }
+
+  /**
+   * Returns the value type of Variant value `value[pos...]`. It is only legal 
to call `get*` if
+   * `getType` returns the corresponding type. For example, it is only legal 
to call
+   * `getLong` if this method returns `Type.LONG`.
+   * @param value The Variant value to get the type from
+   * @return The type of the Variant value
+   */
+  static Variant.Type getType(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    switch (basicType) {
+      case SHORT_STR:
+        return Variant.Type.STRING;
+      case OBJECT:
+        return Variant.Type.OBJECT;
+      case ARRAY:
+        return Variant.Type.ARRAY;
+      default:
+        switch (typeInfo) {
+          case NULL:
+            return Variant.Type.NULL;
+          case TRUE:
+          case FALSE:
+            return Variant.Type.BOOLEAN;
+          case INT8:
+            return Variant.Type.BYTE;
+          case INT16:
+            return Variant.Type.SHORT;
+          case INT32:
+            return Variant.Type.INT;
+          case INT64:
+            return Variant.Type.LONG;
+          case DOUBLE:
+            return Variant.Type.DOUBLE;
+          case DECIMAL4:
+            return Variant.Type.DECIMAL4;
+          case DECIMAL8:
+            return Variant.Type.DECIMAL8;
+          case DECIMAL16:
+            return Variant.Type.DECIMAL16;
+          case DATE:
+            return Variant.Type.DATE;
+          case TIMESTAMP_TZ:
+            return Variant.Type.TIMESTAMP_TZ;
+          case TIMESTAMP_NTZ:
+            return Variant.Type.TIMESTAMP_NTZ;
+          case FLOAT:
+            return Variant.Type.FLOAT;
+          case BINARY:
+            return Variant.Type.BINARY;
+          case LONG_STR:
+            return Variant.Type.STRING;
+          case TIME:
+            return Variant.Type.TIME;
+          case TIMESTAMP_NANOS:
+            return Variant.Type.TIMESTAMP_NANOS;
+          case TIMESTAMP_NANOS_NTZ:
+            return Variant.Type.TIMESTAMP_NANOS_NTZ;
+          case UUID:
+            return Variant.Type.UUID;
+          default:
+            throw new UnsupportedOperationException(
+                String.format("Unknown type in Variant. primitive type: %d", 
typeInfo));
+        }
+    }
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type type) {
+    return new IllegalArgumentException("Expected type to be " + type);
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type[] types) 
{
+    return new IllegalArgumentException("Expected type to be one of: " + 
Arrays.toString(types));
+  }
+
+  static boolean getBoolean(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) {
+      throw unexpectedType(Variant.Type.BOOLEAN);
+    }
+    return typeInfo == TRUE;
+  }
+
+  /**
+   * Returns a long value from Variant value `value[pos...]`.
+   * It is only legal to call it if `getType` returns one of Type.BYTE, SHORT, 
INT, LONG,
+   * DATE, TIMESTAMP_TZ, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, 
TIMESTAMP_NANOS_NTZ.
+   * If the type is `DATE`, the return value is guaranteed to fit into an int 
and
+   * represents the number of days from the Unix epoch.
+   * If the type is `TIMESTAMP_TZ/TIMESTAMP_NTZ`, the return value represents the 
number of
+   * microseconds from the Unix epoch.
+   * If the type is `TIME`, the return value represents the number of 
microseconds since midnight.
+   * If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value 
represents the number of
+   * nanoseconds from the Unix epoch.
+   * @param value The Variant value
+   * @return The long value
+   */
+  static long getLong(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE) {
+      throw new IllegalStateException(
+          "Expect type to be one of: BYTE, SHORT, INT, LONG, TIMESTAMP, 
TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ");

Review Comment:
   Updated to use `TIMESTAMP_TZ` and `TIMESTAMP_NANOS_TZ`.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size, 
instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of field ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (MSB is 0), where:
+   *   - b4: the integer type of size. When it is 0/1, `size` is a 
little-endian 1/4-byte
+   *         unsigned integer.
+   *   - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list 
contains
+   *           1/2/3-byte little-endian unsigned integers.
+   *   - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the 
offset list contains
+   *           1/2/3-byte little-endian unsigned integers.
+   */
+  static final int OBJECT = 2;
+  /**
+   * Array value.
+   * The content contains a size, a list of field offsets, and the actual 
element values.
+   * It is similar to an object without the id list. The length of the offset 
list
+   * is `size + 1`, where the last offset represents the total size of the 
element data.
+   * Its type info is: 000_b2_b1b0:
+   *   - b2: the type of size.
+   *   - b1b0: the integer type of offset.
+   */
+  static final int ARRAY = 3;
+
+  // The primitive types
+
+  /** JSON Null value. Empty content. */
+  static final int NULL = 0;
+  /** True value. Empty content. */
+  static final int TRUE = 1;
+  /** False value. Empty content. */
+  static final int FALSE = 2;
+  /** 1-byte little-endian signed integer. */
+  static final int INT8 = 3;
+  /** 2-byte little-endian signed integer. */
+  static final int INT16 = 4;
+  /** 4-byte little-endian signed integer. */
+  static final int INT32 = 5;
+  /** 8-byte little-endian signed integer. */
+  static final int INT64 = 6;
+  /** 8-byte IEEE double. */
+  static final int DOUBLE = 7;
+  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed 
integer. */
+  static final int DECIMAL4 = 8;
+  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed 
integer. */
+  static final int DECIMAL8 = 9;
+  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed 
integer. */
+  static final int DECIMAL16 = 10;
+  /**
+   * Date value. Content is 4-byte little-endian signed integer that 
represents the
+   * number of days from the Unix epoch.
+   */
+  static final int DATE = 11;
+  /**
+   * Timestamp value. Content is 8-byte little-endian signed integer that 
represents the number of
+   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is 
displayed to users in
+   * their local time zones and may be displayed differently depending on the 
execution environment.
+   */
+  static final int TIMESTAMP_TZ = 12;
+  /**
+   * Timestamp_ntz value. It has the same content as `TIMESTAMP_TZ` but should 
always be interpreted
+   * as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NTZ = 13;
+  /** 4-byte IEEE float. */
+  static final int FLOAT = 14;
+  /**
+   * Binary value. The content is (4-byte little-endian unsigned integer 
representing the binary
+   * size) + (size bytes of binary content).
+   */
+  static final int BINARY = 15;
+  /**
+   * Long string value. The content is (4-byte little-endian unsigned integer 
representing the
+   * string size) + (size bytes of string content).
+   */
+  static final int LONG_STR = 16;
+  /**
+   * Time value. Values can be from 00:00:00 to 23:59:59.999999.
+   * Content is 8-byte little-endian unsigned integer that represents the 
number of microseconds
+   * since midnight.
+   */
+  static final int TIME = 17;
+  /**
+   * Timestamp nanos value. Similar to `TIMESTAMP_TZ`, but represents the number 
of nanoseconds
+   * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
+   */
+  static final int TIMESTAMP_NANOS = 18;
+  /**
+   * Timestamp nanos (without time zone) value. It has the same content as 
`TIMESTAMP_NANOS` but
+   * should always be interpreted as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NANOS_NTZ = 19;
+  /**
+   * UUID value. The content is a 16-byte binary, encoded using big-endian.
+   * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the 
bytes
+   * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff.
+   */
+  static final int UUID = 20;
+
+  // The metadata version.
+  static final byte VERSION = 1;
+  // The lower 4 bits of the first metadata byte contain the version.
+  static final byte VERSION_MASK = 0x0F;
+
+  // Constants for various unsigned integer sizes.
+  static final int U8_MAX = 0xFF;
+  static final int U16_MAX = 0xFFFF;
+  static final int U24_MAX = 0xFFFFFF;
+  static final int U8_SIZE = 1;
+  static final int U16_SIZE = 2;
+  static final int U24_SIZE = 3;
+  static final int U32_SIZE = 4;
+
+  // Max decimal precision for each decimal type.
+  static final int MAX_DECIMAL4_PRECISION = 9;
+  static final int MAX_DECIMAL8_PRECISION = 18;
+  static final int MAX_DECIMAL16_PRECISION = 38;
+
+  // The size (in bytes) of a UUID.
+  static final int UUID_SIZE = 16;
+
+  static byte primitiveHeader(int type) {
+    return (byte) (type << 2 | PRIMITIVE);
+  }
+
+  static byte shortStrHeader(int size) {
+    return (byte) (size << 2 | SHORT_STR);
+  }
+
+  static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4))
+        | ((idSize - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offsetSize - 1) << BASIC_TYPE_BITS)
+        | OBJECT);
+  }
+
+  static byte arrayHeader(boolean largeSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | 
((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+  }
+
+  /**
+   * Check the validity of an array index `pos`.
+   * @param pos The index to check
+   * @param length The length of the array
+   * @throws IllegalArgumentException if the index is out of bounds
+   */
+  static void checkIndex(int pos, int length) {
+    if (pos < 0 || pos >= length) {
+      throw new IllegalArgumentException(
+          String.format("Invalid byte-array offset (%d). length: %d", pos, 
length));
+    }
+  }
+
+  /**
+   * Reads a little-endian signed long value from `buffer[pos, pos + 
numBytes)`.
+   * @param buffer The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read
+   * @return The long value
+   */
+  static long readLong(ByteBuffer buffer, int pos, int numBytes) {
+    checkIndex(pos, buffer.limit());
+    checkIndex(pos + numBytes - 1, buffer.limit());
+    long result = 0;
+    // All bytes except the most significant byte should be unsigned-extended 
and shifted
+    // (so we need `& 0xFF`). The most significant byte should be sign-extended 
and is handled
+    // after the loop.
+    for (int i = 0; i < numBytes - 1; ++i) {
+      long unsignedByteValue = buffer.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    long signedByteValue = buffer.get(pos + numBytes - 1);
+    result |= signedByteValue << (8 * (numBytes - 1));
+    return result;
+  }
+
+  /**
+   * Read a little-endian unsigned int value from `bytes[pos, pos + 
numBytes)`. The value must fit
+   * into a non-negative int (`[0, Integer.MAX_VALUE]`).
+   */
+  static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) {
+    checkIndex(pos, bytes.limit());
+    checkIndex(pos + numBytes - 1, bytes.limit());
+    int result = 0;
+    // Similar to the `readLong` loop, but all bytes should be 
unsigned-extended.
+    for (int i = 0; i < numBytes; ++i) {
+      int unsignedByteValue = bytes.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    if (result < 0) {
+      throw new IllegalArgumentException(String.format("Failed to read 
unsigned int. numBytes: %d", numBytes));
+    }
+    return result;
+  }
+
+  /**
+   * Returns the value type of Variant value `value[pos...]`. It is only legal 
to call `get*` if
+   * `getType` returns the corresponding type. For example, it is only legal 
to call
+   * `getLong` if this method returns `Type.LONG`.
+   * @param value The Variant value to get the type from
+   * @return The type of the Variant value
+   */
+  static Variant.Type getType(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    switch (basicType) {
+      case SHORT_STR:
+        return Variant.Type.STRING;
+      case OBJECT:
+        return Variant.Type.OBJECT;
+      case ARRAY:
+        return Variant.Type.ARRAY;
+      default:
+        switch (typeInfo) {
+          case NULL:
+            return Variant.Type.NULL;
+          case TRUE:
+          case FALSE:
+            return Variant.Type.BOOLEAN;
+          case INT8:
+            return Variant.Type.BYTE;
+          case INT16:
+            return Variant.Type.SHORT;
+          case INT32:
+            return Variant.Type.INT;
+          case INT64:
+            return Variant.Type.LONG;
+          case DOUBLE:
+            return Variant.Type.DOUBLE;
+          case DECIMAL4:
+            return Variant.Type.DECIMAL4;
+          case DECIMAL8:
+            return Variant.Type.DECIMAL8;
+          case DECIMAL16:
+            return Variant.Type.DECIMAL16;
+          case DATE:
+            return Variant.Type.DATE;
+          case TIMESTAMP_TZ:
+            return Variant.Type.TIMESTAMP_TZ;
+          case TIMESTAMP_NTZ:
+            return Variant.Type.TIMESTAMP_NTZ;
+          case FLOAT:
+            return Variant.Type.FLOAT;
+          case BINARY:
+            return Variant.Type.BINARY;
+          case LONG_STR:
+            return Variant.Type.STRING;
+          case TIME:
+            return Variant.Type.TIME;
+          case TIMESTAMP_NANOS:
+            return Variant.Type.TIMESTAMP_NANOS;
+          case TIMESTAMP_NANOS_NTZ:
+            return Variant.Type.TIMESTAMP_NANOS_NTZ;
+          case UUID:
+            return Variant.Type.UUID;
+          default:
+            throw new UnsupportedOperationException(
+                String.format("Unknown type in Variant. primitive type: %d", 
typeInfo));
+        }
+    }
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type type) {
+    return new IllegalArgumentException("Expected type to be " + type);
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type[] types) 
{
+    return new IllegalArgumentException("Expected type to be one of: " + 
Arrays.toString(types));
+  }
+
+  static boolean getBoolean(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) {
+      throw unexpectedType(Variant.Type.BOOLEAN);
+    }
+    return typeInfo == TRUE;
+  }
+
+  /**
+   * Returns a long value from Variant value `value[pos...]`.
+   * It is only legal to call it if `getType` returns one of Type.BYTE, SHORT, 
INT, LONG,
+   * DATE, TIMESTAMP_TZ, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, 
TIMESTAMP_NANOS_NTZ.
+   * If the type is `DATE`, the return value is guaranteed to fit into an int 
and
+   * represents the number of days from the Unix epoch.
+   * If the type is `TIMESTAMP_TZ/TIMESTAMP_NTZ`, the return value represents the 
number of
+   * microseconds from the Unix epoch.
+   * If the type is `TIME`, the return value represents the number of 
microseconds since midnight.
+   * If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value 
represents the number of
+   * nanoseconds from the Unix epoch.
+   * @param value The Variant value
+   * @return The long value
+   */
+  static long getLong(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE) {
+      throw new IllegalStateException(
+          "Expect type to be one of: BYTE, SHORT, INT, LONG, TIMESTAMP, 
TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, TIMESTAMP_NANOS_NTZ");
+    }
+    switch (typeInfo) {
+      case INT8:
+        return readLong(value, value.position() + 1, 1);
+      case INT16:
+        return readLong(value, value.position() + 1, 2);
+      case INT32:
+      case DATE:
+        return readLong(value, value.position() + 1, 4);
+      case INT64:
+      case TIMESTAMP_TZ:
+      case TIMESTAMP_NTZ:
+      case TIME:
+      case TIMESTAMP_NANOS:
+      case TIMESTAMP_NANOS_NTZ:
+        return readLong(value, value.position() + 1, 8);
+      default:
+        throw new IllegalStateException(

Review Comment:
   Updated to use that list one.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java:
##########
@@ -0,0 +1,659 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Arrays;
+
+/**
+ * This class defines constants related to the Variant format and provides 
functions for
+ * manipulating Variant binaries.
+ *
+ * A Variant is made up of 2 binaries: value and metadata. A Variant value 
consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is 
divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content 
format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The Variant metadata includes a version id and a dictionary of distinct 
strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 
currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of 
keys in the
+ *                    dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. 
`offsets[i]` represents the
+ * starting position of string i, counting starting from the address of 
`offsets[0]`. Strings
+ * must be stored contiguously, so we don’t need to store the string size, 
instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+class VariantUtil {
+  static final int BASIC_TYPE_BITS = 2;
+  static final int BASIC_TYPE_MASK = 0b00000011;
+  static final int PRIMITIVE_TYPE_MASK = 0b00111111;
+  /** The inclusive maximum value of the type info value. It is the size limit 
of `SHORT_STR`. */
+  static final int MAX_SHORT_STR_SIZE = 0b00111111;
+
+  // The basic types
+
+  /**
+   * Primitive value.
+   * The type info value must be one of the values in the "Primitive" section 
below.
+   */
+  static final int PRIMITIVE = 0;
+  /**
+   * Short string value.
+   * The type info value is the string size, which must be in `[0, 
MAX_SHORT_STR_SIZE]`.
+   * The string content bytes directly follow the header byte.
+   */
+  static final int SHORT_STR = 1;
+  /**
+   * Object value.
+   * The content contains a size, a list of field ids, a list of field 
offsets, and
+   * the actual field values. The list of field ids has `size` ids, while the 
list of field offsets
+   * has `size + 1` offsets, where the last offset represents the total size 
of the field values
+   * data. The list of field ids must be sorted by the field name in 
alphabetical order.
+   * Duplicate field names within one object are not allowed.
+   * 5 bits in the type info are used to specify the integer type of the 
object header. It is
+   * 0_b4_b3b2_b1b0 (MSB is 0), where:
+   *   - b4: the integer type of size. When it is 0/1, `size` is a 
little-endian 1/4-byte
+   *         unsigned integer.
+   *   - b3b2: the integer type of ids. When the 2 bits are 0/1/2, the id list 
contains
+   *           1/2/3-byte little-endian unsigned integers.
+   *   - b1b0: the integer type of offset. When the 2 bits are 0/1/2, the 
offset list contains
+   *           1/2/3-byte little-endian unsigned integers.
+   */
+  static final int OBJECT = 2;
+  /**
+   * Array value.
+   * The content contains a size, a list of field offsets, and the actual 
element values.
+   * It is similar to an object without the id list. The length of the offset 
list
+   * is `size + 1`, where the last offset represents the total size of the 
element data.
+   * Its type info is: 000_b2_b1b0:
+   *   - b2: the type of size.
+   *   - b1b0: the integer type of offset.
+   */
+  static final int ARRAY = 3;
+
+  // The primitive types
+
+  /** JSON Null value. Empty content. */
+  static final int NULL = 0;
+  /** True value. Empty content. */
+  static final int TRUE = 1;
+  /** False value. Empty content. */
+  static final int FALSE = 2;
+  /** 1-byte little-endian signed integer. */
+  static final int INT8 = 3;
+  /** 2-byte little-endian signed integer. */
+  static final int INT16 = 4;
+  /** 4-byte little-endian signed integer. */
+  static final int INT32 = 5;
+  /** 8-byte little-endian signed integer. */
+  static final int INT64 = 6;
+  /** 8-byte IEEE double. */
+  static final int DOUBLE = 7;
+  /** 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed 
integer. */
+  static final int DECIMAL4 = 8;
+  /** 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed 
integer. */
+  static final int DECIMAL8 = 9;
+  /** 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed 
integer. */
+  static final int DECIMAL16 = 10;
+  /**
+   * Date value. Content is 4-byte little-endian signed integer that 
represents the
+   * number of days from the Unix epoch.
+   */
+  static final int DATE = 11;
+  /**
+   * Timestamp value. Content is 8-byte little-endian signed integer that 
represents the number of
+   * microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is 
displayed to users in
+   * their local time zones and may be displayed differently depending on the 
execution environment.
+   */
+  static final int TIMESTAMP_TZ = 12;
+  /**
+   * Timestamp_ntz value. It has the same content as `TIMESTAMP_TZ` but should 
always be interpreted
+   * as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NTZ = 13;
+  /** 4-byte IEEE float. */
+  static final int FLOAT = 14;
+  /**
+   * Binary value. The content is (4-byte little-endian unsigned integer 
representing the binary
+   * size) + (size bytes of binary content).
+   */
+  static final int BINARY = 15;
+  /**
+   * Long string value. The content is (4-byte little-endian unsigned integer 
representing the
+   * string size) + (size bytes of string content).
+   */
+  static final int LONG_STR = 16;
+  /**
+   * Time value. Values can be from 00:00:00 to 23:59:59.999999.
+   * Content is 8-byte little-endian unsigned integer that represents the 
number of microseconds
+   * since midnight.
+   */
+  static final int TIME = 17;
+  /**
+   * Timestamp nanos value. Similar to `TIMESTAMP_TZ`, but represents the number 
of nanoseconds
+   * elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC.
+   */
+  static final int TIMESTAMP_NANOS = 18;
+  /**
+   * Timestamp nanos (without time zone) value. It has the same content as 
`TIMESTAMP_NANOS` but
+   * should always be interpreted as if the local time zone is UTC.
+   */
+  static final int TIMESTAMP_NANOS_NTZ = 19;
+  /**
+   * UUID value. The content is a 16-byte binary, encoded using big-endian.
+   * For example, UUID 00112233-4455-6677-8899-aabbccddeeff is encoded as the 
bytes
+   * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff.
+   */
+  static final int UUID = 20;
+
+  // The metadata version.
+  static final byte VERSION = 1;
+  // The lower 4 bits of the first metadata byte contain the version.
+  static final byte VERSION_MASK = 0x0F;
+
+  // Constants for various unsigned integer sizes.
+  static final int U8_MAX = 0xFF;
+  static final int U16_MAX = 0xFFFF;
+  static final int U24_MAX = 0xFFFFFF;
+  static final int U8_SIZE = 1;
+  static final int U16_SIZE = 2;
+  static final int U24_SIZE = 3;
+  static final int U32_SIZE = 4;
+
+  // Max decimal precision for each decimal type.
+  static final int MAX_DECIMAL4_PRECISION = 9;
+  static final int MAX_DECIMAL8_PRECISION = 18;
+  static final int MAX_DECIMAL16_PRECISION = 38;
+
+  // The size (in bytes) of a UUID.
+  static final int UUID_SIZE = 16;
+
+  static byte primitiveHeader(int type) {
+    return (byte) (type << 2 | PRIMITIVE);
+  }
+
+  static byte shortStrHeader(int size) {
+    return (byte) (size << 2 | SHORT_STR);
+  }
+
+  static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4))
+        | ((idSize - 1) << (BASIC_TYPE_BITS + 2))
+        | ((offsetSize - 1) << BASIC_TYPE_BITS)
+        | OBJECT);
+  }
+
+  static byte arrayHeader(boolean largeSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) | 
((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+  }
+
+  /**
+   * Check the validity of an array index `pos`.
+   * @param pos The index to check
+   * @param length The length of the array
+   * @throws IllegalArgumentException if the index is out of bounds
+   */
+  static void checkIndex(int pos, int length) {
+    if (pos < 0 || pos >= length) {
+      throw new IllegalArgumentException(
+          String.format("Invalid byte-array offset (%d). length: %d", pos, 
length));
+    }
+  }
+
+  /**
+   * Reads a little-endian signed long value from `buffer[pos, pos + 
numBytes)`.
+   * @param buffer The ByteBuffer to read from
+   * @param pos The starting index of the buffer to read from
+   * @param numBytes The number of bytes to read
+   * @return The long value
+   */
+  static long readLong(ByteBuffer buffer, int pos, int numBytes) {
+    checkIndex(pos, buffer.limit());
+    checkIndex(pos + numBytes - 1, buffer.limit());
+    long result = 0;
+    // All bytes except the most significant byte should be unsigned-extended 
and shifted
+    // (so we need `& 0xFF`). The most significant byte should be sign-extended 
and is handled
+    // after the loop.
+    for (int i = 0; i < numBytes - 1; ++i) {
+      long unsignedByteValue = buffer.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    long signedByteValue = buffer.get(pos + numBytes - 1);
+    result |= signedByteValue << (8 * (numBytes - 1));
+    return result;
+  }
+
+  /**
+   * Read a little-endian unsigned int value from `bytes[pos, pos + 
numBytes)`. The value must fit
+   * into a non-negative int (`[0, Integer.MAX_VALUE]`).
+   */
+  static int readUnsigned(ByteBuffer bytes, int pos, int numBytes) {
+    checkIndex(pos, bytes.limit());
+    checkIndex(pos + numBytes - 1, bytes.limit());
+    int result = 0;
+    // Similar to the `readLong` loop, but all bytes should be 
unsigned-extended.
+    for (int i = 0; i < numBytes; ++i) {
+      int unsignedByteValue = bytes.get(pos + i) & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    if (result < 0) {
+      throw new IllegalArgumentException(String.format("Failed to read 
unsigned int. numBytes: %d", numBytes));
+    }
+    return result;
+  }
+
+  /**
+   * Returns the value type of Variant value `value[pos...]`. It is only legal 
to call `get*` if
+   * `getType` returns the corresponding type. For example, it is only legal 
to call
+   * `getBoolean` if this method returns `Type.BOOLEAN`.
+   * @param value The Variant value to get the type from
+   * @return The type of the Variant value
+   */
+  static Variant.Type getType(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    switch (basicType) {
+      case SHORT_STR:
+        return Variant.Type.STRING;
+      case OBJECT:
+        return Variant.Type.OBJECT;
+      case ARRAY:
+        return Variant.Type.ARRAY;
+      default:
+        switch (typeInfo) {
+          case NULL:
+            return Variant.Type.NULL;
+          case TRUE:
+          case FALSE:
+            return Variant.Type.BOOLEAN;
+          case INT8:
+            return Variant.Type.BYTE;
+          case INT16:
+            return Variant.Type.SHORT;
+          case INT32:
+            return Variant.Type.INT;
+          case INT64:
+            return Variant.Type.LONG;
+          case DOUBLE:
+            return Variant.Type.DOUBLE;
+          case DECIMAL4:
+            return Variant.Type.DECIMAL4;
+          case DECIMAL8:
+            return Variant.Type.DECIMAL8;
+          case DECIMAL16:
+            return Variant.Type.DECIMAL16;
+          case DATE:
+            return Variant.Type.DATE;
+          case TIMESTAMP_TZ:
+            return Variant.Type.TIMESTAMP_TZ;
+          case TIMESTAMP_NTZ:
+            return Variant.Type.TIMESTAMP_NTZ;
+          case FLOAT:
+            return Variant.Type.FLOAT;
+          case BINARY:
+            return Variant.Type.BINARY;
+          case LONG_STR:
+            return Variant.Type.STRING;
+          case TIME:
+            return Variant.Type.TIME;
+          case TIMESTAMP_NANOS:
+            return Variant.Type.TIMESTAMP_NANOS;
+          case TIMESTAMP_NANOS_NTZ:
+            return Variant.Type.TIMESTAMP_NANOS_NTZ;
+          case UUID:
+            return Variant.Type.UUID;
+          default:
+            throw new UnsupportedOperationException(
+                String.format("Unknown type in Variant. primitive type: %d", 
typeInfo));
+        }
+    }
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type type) {
+    return new IllegalArgumentException("Expected type to be " + type);
+  }
+
+  private static IllegalArgumentException unexpectedType(Variant.Type[] types) 
{
+    return new IllegalArgumentException("Expected type to be one of: " + 
Arrays.toString(types));
+  }
+
+  static boolean getBoolean(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) {
+      throw unexpectedType(Variant.Type.BOOLEAN);
+    }
+    return typeInfo == TRUE;
+  }
+
+  /**
+   * Returns a long value from Variant value `value[pos...]`.
+   * It is only legal to call it if `getType` returns one of Type.BYTE, SHORT, 
INT, LONG,
+   * DATE, TIMESTAMP_TZ, TIMESTAMP_NTZ, TIME, TIMESTAMP_NANOS, 
TIMESTAMP_NANOS_NTZ.
+   * If the type is `DATE`, the return value is guaranteed to fit into an int 
and
+   * represents the number of days from the Unix epoch.
+   * If the type is `TIMESTAMP_TZ/TIMESTAMP_NTZ`, the return value represents the 
number of
+   * microseconds from the Unix epoch.
+   * If the type is `TIME`, the return value represents the number of 
microseconds since midnight.
+   * If the type is `TIMESTAMP_NANOS/TIMESTAMP_NANOS_NTZ`, the return value 
represents the number of
+   * nanoseconds from the Unix epoch.
+   * @param value The Variant value
+   * @return The long value
+   */
+  static long getLong(ByteBuffer value) {
+    checkIndex(value.position(), value.limit());
+    int basicType = value.get(value.position()) & BASIC_TYPE_MASK;
+    int typeInfo = (value.get(value.position()) >> BASIC_TYPE_BITS) & 
PRIMITIVE_TYPE_MASK;
+    if (basicType != PRIMITIVE) {
+      throw new IllegalStateException(

Review Comment:
   Forgot I added that one. Updated to use that list one.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+/**
+ * This Variant class holds the Variant-encoded value and metadata binary 
values.
+ */
+public final class Variant {
+  /** The buffer that contains the Variant value. */
+  final ByteBuffer value;
+
+  /** The buffer that contains the Variant metadata. */
+  final ByteBuffer metadata;
+
+  /**
+   * The threshold to switch from linear search to binary search when looking 
up a field by key in
+   * an object. This is a performance optimization to avoid the overhead of 
binary search for a
+   * short list.
+   */
+  static final int BINARY_SEARCH_THRESHOLD = 32;
+
+  public Variant(byte[] value, byte[] metadata) {
+    this(value, 0, value.length, metadata, 0, metadata.length);
+  }
+
+  public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, 
int metadataPos, int metadataLength) {
+    this(ByteBuffer.wrap(value, valuePos, valueLength), 
ByteBuffer.wrap(metadata, metadataPos, metadataLength));
+  }
+
+  public Variant(ByteBuffer value, ByteBuffer metadata) {
+    // The buffers are read a single byte at a time, so the endianness of the 
input buffers
+    // is not important.
+    this.value = value.asReadOnlyBuffer();
+    this.metadata = metadata.asReadOnlyBuffer();
+
+    // There is currently only one allowed version.
+    if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != 
VariantUtil.VERSION) {
+      throw new UnsupportedOperationException(String.format(
+          "Unsupported variant metadata version: %d",
+          metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
+    }
+  }
+
+  /**
+   * @return the boolean value
+   */
+  public boolean getBoolean() {
+    return VariantUtil.getBoolean(value);
+  }
+
+  /**
+   * @return the byte value
+   */
+  public byte getByte() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Byte.MIN_VALUE || longValue > Byte.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for byte: " + 
longValue);
+    }
+    return (byte) longValue;
+  }
+
+  /**
+   * @return the short value
+   */
+  public short getShort() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Short.MIN_VALUE || longValue > Short.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for short: " + 
longValue);
+    }
+    return (short) longValue;
+  }
+
+  /**
+   * @return the int value
+   */
+  public int getInt() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Integer.MIN_VALUE || longValue > Integer.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for int: " + 
longValue);
+    }
+    return (int) longValue;
+  }
+
+  /**
+   * @return the long value
+   */
+  public long getLong() {
+    return VariantUtil.getLong(value);
+  }
+
+  /**
+   * @return the double value
+   */
+  public double getDouble() {
+    return VariantUtil.getDouble(value);
+  }
+
+  /**
+   * @return the decimal value
+   */
+  public BigDecimal getDecimal() {
+    return VariantUtil.getDecimal(value);
+  }
+
+  /**
+   * @return the float value
+   */
+  public float getFloat() {
+    return VariantUtil.getFloat(value);
+  }
+
+  /**
+   * @return the binary value
+   */
+  public ByteBuffer getBinary() {
+    return VariantUtil.getBinary(value);
+  }
+
+  /**
+   * @return the UUID value
+   */
+  public UUID getUUID() {
+    return VariantUtil.getUUID(value);
+  }
+
+  /**
+   * @return the string value
+   */
+  public String getString() {
+    return VariantUtil.getString(value);
+  }
+
+  /**
+   * The value type of Variant value. It is determined by the header byte.
+   */
+  public enum Type {
+    OBJECT,
+    ARRAY,
+    NULL,
+    BOOLEAN,
+    BYTE,
+    SHORT,
+    INT,
+    LONG,
+    STRING,
+    DOUBLE,
+    DECIMAL4,
+    DECIMAL8,
+    DECIMAL16,
+    DATE,
+    TIMESTAMP_TZ,
+    TIMESTAMP_NTZ,
+    FLOAT,
+    BINARY,
+    TIME,
+    TIMESTAMP_NANOS,

Review Comment:
   ahhh, good catch. Renamed.



##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.variant;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+/**
+ * This Variant class holds the Variant-encoded value and metadata binary 
values.
+ */
+public final class Variant {
+  /** The buffer that contains the Variant value. */
+  final ByteBuffer value;
+
+  /** The buffer that contains the Variant metadata. */
+  final ByteBuffer metadata;
+
+  /**
+   * The threshold to switch from linear search to binary search when looking 
up a field by key in
+   * an object. This is a performance optimization to avoid the overhead of 
binary search for a
+   * short list.
+   */
+  static final int BINARY_SEARCH_THRESHOLD = 32;
+
+  public Variant(byte[] value, byte[] metadata) {
+    this(value, 0, value.length, metadata, 0, metadata.length);
+  }
+
+  public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, 
int metadataPos, int metadataLength) {
+    this(ByteBuffer.wrap(value, valuePos, valueLength), 
ByteBuffer.wrap(metadata, metadataPos, metadataLength));
+  }
+
+  public Variant(ByteBuffer value, ByteBuffer metadata) {
+    // The buffers are read a single byte at a time, so the endianness of the 
input buffers
+    // is not important.
+    this.value = value.asReadOnlyBuffer();
+    this.metadata = metadata.asReadOnlyBuffer();
+
+    // There is currently only one allowed version.
+    if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != 
VariantUtil.VERSION) {
+      throw new UnsupportedOperationException(String.format(
+          "Unsupported variant metadata version: %d",
+          metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
+    }
+  }
+
+  /**
+   * @return the boolean value
+   */
+  public boolean getBoolean() {
+    return VariantUtil.getBoolean(value);
+  }
+
+  /**
+   * @return the byte value
+   */
+  public byte getByte() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Byte.MIN_VALUE || longValue > Byte.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for byte: " + 
longValue);
+    }
+    return (byte) longValue;
+  }
+
+  /**
+   * @return the short value
+   */
+  public short getShort() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Short.MIN_VALUE || longValue > Short.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for short: " + 
longValue);
+    }
+    return (short) longValue;
+  }
+
+  /**
+   * @return the int value
+   */
+  public int getInt() {
+    long longValue = VariantUtil.getLong(value);
+    if (longValue < Integer.MIN_VALUE || longValue > Integer.MAX_VALUE) {
+      throw new IllegalStateException("Value out of range for int: " + 
longValue);
+    }
+    return (int) longValue;
+  }
+
+  /**
+   * @return the long value
+   */
+  public long getLong() {
+    return VariantUtil.getLong(value);
+  }
+
+  /**
+   * @return the double value
+   */
+  public double getDouble() {
+    return VariantUtil.getDouble(value);
+  }
+
+  /**
+   * @return the decimal value
+   */
+  public BigDecimal getDecimal() {
+    return VariantUtil.getDecimal(value);
+  }
+
+  /**
+   * @return the float value
+   */
+  public float getFloat() {
+    return VariantUtil.getFloat(value);
+  }
+
+  /**
+   * @return the binary value
+   */
+  public ByteBuffer getBinary() {
+    return VariantUtil.getBinary(value);
+  }
+
+  /**
+   * @return the UUID value
+   */
+  public UUID getUUID() {
+    return VariantUtil.getUUID(value);
+  }
+
+  /**
+   * @return the string value
+   */
+  public String getString() {
+    return VariantUtil.getString(value);
+  }
+
+  /**
+   * The value type of Variant value. It is determined by the header byte.
+   */
+  public enum Type {
+    OBJECT,
+    ARRAY,
+    NULL,
+    BOOLEAN,
+    BYTE,
+    SHORT,
+    INT,
+    LONG,
+    STRING,
+    DOUBLE,
+    DECIMAL4,
+    DECIMAL8,
+    DECIMAL16,
+    DATE,
+    TIMESTAMP_TZ,
+    TIMESTAMP_NTZ,
+    FLOAT,
+    BINARY,
+    TIME,
+    TIMESTAMP_NANOS,
+    TIMESTAMP_NANOS_NTZ,
+    UUID
+  }
+
+  /**
+   * @return the type of the variant value
+   */
+  public Type getType() {
+    return VariantUtil.getType(value);
+  }
+
+  /**
+   * @return the number of object fields in the variant. `getType()` must be 
`Type.OBJECT`.
+   */
+  public int numObjectElements() {
+    return VariantUtil.getObjectInfo(value).numElements;
+  }
+
+  /**
+   * Returns the object field Variant value whose key is equal to `key`.
+   * Return null if the key is not found. `getType()` must be `Type.OBJECT`.

Review Comment:
   This will throw an `IllegalArgumentException`. Updated the comments.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to