uuid test file (#92)

maplefu Wed, 13 Aug 2025 23:19:21 -0700

This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-testing.git



The following commit(s) were added to refs/heads/master by this push:
     new 5cbfc43  Add Primitive time/timestamp_nanos/timestampntz_nanos/uuid 
test file (#92)
5cbfc43 is described below

commit 5cbfc43d488c9c8404a1a7088cca400ae095b831
Author: Congxian Qiu <[email protected]>
AuthorDate: Thu Aug 14 14:19:11 2025 +0800

    Add Primitive time/timestamp_nanos/timestampntz_nanos/uuid test file (#92)
    
    * add primitive_time
    
    * update data_dictionary and regen script
    
    * add timestamp_nanos(tz&ntz) and uuid binary artifacts
    
    * fix typo
    
    * update the primitive data order in data_dictionary.json
    
    * update readme
---
 variant/README.md                             |   4 ++++
 variant/data_dictionary.json                  |   8 ++++++--
 variant/primitive_time.metadata               | Bin 0 -> 3 bytes
 variant/primitive_time.value                  | Bin 0 -> 9 bytes
 variant/primitive_timestamp_nanos.metadata    | Bin 0 -> 3 bytes
 variant/primitive_timestamp_nanos.value       |   1 +
 variant/primitive_timestampntz_nanos.metadata | Bin 0 -> 3 bytes
 variant/primitive_timestampntz_nanos.value    |   1 +
 variant/primitive_uuid.metadata               | Bin 0 -> 3 bytes
 variant/primitive_uuid.value                  |   1 +
 variant/regen.py                              |   9 ++-------
 11 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/variant/README.md b/variant/README.md
index 418dfa7..c09c49a 100644
--- a/variant/README.md
+++ b/variant/README.md
@@ -69,5 +69,9 @@ resulting in a single `0` byte:
 echo -n 'a' | tr a '\0' > primitive_null.value
 ```
 
+### Modification 2: Created `TimeNTZ/Timestamp with timezone nanos/Timestamp 
without timezone nanos/UUID` with Iceberg test code
+
+Currently, Spark [does not 
support](https://github.com/apache/spark/blob/master/common/variant/README.md) 
Variant values containing UUID, Time, or nanosecond-precision Timestamp. The 
`primitive_time.[metadata/value]`, 
`primitive_timestamp_nanos.[metadata/value]`, 
`primitive_timestampntz_nanos.[metadata/value]` and 
`primitive_uuid.[metadata/value]` files were generated by [Iceberg test 
code](https://github.com/apache/iceberg/blob/3a4215dbb714477c89681ab94f1197b6ebcbdfff/parquet/src/test/java/org/
 [...]
+
 [Variant]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
 [primitive types listed in the spec]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-primitive-type-basic_type0
diff --git a/variant/data_dictionary.json b/variant/data_dictionary.json
index bdd16ef..8a1faae 100644
--- a/variant/data_dictionary.json
+++ b/variant/data_dictionary.json
@@ -67,7 +67,11 @@
     "primitive_int8": 42,
     "primitive_null": null,
     "primitive_string": "This string is longer than 64 bytes and therefore 
does not fit in a short_string and it also includes several non ascii 
characters such as \ud83d\udc22, \ud83d\udc96, \u2665\ufe0f, \ud83c\udfa3 and 
\ud83e\udd26!!",
+    "primitive_time": "12:33:54.123456",
     "primitive_timestamp": "2025-04-16 12:34:56.78-04:00",
     "primitive_timestampntz": "2025-04-16 12:34:56.78",
-    "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)"
-}
\ No newline at end of file
+    "primitive_timestamp_nanos": "2024-11-07T12:33:54.123456789+00:00",
+    "primitive_timestampntz_nanos": "2024-11-07T12:33:54.123456789",
+    "primitive_uuid": "f24f9b64-81fa-49d1-b74e-8c09a6e31c56",
+    "short_string": "Less than 64 bytes (\u2764\ufe0f with utf8)"
+}
diff --git a/variant/primitive_time.metadata b/variant/primitive_time.metadata
new file mode 100644
index 0000000..12db478
Binary files /dev/null and b/variant/primitive_time.metadata differ
diff --git a/variant/primitive_time.value b/variant/primitive_time.value
new file mode 100644
index 0000000..9fa4fb3
Binary files /dev/null and b/variant/primitive_time.value differ
diff --git a/variant/primitive_timestamp_nanos.metadata 
b/variant/primitive_timestamp_nanos.metadata
new file mode 100644
index 0000000..12db478
Binary files /dev/null and b/variant/primitive_timestamp_nanos.metadata differ
diff --git a/variant/primitive_timestamp_nanos.value 
b/variant/primitive_timestamp_nanos.value
new file mode 100644
index 0000000..2e7e246
--- /dev/null
+++ b/variant/primitive_timestamp_nanos.value
@@ -0,0 +1 @@
+HA:l��
\ No newline at end of file
diff --git a/variant/primitive_timestampntz_nanos.metadata 
b/variant/primitive_timestampntz_nanos.metadata
new file mode 100644
index 0000000..12db478
Binary files /dev/null and b/variant/primitive_timestampntz_nanos.metadata 
differ
diff --git a/variant/primitive_timestampntz_nanos.value 
b/variant/primitive_timestampntz_nanos.value
new file mode 100644
index 0000000..1910207
--- /dev/null
+++ b/variant/primitive_timestampntz_nanos.value
@@ -0,0 +1 @@
+LA:l��
\ No newline at end of file
diff --git a/variant/primitive_uuid.metadata b/variant/primitive_uuid.metadata
new file mode 100644
index 0000000..12db478
Binary files /dev/null and b/variant/primitive_uuid.metadata differ
diff --git a/variant/primitive_uuid.value b/variant/primitive_uuid.value
new file mode 100644
index 0000000..314f3a6
--- /dev/null
+++ b/variant/primitive_uuid.value
@@ -0,0 +1 @@
+P�O�d��IѷN�    ��V
\ No newline at end of file
diff --git a/variant/regen.py b/variant/regen.py
index b776afd..d2e14f8 100644
--- a/variant/regen.py
+++ b/variant/regen.py
@@ -75,12 +75,7 @@ INSERT INTO T VALUES ('primitive_float', 
1234567890.1234::Float::Variant);
 INSERT INTO T VALUES ('primitive_binary', X'31337deadbeefcafe'::Variant);
 INSERT INTO T VALUES ('primitive_string', 'This string is longer than 64 bytes 
and therefore does not fit in a short_string and it also includes several non 
ascii characters such as 🐢, 💖, ♥️, 🎣 and 🤦!!'::Variant);
 
--- https://github.com/apache/parquet-testing/issues/79
--- is not clear how to create the following types using Spark SQL
--- TODO TimeNTZ                    (Type ID 17)
--- TODO 'timestamp with timezone (NANOS)'  (Type ID 18)
--- TODO 'timestamp with time zone (NANOS)' (Type ID 19)
--- TODO 'UUID'                     (Type ID 20)
+-- Binary artifacts of 'TimeNTZ'/'timestamp with timezone (NANOS)'/'timestamp 
without time zone (NANOS)'/'UUID' were generated by the Iceberg test code; 
please refer to https://github.com/apache/parquet-testing/pull/92 for more details
 
 -------------------------------
 -- Short string (basic_type=1)
@@ -170,4 +165,4 @@ with open(f"data_dictionary.json", "w") as f:
 # Note: It is possible to write the output to a single parquet file, using a 
command
 # such as:
 # spark.sql("SELECT * FROM 
output").repartition(1).write.parquet('variant.parquet')
-# At the time of writing, this file does not have the logical type annotation 
for VARIANT
\ No newline at end of file
+# At the time of writing, this file does not have the logical type annotation 
for VARIANT

(parquet-testing) branch master updated: Add Primitive time/timestamp_nanos/timestampntz_nanos/uuid test file (#92)

Reply via email to